Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.3 commit in: /
Date: Fri, 06 Nov 2015 00:24:32
Message-Id: 1446769452.2f6cc0b28617fe4c94729933e6aef823e8cd9773.mpagano@gentoo
1 commit: 2f6cc0b28617fe4c94729933e6aef823e8cd9773
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Fri Nov 6 00:24:12 2015 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Fri Nov 6 00:24:12 2015 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=2f6cc0b2
7
8 BFQ Patches v7r8.
9
10 0000_README | 13 +
11 ...roups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch | 104 +
12 ...introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1 | 6952 ++++++++++++++++++++
13 ...Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch | 1220 ++++
14 4 files changed, 8289 insertions(+)
15
16 diff --git a/0000_README b/0000_README
17 index 8e70e78..4c2a487 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -71,6 +71,19 @@ Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
21 From: https://github.com/graysky2/kernel_gcc_patch/
22 Desc: Kernel patch enables gcc < v4.9 optimizations for additional CPUs.
23
24 +Patch: 5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch
25 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
26 +Desc: BFQ v7r8 patch 1 for 4.3: Build, cgroups and kconfig bits
27 +
28 +Patch: 5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1
29 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
30 +Desc: BFQ v7r8 patch 2 for 4.3: BFQ Scheduler
31 +
32 +Patch: 5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch
33 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
34 +Desc: BFQ v7r8 patch 3 for 4.3: Early Queue Merge (EQM)
35 +
36 Patch: 5010_enable-additional-cpu-optimizations-for-gcc-4.9.patch
37 From: https://github.com/graysky2/kernel_gcc_patch/
38 Desc: Kernel patch enables gcc >= v4.9 optimizations for additional CPUs.
39 +
40
41 diff --git a/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch
42 new file mode 100644
43 index 0000000..76440b8
44 --- /dev/null
45 +++ b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch
46 @@ -0,0 +1,104 @@
47 +From 6a88d12f19b7c5578cf5d17a5e61fb0af75fa0d7 Mon Sep 17 00:00:00 2001
48 +From: Paolo Valente <paolo.valente@×××××××.it>
49 +Date: Tue, 7 Apr 2015 13:39:12 +0200
50 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r8-4.3
51 +
52 +Update Kconfig.iosched and do the related Makefile changes to include
53 +kernel configuration options for BFQ. Also add the bfqio controller
54 +to the cgroups subsystem.
55 +
56 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
57 +Signed-off-by: Arianna Avanzini <avanzini@××××××.com>
58 +---
59 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
60 + block/Makefile | 1 +
61 + include/linux/cgroup_subsys.h | 4 ++++
62 + 3 files changed, 37 insertions(+)
63 +
64 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
65 +index 421bef9..0ee5f0f 100644
66 +--- a/block/Kconfig.iosched
67 ++++ b/block/Kconfig.iosched
68 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
69 + ---help---
70 + Enable group IO scheduling in CFQ.
71 +
72 ++config IOSCHED_BFQ
73 ++ tristate "BFQ I/O scheduler"
74 ++ default n
75 ++ ---help---
76 ++ The BFQ I/O scheduler tries to distribute bandwidth among
77 ++ all processes according to their weights.
78 ++ It aims at distributing the bandwidth as desired, independently of
79 ++ the disk parameters and with any workload. It also tries to
80 ++ guarantee low latency to interactive and soft real-time
81 ++ applications. If compiled built-in (saying Y here), BFQ can
82 ++ be configured to support hierarchical scheduling.
83 ++
84 ++config CGROUP_BFQIO
85 ++ bool "BFQ hierarchical scheduling support"
86 ++ depends on CGROUPS && IOSCHED_BFQ=y
87 ++ default n
88 ++ ---help---
89 ++ Enable hierarchical scheduling in BFQ, using the cgroups
90 ++ filesystem interface. The name of the subsystem will be
91 ++ bfqio.
92 ++
93 + choice
94 + prompt "Default I/O scheduler"
95 + default DEFAULT_CFQ
96 +@@ -52,6 +73,16 @@ choice
97 + config DEFAULT_CFQ
98 + bool "CFQ" if IOSCHED_CFQ=y
99 +
100 ++ config DEFAULT_BFQ
101 ++ bool "BFQ" if IOSCHED_BFQ=y
102 ++ help
103 ++ Selects BFQ as the default I/O scheduler which will be
104 ++ used by default for all block devices.
105 ++ The BFQ I/O scheduler aims at distributing the bandwidth
106 ++ as desired, independently of the disk parameters and with
107 ++ any workload. It also tries to guarantee low latency to
108 ++ interactive and soft real-time applications.
109 ++
110 + config DEFAULT_NOOP
111 + bool "No-op"
112 +
113 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
114 + string
115 + default "deadline" if DEFAULT_DEADLINE
116 + default "cfq" if DEFAULT_CFQ
117 ++ default "bfq" if DEFAULT_BFQ
118 + default "noop" if DEFAULT_NOOP
119 +
120 + endmenu
121 +diff --git a/block/Makefile b/block/Makefile
122 +index 00ecc97..1ed86d5 100644
123 +--- a/block/Makefile
124 ++++ b/block/Makefile
125 +@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
126 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
127 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
128 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
129 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
130 +
131 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
132 + obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
133 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
134 +index 1a96fda..81ad8a0 100644
135 +--- a/include/linux/cgroup_subsys.h
136 ++++ b/include/linux/cgroup_subsys.h
137 +@@ -46,6 +46,10 @@ SUBSYS(freezer)
138 + SUBSYS(net_cls)
139 + #endif
140 +
141 ++#if IS_ENABLED(CONFIG_CGROUP_BFQIO)
142 ++SUBSYS(bfqio)
143 ++#endif
144 ++
145 + #if IS_ENABLED(CONFIG_CGROUP_PERF)
146 + SUBSYS(perf_event)
147 + #endif
148 +--
149 +1.9.1
150 +
151
152 diff --git a/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1 b/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1
153 new file mode 100644
154 index 0000000..43196b2
155 --- /dev/null
156 +++ b/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1
157 @@ -0,0 +1,6952 @@
158 +From ec474da4d0e3f9eb7860496802b6693333687bb5 Mon Sep 17 00:00:00 2001
159 +From: Paolo Valente <paolo.valente@×××××××.it>
160 +Date: Thu, 9 May 2013 19:10:02 +0200
161 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r8 I/O sched for 4.3
162 +
163 +Add the BFQ-v7r8 I/O scheduler to 4.3.
164 +The general structure is borrowed from CFQ, as is much of the code for
165 +handling I/O contexts. Over time, several useful features have been
166 +ported from CFQ as well (details in the changelog in README.BFQ). A
167 +(bfq_)queue is associated with each task doing I/O on a device, and each
168 +time a scheduling decision has to be made, a queue is selected and served
169 +until it expires.
170 +
171 + - Slices are given in the service domain: tasks are assigned
172 + budgets, measured in number of sectors. Once granted the disk, a task
173 + must, however, consume its assigned budget within a configurable
174 + maximum time (by default, the maximum possible value of the
175 + budgets is automatically computed to comply with this timeout).
176 + This allows the desired latency vs "throughput boosting" tradeoff
177 + to be set.
178 +
179 + - Budgets are scheduled according to a variant of WF2Q+, implemented
180 + using an augmented rb-tree to take eligibility into account while
181 + preserving an O(log N) overall complexity.
182 +
183 + - A low-latency tunable is provided; if enabled, both interactive
184 + and soft real-time applications are guaranteed a very low latency.
185 +
186 + - Latency guarantees are also preserved in the presence of NCQ.
187 +
188 + - With flash-based devices, too, a high throughput is achieved
189 + while still preserving latency guarantees.
190 +
191 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
192 + cooperating-queue-merging and the preemption mechanisms present
193 + in CFQ. EQM is in fact a unified mechanism that tries to get a
194 + sequential read pattern, and hence a high throughput, with any
195 + set of processes performing interleaved I/O over a contiguous
196 + sequence of sectors.
197 +
198 + - BFQ supports full hierarchical scheduling, exporting a cgroups
199 + interface. Since each node has a full scheduler, each group can
200 + be assigned its own weight.
201 +
202 + - If the cgroups interface is not used, only I/O priorities can be
203 + assigned to processes, with ioprio values mapped to weights
204 + via the relation weight = IOPRIO_BE_NR - ioprio (illustrated below).
205 +
206 + - ioprio classes are served in strict priority order, i.e.,
207 + lower-priority queues are not served as long as there are
208 + higher-priority queues. Among queues in the same class, bandwidth
209 + is distributed in proportion to the weight of each queue. A small
210 + amount of extra bandwidth is, however, guaranteed to the Idle
211 + class, to prevent it from starving.
212 +
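
A minimal userspace sketch of the ioprio-to-weight mapping described in the list above (not part of the patch; IOPRIO_BE_NR == 8 is assumed here, the usual number of best-effort priority levels):

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* assumed value; matches mainline ioprio.h */

/* The relation stated above: weight = IOPRIO_BE_NR - ioprio. */
static int ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

int main(void)
{
	int ioprio;

	/* ioprio 0 (highest priority) maps to weight 8, ioprio 7 to weight 1 */
	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d\n", ioprio, ioprio_to_weight(ioprio));
	return 0;
}
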
213 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
214 +Signed-off-by: Arianna Avanzini <avanzini@××××××.com>
215 +---
216 + block/bfq-cgroup.c | 936 +++++++++++++
217 + block/bfq-ioc.c | 36 +
218 + block/bfq-iosched.c | 3898 +++++++++++++++++++++++++++++++++++++++++++++++++++
219 + block/bfq-sched.c | 1208 ++++++++++++++++
220 + block/bfq.h | 771 ++++++++++
221 + 5 files changed, 6849 insertions(+)
222 + create mode 100644 block/bfq-cgroup.c
223 + create mode 100644 block/bfq-ioc.c
224 + create mode 100644 block/bfq-iosched.c
225 + create mode 100644 block/bfq-sched.c
226 + create mode 100644 block/bfq.h
227 +
228 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
229 +new file mode 100644
230 +index 0000000..11e2f1d
231 +--- /dev/null
232 ++++ b/block/bfq-cgroup.c
233 +@@ -0,0 +1,936 @@
234 ++/*
235 ++ * BFQ: CGROUPS support.
236 ++ *
237 ++ * Based on ideas and code from CFQ:
238 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
239 ++ *
240 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
241 ++ * Paolo Valente <paolo.valente@×××××××.it>
242 ++ *
243 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
244 ++ *
245 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
246 ++ * file.
247 ++ */
248 ++
249 ++#ifdef CONFIG_CGROUP_BFQIO
250 ++
251 ++static DEFINE_MUTEX(bfqio_mutex);
252 ++
253 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
254 ++{
255 ++ return bgrp ? !bgrp->online : false;
256 ++}
257 ++
258 ++static struct bfqio_cgroup bfqio_root_cgroup = {
259 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
260 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
261 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
262 ++};
263 ++
264 ++static inline void bfq_init_entity(struct bfq_entity *entity,
265 ++ struct bfq_group *bfqg)
266 ++{
267 ++ entity->weight = entity->new_weight;
268 ++ entity->orig_weight = entity->new_weight;
269 ++ entity->ioprio = entity->new_ioprio;
270 ++ entity->ioprio_class = entity->new_ioprio_class;
271 ++ entity->parent = bfqg->my_entity;
272 ++ entity->sched_data = &bfqg->sched_data;
273 ++}
274 ++
275 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
276 ++{
277 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
278 ++}
279 ++
280 ++/*
281 ++ * Search the hash table (for now just a list) of bgrp for the bfq_group
282 ++ * associated with bfqd. Must be called under rcu_read_lock().
283 ++ */
284 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
285 ++ struct bfq_data *bfqd)
286 ++{
287 ++ struct bfq_group *bfqg;
288 ++ void *key;
289 ++
290 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
291 ++ key = rcu_dereference(bfqg->bfqd);
292 ++ if (key == bfqd)
293 ++ return bfqg;
294 ++ }
295 ++
296 ++ return NULL;
297 ++}
298 ++
299 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
300 ++ struct bfq_group *bfqg)
301 ++{
302 ++ struct bfq_entity *entity = &bfqg->entity;
303 ++
304 ++ /*
305 ++ * If the weight of the entity has never been set via the sysfs
306 ++ * interface, then bgrp->weight == 0. In this case we initialize
307 ++ * the weight from the current ioprio value. Otherwise, the group
308 ++ * weight, if set, has priority over the ioprio value.
309 ++ */
310 ++ if (bgrp->weight == 0) {
311 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
312 ++ entity->new_ioprio = bgrp->ioprio;
313 ++ } else {
314 ++ if (bgrp->weight < BFQ_MIN_WEIGHT ||
315 ++ bgrp->weight > BFQ_MAX_WEIGHT) {
316 ++ printk(KERN_CRIT "bfq_group_init_entity: "
317 ++ "bgrp->weight %d\n", bgrp->weight);
318 ++ BUG();
319 ++ }
320 ++ entity->new_weight = bgrp->weight;
321 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
322 ++ }
323 ++ entity->orig_weight = entity->weight = entity->new_weight;
324 ++ entity->ioprio = entity->new_ioprio;
325 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
326 ++ entity->my_sched_data = &bfqg->sched_data;
327 ++ bfqg->active_entities = 0;
328 ++}
329 ++
330 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
331 ++ struct bfq_group *parent)
332 ++{
333 ++ struct bfq_entity *entity;
334 ++
335 ++ BUG_ON(parent == NULL);
336 ++ BUG_ON(bfqg == NULL);
337 ++
338 ++ entity = &bfqg->entity;
339 ++ entity->parent = parent->my_entity;
340 ++ entity->sched_data = &parent->sched_data;
341 ++}
342 ++
343 ++/**
344 ++ * bfq_group_chain_alloc - allocate a chain of groups.
345 ++ * @bfqd: queue descriptor.
346 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
347 ++ *
348 ++ * Allocate a chain of groups starting from the one belonging to
349 ++ * @css up to the root cgroup. Stop if a cgroup on the chain
350 ++ * to the root already has an allocated group on @bfqd.
351 ++ */
352 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
353 ++ struct cgroup_subsys_state *css)
354 ++{
355 ++ struct bfqio_cgroup *bgrp;
356 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
357 ++
358 ++ for (; css != NULL; css = css->parent) {
359 ++ bgrp = css_to_bfqio(css);
360 ++
361 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
362 ++ if (bfqg != NULL) {
363 ++ /*
364 ++ * All the cgroups in the path from there to the
365 ++ * root must have a bfq_group for bfqd, so we don't
366 ++ * need any more allocations.
367 ++ */
368 ++ break;
369 ++ }
370 ++
371 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
372 ++ if (bfqg == NULL)
373 ++ goto cleanup;
374 ++
375 ++ bfq_group_init_entity(bgrp, bfqg);
376 ++ bfqg->my_entity = &bfqg->entity;
377 ++
378 ++ if (leaf == NULL) {
379 ++ leaf = bfqg;
380 ++ prev = leaf;
381 ++ } else {
382 ++ bfq_group_set_parent(prev, bfqg);
383 ++ /*
384 ++ * Build a list of allocated nodes using the bfqd
385 ++ * field, which is still unused and will be
386 ++ * initialized only after the node is
387 ++ * connected.
388 ++ */
389 ++ prev->bfqd = bfqg;
390 ++ prev = bfqg;
391 ++ }
392 ++ }
393 ++
394 ++ return leaf;
395 ++
396 ++cleanup:
397 ++ while (leaf != NULL) {
398 ++ prev = leaf;
399 ++ leaf = leaf->bfqd;
400 ++ kfree(prev);
401 ++ }
402 ++
403 ++ return NULL;
404 ++}
405 ++
406 ++/**
407 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup
408 ++ * hierarchy.
409 ++ * @bfqd: the queue descriptor.
410 ++ * @css: the leaf cgroup_subsys_state to start from.
411 ++ * @leaf: the leaf group (to be associated with @css).
412 ++ *
413 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
414 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
415 ++ * hierarchy that already has a group associated with @bfqd, all the nodes
416 ++ * in the path to the root cgroup have one too.
417 ++ *
418 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
419 ++ * per device) while the bfqio_cgroup lock protects the list of groups
420 ++ * belonging to the same cgroup.
421 ++ */
422 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
423 ++ struct cgroup_subsys_state *css,
424 ++ struct bfq_group *leaf)
425 ++{
426 ++ struct bfqio_cgroup *bgrp;
427 ++ struct bfq_group *bfqg, *next, *prev = NULL;
428 ++ unsigned long flags;
429 ++
430 ++ assert_spin_locked(bfqd->queue->queue_lock);
431 ++
432 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
433 ++ bgrp = css_to_bfqio(css);
434 ++ next = leaf->bfqd;
435 ++
436 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
437 ++ BUG_ON(bfqg != NULL);
438 ++
439 ++ spin_lock_irqsave(&bgrp->lock, flags);
440 ++
441 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
442 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
443 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
444 ++
445 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
446 ++
447 ++ prev = leaf;
448 ++ leaf = next;
449 ++ }
450 ++
451 ++ BUG_ON(css == NULL && leaf != NULL);
452 ++ if (css != NULL && prev != NULL) {
453 ++ bgrp = css_to_bfqio(css);
454 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
455 ++ bfq_group_set_parent(prev, bfqg);
456 ++ }
457 ++}
458 ++
459 ++/**
460 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
461 ++ * @bfqd: queue descriptor.
462 ++ * @css: cgroup_subsys_state of the cgroup being searched for.
463 ++ *
464 ++ * Return a group associated with @bfqd in @css's cgroup, allocating one
465 ++ * if necessary. When a group is returned, all the cgroups in the path
466 ++ * to the root have a group associated with @bfqd.
467 ++ *
468 ++ * If the allocation fails, return the root group: this breaks guarantees
469 ++ * but is a safe fallback. If this loss becomes a problem it can be
470 ++ * mitigated using the equivalent weight (given by the product of the
471 ++ * weights of the groups in the path from @group to the root) in the
472 ++ * root scheduler.
473 ++ *
474 ++ * We allocate all the missing nodes in the path from the leaf cgroup
475 ++ * to the root and we connect the nodes only after all the allocations
476 ++ * have been successful.
477 ++ */
478 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
479 ++ struct cgroup_subsys_state *css)
480 ++{
481 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
482 ++ struct bfq_group *bfqg;
483 ++
484 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
485 ++ if (bfqg != NULL)
486 ++ return bfqg;
487 ++
488 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
489 ++ if (bfqg != NULL)
490 ++ bfq_group_chain_link(bfqd, css, bfqg);
491 ++ else
492 ++ bfqg = bfqd->root_group;
493 ++
494 ++ return bfqg;
495 ++}
496 ++
497 ++/**
498 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
499 ++ * @bfqd: queue descriptor.
500 ++ * @bfqq: the queue to move.
501 ++ * @entity: @bfqq's entity.
502 ++ * @bfqg: the group to move to.
503 ++ *
504 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
505 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
506 ++ *
507 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
508 ++ * not disappear (for now this just means that we are called under
509 ++ * rcu_read_lock()).
510 ++ */
511 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
512 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
513 ++{
514 ++ int busy, resume;
515 ++
516 ++ busy = bfq_bfqq_busy(bfqq);
517 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
518 ++
519 ++ BUG_ON(resume && !entity->on_st);
520 ++ BUG_ON(busy && !resume && entity->on_st &&
521 ++ bfqq != bfqd->in_service_queue);
522 ++
523 ++ if (busy) {
524 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
525 ++
526 ++ if (!resume)
527 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
528 ++ else
529 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
530 ++ } else if (entity->on_st)
531 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
532 ++
533 ++ /*
534 ++ * Here we use a reference to bfqg. We don't need a refcounter
535 ++ * as the cgroup reference will not be dropped, so that its
536 ++ * destroy() callback will not be invoked.
537 ++ */
538 ++ entity->parent = bfqg->my_entity;
539 ++ entity->sched_data = &bfqg->sched_data;
540 ++
541 ++ if (busy && resume)
542 ++ bfq_activate_bfqq(bfqd, bfqq);
543 ++
544 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
545 ++ bfq_schedule_dispatch(bfqd);
546 ++}
547 ++
548 ++/**
549 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
550 ++ * @bfqd: the queue descriptor.
551 ++ * @bic: the bic to move.
552 ++ * @cgroup: the cgroup to move to.
553 ++ *
554 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
555 ++ * has to make sure that the reference to cgroup is valid across the call.
556 ++ *
557 ++ * NOTE: an alternative approach might have been to store the current
558 ++ * cgroup in bfqq and get a reference to it, reducing the lookup
559 ++ * time here, at the price of slightly more complex code.
560 ++ */
561 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
562 ++ struct bfq_io_cq *bic,
563 ++ struct cgroup_subsys_state *css)
564 ++{
565 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
566 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
567 ++ struct bfq_entity *entity;
568 ++ struct bfq_group *bfqg;
569 ++ struct bfqio_cgroup *bgrp;
570 ++
571 ++ bgrp = css_to_bfqio(css);
572 ++
573 ++ bfqg = bfq_find_alloc_group(bfqd, css);
574 ++ if (async_bfqq != NULL) {
575 ++ entity = &async_bfqq->entity;
576 ++
577 ++ if (entity->sched_data != &bfqg->sched_data) {
578 ++ bic_set_bfqq(bic, NULL, 0);
579 ++ bfq_log_bfqq(bfqd, async_bfqq,
580 ++ "bic_change_group: %p %d",
581 ++ async_bfqq, atomic_read(&async_bfqq->ref));
582 ++ bfq_put_queue(async_bfqq);
583 ++ }
584 ++ }
585 ++
586 ++ if (sync_bfqq != NULL) {
587 ++ entity = &sync_bfqq->entity;
588 ++ if (entity->sched_data != &bfqg->sched_data)
589 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
590 ++ }
591 ++
592 ++ return bfqg;
593 ++}
594 ++
595 ++/**
596 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
597 ++ * @bic: the bic being migrated.
598 ++ * @cgroup: the destination cgroup.
599 ++ *
600 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
601 ++ * moved into its new parent group.
602 ++ */
603 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
604 ++ struct cgroup_subsys_state *css)
605 ++{
606 ++ struct bfq_data *bfqd;
607 ++ unsigned long uninitialized_var(flags);
608 ++
609 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
610 ++ &flags);
611 ++ if (bfqd != NULL) {
612 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
613 ++ bfq_put_bfqd_unlock(bfqd, &flags);
614 ++ }
615 ++}
616 ++
617 ++/**
618 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
619 ++ * @bic: the @bic to update.
620 ++ *
621 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
622 ++ * We need this in addition to moving bics during the cgroup attach
623 ++ * phase because the task owning @bic could be at its first disk
624 ++ * access or we may end up in the root cgroup as the result of a
625 ++ * memory allocation failure and here we try to move to the right
626 ++ * group.
627 ++ *
628 ++ * Must be called under the queue lock. It is safe to use the returned
629 ++ * value even after the rcu_read_unlock() as the migration/destruction
630 ++ * paths act under the queue lock too. IOW it is impossible to race with
631 ++ * group migration/destruction and end up with an invalid group as:
632 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
633 ++ * has started execution, as current holds a reference to it,
634 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
635 ++ * migrated to a different cgroup] its attach() callback will have
636 ++ * taken care of removing all the references to the old cgroup data.
637 ++ */
638 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
639 ++{
640 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
641 ++ struct bfq_group *bfqg;
642 ++ struct cgroup_subsys_state *css;
643 ++
644 ++ BUG_ON(bfqd == NULL);
645 ++
646 ++ rcu_read_lock();
647 ++ css = task_css(current, bfqio_cgrp_id);
648 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
649 ++ rcu_read_unlock();
650 ++
651 ++ return bfqg;
652 ++}
653 ++
654 ++/**
655 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
656 ++ * @st: the service tree being flushed.
657 ++ */
658 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
659 ++{
660 ++ struct bfq_entity *entity = st->first_idle;
661 ++
662 ++ for (; entity != NULL; entity = st->first_idle)
663 ++ __bfq_deactivate_entity(entity, 0);
664 ++}
665 ++
666 ++/**
667 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
668 ++ * @bfqd: the device data structure with the root group.
669 ++ * @entity: the entity to move.
670 ++ */
671 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
672 ++ struct bfq_entity *entity)
673 ++{
674 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
675 ++
676 ++ BUG_ON(bfqq == NULL);
677 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
678 ++ return;
679 ++}
680 ++
681 ++/**
682 ++ * bfq_reparent_active_entities - move to the root group all active
683 ++ * entities.
684 ++ * @bfqd: the device data structure with the root group.
685 ++ * @bfqg: the group to move from.
686 ++ * @st: the service tree with the entities.
687 ++ *
688 ++ * Needs queue_lock to be taken and reference to be valid over the call.
689 ++ */
690 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
691 ++ struct bfq_group *bfqg,
692 ++ struct bfq_service_tree *st)
693 ++{
694 ++ struct rb_root *active = &st->active;
695 ++ struct bfq_entity *entity = NULL;
696 ++
697 ++ if (!RB_EMPTY_ROOT(&st->active))
698 ++ entity = bfq_entity_of(rb_first(active));
699 ++
700 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
701 ++ bfq_reparent_leaf_entity(bfqd, entity);
702 ++
703 ++ if (bfqg->sched_data.in_service_entity != NULL)
704 ++ bfq_reparent_leaf_entity(bfqd,
705 ++ bfqg->sched_data.in_service_entity);
706 ++
707 ++ return;
708 ++}
709 ++
710 ++/**
711 ++ * bfq_destroy_group - destroy @bfqg.
712 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
713 ++ * @bfqg: the group being destroyed.
714 ++ *
715 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
716 ++ */
717 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
718 ++{
719 ++ struct bfq_data *bfqd;
720 ++ struct bfq_service_tree *st;
721 ++ struct bfq_entity *entity = bfqg->my_entity;
722 ++ unsigned long uninitialized_var(flags);
723 ++ int i;
724 ++
725 ++ hlist_del(&bfqg->group_node);
726 ++
727 ++ /*
728 ++ * Empty all service_trees belonging to this group before
729 ++ * deactivating the group itself.
730 ++ */
731 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
732 ++ st = bfqg->sched_data.service_tree + i;
733 ++
734 ++ /*
735 ++ * The idle tree may still contain bfq_queues belonging
736 ++ * to exited tasks because they never migrated to a different
737 ++ * cgroup from the one being destroyed now. No one else
738 ++ * can access them so it's safe to act without any lock.
739 ++ */
740 ++ bfq_flush_idle_tree(st);
741 ++
742 ++ /*
743 ++ * It may happen that some queues are still active
744 ++ * (busy) upon group destruction (if the corresponding
745 ++ * processes have been forced to terminate). We move
746 ++ * all the leaf entities corresponding to these queues
747 ++ * to the root_group.
748 ++ * Also, it may happen that the group has an entity
749 ++ * in service, which is disconnected from the active
750 ++ * tree: it must be moved, too.
751 ++ * There is no need to put the sync queues, as the
752 ++ * scheduler has taken no reference.
753 ++ */
754 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
755 ++ if (bfqd != NULL) {
756 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
757 ++ bfq_put_bfqd_unlock(bfqd, &flags);
758 ++ }
759 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
760 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
761 ++ }
762 ++ BUG_ON(bfqg->sched_data.next_in_service != NULL);
763 ++ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
764 ++
765 ++ /*
766 ++ * We may race with device destruction, take extra care when
767 ++ * dereferencing bfqg->bfqd.
768 ++ */
769 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
770 ++ if (bfqd != NULL) {
771 ++ hlist_del(&bfqg->bfqd_node);
772 ++ __bfq_deactivate_entity(entity, 0);
773 ++ bfq_put_async_queues(bfqd, bfqg);
774 ++ bfq_put_bfqd_unlock(bfqd, &flags);
775 ++ }
776 ++ BUG_ON(entity->tree != NULL);
777 ++
778 ++ /*
779 ++ * No need to defer the kfree() to the end of the RCU grace
780 ++ * period: we are called from the destroy() callback of our
781 ++ * cgroup, so we can be sure that no one is a) still using
782 ++ * this cgroup or b) doing lookups in it.
783 ++ */
784 ++ kfree(bfqg);
785 ++}
786 ++
787 ++static void bfq_end_wr_async(struct bfq_data *bfqd)
788 ++{
789 ++ struct hlist_node *tmp;
790 ++ struct bfq_group *bfqg;
791 ++
792 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
793 ++ bfq_end_wr_async_queues(bfqd, bfqg);
794 ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
795 ++}
796 ++
797 ++/**
798 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
799 ++ * @bfqd: the device descriptor being exited.
800 ++ *
801 ++ * When the device exits we just make sure that no lookup can return
802 ++ * the now unused group structures. They will be deallocated on cgroup
803 ++ * destruction.
804 ++ */
805 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
806 ++{
807 ++ struct hlist_node *tmp;
808 ++ struct bfq_group *bfqg;
809 ++
810 ++ bfq_log(bfqd, "disconnect_groups beginning");
811 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
812 ++ hlist_del(&bfqg->bfqd_node);
813 ++
814 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
815 ++
816 ++ /*
817 ++ * Don't remove from the group hash, just set an
818 ++ * invalid key. No lookups can race with the
819 ++ * assignment as bfqd is being destroyed; this
820 ++ * also implies that new elements cannot be added
821 ++ * to the list.
822 ++ */
823 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
824 ++
825 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
826 ++ bfqg);
827 ++ bfq_put_async_queues(bfqd, bfqg);
828 ++ }
829 ++}
830 ++
831 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
832 ++{
833 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
834 ++ struct bfq_group *bfqg = bfqd->root_group;
835 ++
836 ++ bfq_put_async_queues(bfqd, bfqg);
837 ++
838 ++ spin_lock_irq(&bgrp->lock);
839 ++ hlist_del_rcu(&bfqg->group_node);
840 ++ spin_unlock_irq(&bgrp->lock);
841 ++
842 ++ /*
843 ++ * No need to synchronize_rcu() here: since the device is gone
844 ++ * there cannot be any read-side access to its root_group.
845 ++ */
846 ++ kfree(bfqg);
847 ++}
848 ++
849 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
850 ++{
851 ++ struct bfq_group *bfqg;
852 ++ struct bfqio_cgroup *bgrp;
853 ++ int i;
854 ++
855 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
856 ++ if (bfqg == NULL)
857 ++ return NULL;
858 ++
859 ++ bfqg->entity.parent = NULL;
860 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
861 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
862 ++
863 ++ bgrp = &bfqio_root_cgroup;
864 ++ spin_lock_irq(&bgrp->lock);
865 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
866 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
867 ++ spin_unlock_irq(&bgrp->lock);
868 ++
869 ++ return bfqg;
870 ++}
871 ++
872 ++#define SHOW_FUNCTION(__VAR) \
873 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
874 ++ struct cftype *cftype) \
875 ++{ \
876 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
877 ++ u64 ret = -ENODEV; \
878 ++ \
879 ++ mutex_lock(&bfqio_mutex); \
880 ++ if (bfqio_is_removed(bgrp)) \
881 ++ goto out_unlock; \
882 ++ \
883 ++ spin_lock_irq(&bgrp->lock); \
884 ++ ret = bgrp->__VAR; \
885 ++ spin_unlock_irq(&bgrp->lock); \
886 ++ \
887 ++out_unlock: \
888 ++ mutex_unlock(&bfqio_mutex); \
889 ++ return ret; \
890 ++}
891 ++
892 ++SHOW_FUNCTION(weight);
893 ++SHOW_FUNCTION(ioprio);
894 ++SHOW_FUNCTION(ioprio_class);
895 ++#undef SHOW_FUNCTION
896 ++
897 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
898 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
899 ++ struct cftype *cftype, \
900 ++ u64 val) \
901 ++{ \
902 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
903 ++ struct bfq_group *bfqg; \
904 ++ int ret = -EINVAL; \
905 ++ \
906 ++ if (val < (__MIN) || val > (__MAX)) \
907 ++ return ret; \
908 ++ \
909 ++ ret = -ENODEV; \
910 ++ mutex_lock(&bfqio_mutex); \
911 ++ if (bfqio_is_removed(bgrp)) \
912 ++ goto out_unlock; \
913 ++ ret = 0; \
914 ++ \
915 ++ spin_lock_irq(&bgrp->lock); \
916 ++ bgrp->__VAR = (unsigned short)val; \
917 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
918 ++ /* \
919 ++ * Setting the ioprio_changed flag of the entity \
920 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
921 ++ * the value of the weight to its ioprio mapping. \
922 ++ * Set the flag only if necessary. \
923 ++ */ \
924 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
925 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
926 ++ /* \
927 ++ * Make sure that the above new value has been \
928 ++ * stored in bfqg->entity.new_##__VAR before \
929 ++ * setting the ioprio_changed flag. In fact, \
930 ++ * this flag may be read asynchronously (in \
931 ++ * critical sections protected by a different \
932 ++ * lock than that held here), and finding this \
933 ++ * flag set may cause the execution of the code \
934 ++ * for updating parameters whose value may \
935 ++ * depend also on bfqg->entity.new_##__VAR (in \
936 ++ * __bfq_entity_update_weight_prio). \
937 ++ * This barrier makes sure that the new value \
938 ++ * of bfqg->entity.new_##__VAR is correctly \
939 ++ * seen in that code. \
940 ++ */ \
941 ++ smp_wmb(); \
942 ++ bfqg->entity.ioprio_changed = 1; \
943 ++ } \
944 ++ } \
945 ++ spin_unlock_irq(&bgrp->lock); \
946 ++ \
947 ++out_unlock: \
948 ++ mutex_unlock(&bfqio_mutex); \
949 ++ return ret; \
950 ++}
951 ++
952 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
953 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
954 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
955 ++#undef STORE_FUNCTION
956 ++
957 ++static struct cftype bfqio_files[] = {
958 ++ {
959 ++ .name = "weight",
960 ++ .read_u64 = bfqio_cgroup_weight_read,
961 ++ .write_u64 = bfqio_cgroup_weight_write,
962 ++ },
963 ++ {
964 ++ .name = "ioprio",
965 ++ .read_u64 = bfqio_cgroup_ioprio_read,
966 ++ .write_u64 = bfqio_cgroup_ioprio_write,
967 ++ },
968 ++ {
969 ++ .name = "ioprio_class",
970 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
971 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
972 ++ },
973 ++ { }, /* terminate */
974 ++};
975 ++
976 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
977 ++ *parent_css)
978 ++{
979 ++ struct bfqio_cgroup *bgrp;
980 ++
981 ++ if (parent_css != NULL) {
982 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
983 ++ if (bgrp == NULL)
984 ++ return ERR_PTR(-ENOMEM);
985 ++ } else
986 ++ bgrp = &bfqio_root_cgroup;
987 ++
988 ++ spin_lock_init(&bgrp->lock);
989 ++ INIT_HLIST_HEAD(&bgrp->group_data);
990 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
991 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
992 ++
993 ++ return &bgrp->css;
994 ++}
995 ++
996 ++/*
997 ++ * We cannot support shared io contexts, as we have no means to support
998 ++ * two tasks with the same ioc in two different groups without major rework
999 ++ * of the main bic/bfqq data structures. For now we allow a task to change
1000 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
1001 ++ * behavior is that a group containing a task that forked using CLONE_IO
1002 ++ * will not be destroyed until the tasks sharing the ioc die.
1003 ++ */
1004 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
1005 ++ struct cgroup_taskset *tset)
1006 ++{
1007 ++ struct task_struct *task;
1008 ++ struct io_context *ioc;
1009 ++ int ret = 0;
1010 ++
1011 ++ cgroup_taskset_for_each(task, tset) {
1012 ++ /*
1013 ++ * task_lock() is needed to avoid races with
1014 ++ * exit_io_context()
1015 ++ */
1016 ++ task_lock(task);
1017 ++ ioc = task->io_context;
1018 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1019 ++ /*
1020 ++ * ioc == NULL means that the task is either too
1021 ++ * young or exiting: if it still has no ioc, the
1022 ++ * ioc can't be shared; if the task is exiting, the
1023 ++ * attach will fail anyway, no matter what we
1024 ++ * return here.
1025 ++ */
1026 ++ ret = -EINVAL;
1027 ++ task_unlock(task);
1028 ++ if (ret)
1029 ++ break;
1030 ++ }
1031 ++
1032 ++ return ret;
1033 ++}
1034 ++
1035 ++static void bfqio_attach(struct cgroup_subsys_state *css,
1036 ++ struct cgroup_taskset *tset)
1037 ++{
1038 ++ struct task_struct *task;
1039 ++ struct io_context *ioc;
1040 ++ struct io_cq *icq;
1041 ++
1042 ++ /*
1043 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
1044 ++ * new group has not yet been tested.
1045 ++ */
1046 ++ cgroup_taskset_for_each(task, tset) {
1047 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1048 ++ if (ioc) {
1049 ++ /*
1050 ++ * Handle cgroup change here.
1051 ++ */
1052 ++ rcu_read_lock();
1053 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1054 ++ if (!strncmp(
1055 ++ icq->q->elevator->type->elevator_name,
1056 ++ "bfq", ELV_NAME_MAX))
1057 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
1058 ++ css);
1059 ++ rcu_read_unlock();
1060 ++ put_io_context(ioc);
1061 ++ }
1062 ++ }
1063 ++}
1064 ++
1065 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
1066 ++{
1067 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1068 ++ struct hlist_node *tmp;
1069 ++ struct bfq_group *bfqg;
1070 ++
1071 ++ /*
1072 ++ * Since we are destroying the cgroup, there are no more tasks
1073 ++ * referencing it, and all the RCU grace periods that may have
1074 ++ * referenced it have ended (as the destruction of the parent
1075 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1076 ++ * anything else and we don't need any synchronization.
1077 ++ */
1078 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1079 ++ bfq_destroy_group(bgrp, bfqg);
1080 ++
1081 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
1082 ++
1083 ++ kfree(bgrp);
1084 ++}
1085 ++
1086 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
1087 ++{
1088 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1089 ++
1090 ++ mutex_lock(&bfqio_mutex);
1091 ++ bgrp->online = true;
1092 ++ mutex_unlock(&bfqio_mutex);
1093 ++
1094 ++ return 0;
1095 ++}
1096 ++
1097 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
1098 ++{
1099 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1100 ++
1101 ++ mutex_lock(&bfqio_mutex);
1102 ++ bgrp->online = false;
1103 ++ mutex_unlock(&bfqio_mutex);
1104 ++}
1105 ++
1106 ++struct cgroup_subsys bfqio_cgrp_subsys = {
1107 ++ .css_alloc = bfqio_create,
1108 ++ .css_online = bfqio_css_online,
1109 ++ .css_offline = bfqio_css_offline,
1110 ++ .can_attach = bfqio_can_attach,
1111 ++ .attach = bfqio_attach,
1112 ++ .css_free = bfqio_destroy,
1113 ++ .legacy_cftypes = bfqio_files,
1114 ++};
1115 ++#else
1116 ++static inline void bfq_init_entity(struct bfq_entity *entity,
1117 ++ struct bfq_group *bfqg)
1118 ++{
1119 ++ entity->weight = entity->new_weight;
1120 ++ entity->orig_weight = entity->new_weight;
1121 ++ entity->ioprio = entity->new_ioprio;
1122 ++ entity->ioprio_class = entity->new_ioprio_class;
1123 ++ entity->sched_data = &bfqg->sched_data;
1124 ++}
1125 ++
1126 ++static inline struct bfq_group *
1127 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1128 ++{
1129 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
1130 ++ return bfqd->root_group;
1131 ++}
1132 ++
1133 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1134 ++ struct bfq_queue *bfqq,
1135 ++ struct bfq_entity *entity,
1136 ++ struct bfq_group *bfqg)
1137 ++{
1138 ++}
1139 ++
1140 ++static void bfq_end_wr_async(struct bfq_data *bfqd)
1141 ++{
1142 ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
1143 ++}
1144 ++
1145 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1146 ++{
1147 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
1148 ++}
1149 ++
1150 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
1151 ++{
1152 ++ kfree(bfqd->root_group);
1153 ++}
1154 ++
1155 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1156 ++{
1157 ++ struct bfq_group *bfqg;
1158 ++ int i;
1159 ++
1160 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1161 ++ if (bfqg == NULL)
1162 ++ return NULL;
1163 ++
1164 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1165 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1166 ++
1167 ++ return bfqg;
1168 ++}
1169 ++#endif
1170 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1171 +new file mode 100644
1172 +index 0000000..7f6b000
1173 +--- /dev/null
1174 ++++ b/block/bfq-ioc.c
1175 +@@ -0,0 +1,36 @@
1176 ++/*
1177 ++ * BFQ: I/O context handling.
1178 ++ *
1179 ++ * Based on ideas and code from CFQ:
1180 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1181 ++ *
1182 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1183 ++ * Paolo Valente <paolo.valente@×××××××.it>
1184 ++ *
1185 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1186 ++ */
1187 ++
1188 ++/**
1189 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1190 ++ * @icq: the iocontext queue.
1191 ++ */
1192 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1193 ++{
1194 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
1195 ++ return container_of(icq, struct bfq_io_cq, icq);
1196 ++}
1197 ++
1198 ++/**
1199 ++ * bfq_bic_lookup - search @ioc for a bic associated with @bfqd.
1200 ++ * @bfqd: the lookup key.
1201 ++ * @ioc: the io_context of the process doing I/O.
1202 ++ *
1203 ++ * Queue lock must be held.
1204 ++ */
1205 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1206 ++ struct io_context *ioc)
1207 ++{
1208 ++ if (ioc)
1209 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1210 ++ return NULL;
1211 ++}
1212 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1213 +new file mode 100644
1214 +index 0000000..773b2ee
1215 +--- /dev/null
1216 ++++ b/block/bfq-iosched.c
1217 +@@ -0,0 +1,3898 @@
1218 ++/*
1219 ++ * Budget Fair Queueing (BFQ) disk scheduler.
1220 ++ *
1221 ++ * Based on ideas and code from CFQ:
1222 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1223 ++ *
1224 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1225 ++ * Paolo Valente <paolo.valente@×××××××.it>
1226 ++ *
1227 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1228 ++ *
1229 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
1230 ++ * file.
1231 ++ *
1232 ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on
1233 ++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
1234 ++ * measured in number of sectors, to processes instead of time slices. The
1235 ++ * device is not granted to the in-service process for a given time slice,
1236 ++ * but until it has exhausted its assigned budget. This change from the time
1237 ++ * to the service domain allows BFQ to distribute the device throughput
1238 ++ * among processes as desired, without any distortion due to ZBR, workload
1239 ++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
1240 ++ * called B-WF2Q+, to schedule processes according to their budgets. More
1241 ++ * precisely, BFQ schedules queues associated with processes. Thanks to the
1242 ++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
1243 ++ * I/O-bound processes issuing sequential requests (to boost the
1244 ++ * throughput), and yet guarantee a low latency to interactive and soft
1245 ++ * real-time applications.
1246 ++ *
1247 ++ * BFQ is described in [1], which also contains a reference to the
1248 ++ * initial, more theoretical paper on BFQ. In the latter paper the
1249 ++ * interested reader can find full details on the main algorithm, as well as
1250 ++ * formulas of the guarantees and formal proofs of all the properties.
1251 ++ * With respect to the version of BFQ presented in these papers, this
1252 ++ * implementation adds a few more heuristics, such as the one that
1253 ++ * guarantees a low latency to soft real-time applications, and a
1254 ++ * hierarchical extension based on H-WF2Q+.
1255 ++ *
1256 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1257 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1258 ++ * complexity derives from the one introduced with EEVDF in [3].
1259 ++ *
1260 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1261 ++ * with the BFQ Disk I/O Scheduler'',
1262 ++ * Proceedings of the 5th Annual International Systems and Storage
1263 ++ * Conference (SYSTOR '12), June 2012.
1264 ++ *
1265 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1266 ++ *
1267 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1268 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1269 ++ * Oct 1997.
1270 ++ *
1271 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1272 ++ *
1273 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1274 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
1275 ++ * Resource Allocation,'' technical report.
1276 ++ *
1277 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1278 ++ */
1279 ++#include <linux/module.h>
1280 ++#include <linux/slab.h>
1281 ++#include <linux/blkdev.h>
1282 ++#include <linux/cgroup.h>
1283 ++#include <linux/elevator.h>
1284 ++#include <linux/jiffies.h>
1285 ++#include <linux/rbtree.h>
1286 ++#include <linux/ioprio.h>
1287 ++#include "bfq.h"
1288 ++#include "blk.h"
1289 ++
1290 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1291 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1292 ++
1293 ++/* Maximum backwards seek, in KiB. */
1294 ++static const int bfq_back_max = 16 * 1024;
1295 ++
1296 ++/* Penalty of a backwards seek, in number of sectors. */
1297 ++static const int bfq_back_penalty = 2;
1298 ++
1299 ++/* Idling period duration, in jiffies. */
1300 ++static int bfq_slice_idle = HZ / 125;
1301 ++
1302 ++/* Default maximum budget values, in sectors and number of requests. */
1303 ++static const int bfq_default_max_budget = 16 * 1024;
1304 ++static const int bfq_max_budget_async_rq = 4;
1305 ++
1306 ++/*
1307 ++ * Async to sync throughput distribution is controlled as follows:
1308 ++ * when an async request is served, the entity is charged the number
1309 ++ * of sectors of the request, multiplied by the factor below
1310 ++ */
1311 ++static const int bfq_async_charge_factor = 10;
1312 ++
1313 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
1314 ++static const int bfq_timeout_sync = HZ / 8;
1315 ++static int bfq_timeout_async = HZ / 25;
1316 ++
1317 ++struct kmem_cache *bfq_pool;
1318 ++
1319 ++/* Below this threshold (in ms), we consider thinktime immediate. */
1320 ++#define BFQ_MIN_TT 2
1321 ++
1322 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
1323 ++#define BFQ_HW_QUEUE_THRESHOLD 4
1324 ++#define BFQ_HW_QUEUE_SAMPLES 32
1325 ++
1326 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1327 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1328 ++
1329 ++/* Min samples used for peak rate estimation (for autotuning). */
1330 ++#define BFQ_PEAK_RATE_SAMPLES 32
1331 ++
1332 ++/* Shift used for peak rate fixed precision calculations. */
1333 ++#define BFQ_RATE_SHIFT 16
1334 ++
1335 ++/*
1336 ++ * By default, BFQ computes the duration of the weight raising for
1337 ++ * interactive applications automatically, using the following formula:
1338 ++ * duration = (R / r) * T, where r is the peak rate of the device, and
1339 ++ * R and T are two reference parameters.
1340 ++ * In particular, R is the peak rate of the reference device (see below),
1341 ++ * and T is a reference time: given the systems that are likely to be
1342 ++ * installed on the reference device according to its speed class, T is
1343 ++ * about the maximum time needed, under BFQ and while reading two files in
1344 ++ * parallel, to load typical large applications on these systems.
1345 ++ * In practice, the slower/faster the device at hand is, the more/less it
1346 ++ * takes to load applications with respect to the reference device.
1347 ++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
1348 ++ * applications.
1349 ++ *
1350 ++ * BFQ uses four different reference pairs (R, T), depending on:
1351 ++ * . whether the device is rotational or non-rotational;
1352 ++ * . whether the device is slow, such as old or portable HDDs, as well as
1353 ++ * SD cards, or fast, such as newer HDDs and SSDs.
1354 ++ *
1355 ++ * The device's speed class is dynamically (re)detected in
1356 ++ * bfq_update_peak_rate() every time the estimated peak rate is updated.
1357 ++ *
1358 ++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
1359 ++ * are the reference values for a slow/fast rotational device, whereas
1360 ++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
1361 ++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the
1362 ++ * thresholds used to switch between speed classes.
1363 ++ * Both the reference peak rates and the thresholds are measured in
1364 ++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
1365 ++ */
1366 ++static int R_slow[2] = {1536, 10752};
1367 ++static int R_fast[2] = {17415, 34791};
1368 ++/*
1369 ++ * To improve readability, a conversion function is used to initialize the
1370 ++ * following arrays, which entails that they can be initialized only in a
1371 ++ * function.
1372 ++ */
1373 ++static int T_slow[2];
1374 ++static int T_fast[2];
1375 ++static int device_speed_thresh[2];
1376 ++
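
A minimal userspace sketch of the duration = (R / r) * T rule described above (not part of the patch). R_fast[0] = 17415 is taken from the declarations above; the reference time T and the helper name wr_duration used here are hypothetical, since the real T_slow/T_fast entries are computed at runtime:

#include <stdio.h>
#include <stdint.h>

/* duration = (R / r) * T: the slower the device (smaller r), the longer
 * the interactive weight raising lasts. */
static unsigned long wr_duration(uint64_t r, uint64_t R, unsigned long T)
{
	return (unsigned long)((R * T) / r);
}

int main(void)
{
	uint64_t R_ref = 17415;		/* reference peak rate, fast rotational */
	unsigned long T_ref = 1000;	/* hypothetical reference time, jiffies */

	/* A device matching the reference rate is raised for exactly T. */
	printf("r == R:   %lu jiffies\n", wr_duration(R_ref, R_ref, T_ref));
	/* Twice as fast as the reference -> half the duration. */
	printf("r == 2R:  %lu jiffies\n", wr_duration(2 * R_ref, R_ref, T_ref));
	/* Half as fast as the reference -> twice the duration. */
	printf("r == R/2: %lu jiffies\n", wr_duration(R_ref / 2, R_ref, T_ref));
	return 0;
}
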
1377 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1378 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1379 ++
1380 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1381 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1382 ++
1383 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1384 ++
1385 ++#include "bfq-ioc.c"
1386 ++#include "bfq-sched.c"
1387 ++#include "bfq-cgroup.c"
1388 ++
1389 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1390 ++ IOPRIO_CLASS_IDLE)
1391 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1392 ++ IOPRIO_CLASS_RT)
1393 ++
1394 ++#define bfq_sample_valid(samples) ((samples) > 80)
1395 ++
1396 ++/*
1397 ++ * The following macro groups conditions that need to be evaluated when
1398 ++ * checking if existing queues and groups form a symmetric scenario
1399 ++ * and therefore idling can be reduced or disabled for some of the
1400 ++ * queues. See the comment to the function bfq_bfqq_must_not_expire()
1401 ++ * for further details.
1402 ++ */
1403 ++#ifdef CONFIG_CGROUP_BFQIO
1404 ++#define symmetric_scenario (!bfqd->active_numerous_groups && \
1405 ++ !bfq_differentiated_weights(bfqd))
1406 ++#else
1407 ++#define symmetric_scenario (!bfq_differentiated_weights(bfqd))
1408 ++#endif
1409 ++
1410 ++/*
1411 ++ * We regard a request as SYNC if it is either a read or has the SYNC bit
1412 ++ * set (in which case it could also be a direct WRITE).
1413 ++ */
1414 ++static inline int bfq_bio_sync(struct bio *bio)
1415 ++{
1416 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1417 ++ return 1;
1418 ++
1419 ++ return 0;
1420 ++}
1421 ++
1422 ++/*
1423 ++ * Schedule a run of the queue if there are requests pending and none in the
1424 ++ * driver that will restart queueing.
1425 ++ */
1426 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1427 ++{
1428 ++ if (bfqd->queued != 0) {
1429 ++ bfq_log(bfqd, "schedule dispatch");
1430 ++ kblockd_schedule_work(&bfqd->unplug_work);
1431 ++ }
1432 ++}
1433 ++
1434 ++/*
1435 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1436 ++ * We choose the request that is closest to the head right now. Distance
1437 ++ * behind the head is penalized and only allowed to a certain extent.
1438 ++ */
1439 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
1440 ++ struct request *rq1,
1441 ++ struct request *rq2,
1442 ++ sector_t last)
1443 ++{
1444 ++ sector_t s1, s2, d1 = 0, d2 = 0;
1445 ++ unsigned long back_max;
1446 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1447 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1448 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1449 ++
1450 ++ if (rq1 == NULL || rq1 == rq2)
1451 ++ return rq2;
1452 ++ if (rq2 == NULL)
1453 ++ return rq1;
1454 ++
1455 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1456 ++ return rq1;
1457 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1458 ++ return rq2;
1459 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1460 ++ return rq1;
1461 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1462 ++ return rq2;
1463 ++
1464 ++ s1 = blk_rq_pos(rq1);
1465 ++ s2 = blk_rq_pos(rq2);
1466 ++
1467 ++ /*
1468 ++ * By definition, 1KiB is 2 sectors.
1469 ++ */
1470 ++ back_max = bfqd->bfq_back_max * 2;
1471 ++
1472 ++ /*
1473 ++ * Strict one way elevator _except_ in the case where we allow
1474 ++ * short backward seeks which are biased as twice the cost of a
1475 ++ * similar forward seek.
1476 ++ */
1477 ++ if (s1 >= last)
1478 ++ d1 = s1 - last;
1479 ++ else if (s1 + back_max >= last)
1480 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
1481 ++ else
1482 ++ wrap |= BFQ_RQ1_WRAP;
1483 ++
1484 ++ if (s2 >= last)
1485 ++ d2 = s2 - last;
1486 ++ else if (s2 + back_max >= last)
1487 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
1488 ++ else
1489 ++ wrap |= BFQ_RQ2_WRAP;
1490 ++
1491 ++ /* Found required data */
1492 ++
1493 ++ /*
1494 ++ * By doing switch() on the bit mask "wrap" we avoid having to
1495 ++ * check two variables for all permutations: --> faster!
1496 ++ */
1497 ++ switch (wrap) {
1498 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1499 ++ if (d1 < d2)
1500 ++ return rq1;
1501 ++ else if (d2 < d1)
1502 ++ return rq2;
1503 ++ else {
1504 ++ if (s1 >= s2)
1505 ++ return rq1;
1506 ++ else
1507 ++ return rq2;
1508 ++ }
1509 ++
1510 ++ case BFQ_RQ2_WRAP:
1511 ++ return rq1;
1512 ++ case BFQ_RQ1_WRAP:
1513 ++ return rq2;
1514 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1515 ++ default:
1516 ++ /*
1517 ++ * Since both rqs are wrapped,
1518 ++ * start with the one that's further behind head
1519 ++ * (--> only *one* back seek required),
1520 ++ * since back seek takes more time than forward.
1521 ++ */
1522 ++ if (s1 <= s2)
1523 ++ return rq1;
1524 ++ else
1525 ++ return rq2;
1526 ++ }
1527 ++}
1528 ++
1529 ++static struct bfq_queue *
1530 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1531 ++ sector_t sector, struct rb_node **ret_parent,
1532 ++ struct rb_node ***rb_link)
1533 ++{
1534 ++ struct rb_node **p, *parent;
1535 ++ struct bfq_queue *bfqq = NULL;
1536 ++
1537 ++ parent = NULL;
1538 ++ p = &root->rb_node;
1539 ++ while (*p) {
1540 ++ struct rb_node **n;
1541 ++
1542 ++ parent = *p;
1543 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1544 ++
1545 ++ /*
1546 ++ * Sort strictly based on sector. Smallest to the left,
1547 ++ * largest to the right.
1548 ++ */
1549 ++ if (sector > blk_rq_pos(bfqq->next_rq))
1550 ++ n = &(*p)->rb_right;
1551 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
1552 ++ n = &(*p)->rb_left;
1553 ++ else
1554 ++ break;
1555 ++ p = n;
1556 ++ bfqq = NULL;
1557 ++ }
1558 ++
1559 ++ *ret_parent = parent;
1560 ++ if (rb_link)
1561 ++ *rb_link = p;
1562 ++
1563 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1564 ++ (long long unsigned)sector,
1565 ++ bfqq != NULL ? bfqq->pid : 0);
1566 ++
1567 ++ return bfqq;
1568 ++}
1569 ++
1570 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1571 ++{
1572 ++ struct rb_node **p, *parent;
1573 ++ struct bfq_queue *__bfqq;
1574 ++
1575 ++ if (bfqq->pos_root != NULL) {
1576 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1577 ++ bfqq->pos_root = NULL;
1578 ++ }
1579 ++
1580 ++ if (bfq_class_idle(bfqq))
1581 ++ return;
1582 ++ if (!bfqq->next_rq)
1583 ++ return;
1584 ++
1585 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
1586 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1587 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
1588 ++ if (__bfqq == NULL) {
1589 ++ rb_link_node(&bfqq->pos_node, parent, p);
1590 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1591 ++ } else
1592 ++ bfqq->pos_root = NULL;
1593 ++}
1594 ++
1595 ++/*
1596 ++ * Tell whether there are active queues or groups with differentiated weights.
1597 ++ */
1598 ++static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
1599 ++{
1600 ++ /*
1601 ++ * For weights to differ, at least one of the trees must contain
1602 ++ * at least two nodes.
1603 ++ */
1604 ++ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
1605 ++ (bfqd->queue_weights_tree.rb_node->rb_left ||
1606 ++ bfqd->queue_weights_tree.rb_node->rb_right)
1607 ++#ifdef CONFIG_CGROUP_BFQIO
1608 ++ ) ||
1609 ++ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
1610 ++ (bfqd->group_weights_tree.rb_node->rb_left ||
1611 ++ bfqd->group_weights_tree.rb_node->rb_right)
1612 ++#endif
1613 ++ );
1614 ++}
1615 ++
1616 ++/*
1617 ++ * If the weight-counter tree passed as input contains no counter for
1618 ++ * the weight of the input entity, then add that counter; otherwise just
1619 ++ * increment the existing counter.
1620 ++ *
1621 ++ * Note that weight-counter trees contain few nodes in mostly symmetric
1622 ++ * scenarios. For example, if all queues have the same weight, then the
1623 ++ * weight-counter tree for the queues may contain at most one node.
1624 ++ * This holds even if low_latency is on, because weight-raised queues
1625 ++ * are not inserted in the tree.
1626 ++ * In most scenarios, the rate at which nodes are created/destroyed
1627 ++ * should be low too.
1628 ++ */
1629 ++static void bfq_weights_tree_add(struct bfq_data *bfqd,
1630 ++ struct bfq_entity *entity,
1631 ++ struct rb_root *root)
1632 ++{
1633 ++ struct rb_node **new = &(root->rb_node), *parent = NULL;
1634 ++
1635 ++ /*
1636 ++ * Do not insert if the entity is already associated with a
1637 ++ * counter, which happens if:
1638 ++ * 1) the entity is associated with a queue,
1639 ++ * 2) a request arrival has caused the queue to become both
1640 ++ * non-weight-raised, and hence change its weight, and
1641 ++ * backlogged; in this respect, each of the two events
1642 ++ * causes an invocation of this function,
1643 ++ * 3) this is the invocation of this function caused by the
1644 ++ * second event. This second invocation is actually useless,
1645 ++ * and we handle this fact by exiting immediately. More
1646 ++ * efficient or clearer solutions might possibly be adopted.
1647 ++ */
1648 ++ if (entity->weight_counter)
1649 ++ return;
1650 ++
1651 ++ while (*new) {
1652 ++ struct bfq_weight_counter *__counter = container_of(*new,
1653 ++ struct bfq_weight_counter,
1654 ++ weights_node);
1655 ++ parent = *new;
1656 ++
1657 ++ if (entity->weight == __counter->weight) {
1658 ++ entity->weight_counter = __counter;
1659 ++ goto inc_counter;
1660 ++ }
1661 ++ if (entity->weight < __counter->weight)
1662 ++ new = &((*new)->rb_left);
1663 ++ else
1664 ++ new = &((*new)->rb_right);
1665 ++ }
1666 ++
1667 ++ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
1668 ++ GFP_ATOMIC);
1669 ++ entity->weight_counter->weight = entity->weight;
1670 ++ rb_link_node(&entity->weight_counter->weights_node, parent, new);
1671 ++ rb_insert_color(&entity->weight_counter->weights_node, root);
1672 ++
1673 ++inc_counter:
1674 ++ entity->weight_counter->num_active++;
1675 ++}
1676 ++
1677 ++/*
1678 ++ * Decrement the weight counter associated with the entity, and, if the
1679 ++ * counter reaches 0, remove the counter from the tree.
1680 ++ * See the comments to the function bfq_weights_tree_add() for considerations
1681 ++ * about overhead.
1682 ++ */
1683 ++static void bfq_weights_tree_remove(struct bfq_data *bfqd,
1684 ++ struct bfq_entity *entity,
1685 ++ struct rb_root *root)
1686 ++{
1687 ++ if (!entity->weight_counter)
1688 ++ return;
1689 ++
1690 ++ BUG_ON(RB_EMPTY_ROOT(root));
1691 ++ BUG_ON(entity->weight_counter->weight != entity->weight);
1692 ++
1693 ++ BUG_ON(!entity->weight_counter->num_active);
1694 ++ entity->weight_counter->num_active--;
1695 ++ if (entity->weight_counter->num_active > 0)
1696 ++ goto reset_entity_pointer;
1697 ++
1698 ++ rb_erase(&entity->weight_counter->weights_node, root);
1699 ++ kfree(entity->weight_counter);
1700 ++
1701 ++reset_entity_pointer:
1702 ++ entity->weight_counter = NULL;
1703 ++}
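/*
 * Illustrative sketch, not part of the BFQ patch: the same per-weight
 * reference counting performed by bfq_weights_tree_add() above (add path
 * only), over a tiny fixed array instead of an rb_root, to make the
 * "differentiated weights" test concrete. All names and values are
 * hypothetical.
 */
#include <stdio.h>

struct weight_counter { unsigned int weight, num_active; };

static struct weight_counter counters[8];
static unsigned int nr_counters;

static void weight_add(unsigned int weight)
{
	for (unsigned int i = 0; i < nr_counters; i++)
		if (counters[i].weight == weight) {
			counters[i].num_active++;	/* existing weight */
			return;
		}
	counters[nr_counters].weight = weight;		/* new distinct weight */
	counters[nr_counters++].num_active = 1;
}

static int differentiated_weights(void)
{
	/* equivalent of "the weights tree has at least two nodes" */
	return nr_counters > 1;
}

int main(void)
{
	weight_add(100); weight_add(100); weight_add(100);
	printf("%d\n", differentiated_weights());	/* 0: one weight only */
	weight_add(300);
	printf("%d\n", differentiated_weights());	/* 1: weights differ  */
	return 0;
}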
1704 ++
1705 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1706 ++ struct bfq_queue *bfqq,
1707 ++ struct request *last)
1708 ++{
1709 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
1710 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
1711 ++ struct request *next = NULL, *prev = NULL;
1712 ++
1713 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1714 ++
1715 ++ if (rbprev != NULL)
1716 ++ prev = rb_entry_rq(rbprev);
1717 ++
1718 ++ if (rbnext != NULL)
1719 ++ next = rb_entry_rq(rbnext);
1720 ++ else {
1721 ++ rbnext = rb_first(&bfqq->sort_list);
1722 ++ if (rbnext && rbnext != &last->rb_node)
1723 ++ next = rb_entry_rq(rbnext);
1724 ++ }
1725 ++
1726 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1727 ++}
1728 ++
1729 ++/* see the definition of bfq_async_charge_factor for details */
1730 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
1731 ++ struct bfq_queue *bfqq)
1732 ++{
1733 ++ return blk_rq_sectors(rq) *
1734 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
1735 ++ bfq_async_charge_factor));
1736 ++}
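/*
 * Illustrative sketch, not part of the BFQ patch: the charging rule of
 * bfq_serv_to_charge() above on plain numbers. A sync queue, or any
 * weight-raised queue (wr_coeff > 1), is charged the raw request size in
 * sectors; an async, non-weight-raised queue is charged (1 + factor)
 * times that size. The factor value used in the example is hypothetical.
 */
static unsigned long serv_to_charge(unsigned long sectors, int sync,
				    unsigned long wr_coeff,
				    unsigned long async_charge_factor)
{
	return sectors * (1 + (!sync) * (wr_coeff == 1) * async_charge_factor);
}
/*
 * With factor = 10: a 64-sector sync read is charged 64, while a
 * 64-sector async write with wr_coeff == 1 is charged 64 * 11 = 704,
 * which gives async queues later timestamps and hence less frequent
 * service.
 */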
1737 ++
1738 ++/**
1739 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
1740 ++ * @bfqd: the device data the queue belongs to.
1741 ++ * @bfqq: the queue to update.
1742 ++ *
1743 ++ * If the first request of a queue changes we make sure that the queue
1744 ++ * has enough budget to serve at least its first request (if the
1745 ++ * request has grown). We do this because if the queue has not enough
1746 ++ * budget for its first request, it has to go through two dispatch
1747 ++ * rounds to actually get it dispatched.
1748 ++ */
1749 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
1750 ++ struct bfq_queue *bfqq)
1751 ++{
1752 ++ struct bfq_entity *entity = &bfqq->entity;
1753 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1754 ++ struct request *next_rq = bfqq->next_rq;
1755 ++ unsigned long new_budget;
1756 ++
1757 ++ if (next_rq == NULL)
1758 ++ return;
1759 ++
1760 ++ if (bfqq == bfqd->in_service_queue)
1761 ++ /*
1762 ++ * In order not to break guarantees, budgets cannot be
1763 ++ * changed after an entity has been selected.
1764 ++ */
1765 ++ return;
1766 ++
1767 ++ BUG_ON(entity->tree != &st->active);
1768 ++ BUG_ON(entity == entity->sched_data->in_service_entity);
1769 ++
1770 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
1771 ++ bfq_serv_to_charge(next_rq, bfqq));
1772 ++ if (entity->budget != new_budget) {
1773 ++ entity->budget = new_budget;
1774 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
1775 ++ new_budget);
1776 ++ bfq_activate_bfqq(bfqd, bfqq);
1777 ++ }
1778 ++}
1779 ++
1780 ++static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
1781 ++{
1782 ++ u64 dur;
1783 ++
1784 ++ if (bfqd->bfq_wr_max_time > 0)
1785 ++ return bfqd->bfq_wr_max_time;
1786 ++
1787 ++ dur = bfqd->RT_prod;
1788 ++ do_div(dur, bfqd->peak_rate);
1789 ++
1790 ++ return dur;
1791 ++}
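/*
 * Illustrative note, not part of the BFQ patch: unless a fixed
 * bfq_wr_max_time is configured, bfq_wr_duration() above returns
 * RT_prod / peak_rate, i.e. a reference rate-times-duration product
 * divided by the estimated peak rate. With hypothetical numbers: if
 * RT_prod was built as R * T = 1000 * 6000 = 6,000,000 and the measured
 * peak_rate is 2000 (same rate unit as R), the weight-raising duration
 * becomes 6,000,000 / 2000 = 3000 time units -- a faster device gets a
 * proportionally shorter weight-raising period.
 */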
1792 ++
1793 ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
1794 ++static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
1795 ++ struct bfq_queue *bfqq)
1796 ++{
1797 ++ struct bfq_queue *item;
1798 ++ struct hlist_node *n;
1799 ++
1800 ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
1801 ++ hlist_del_init(&item->burst_list_node);
1802 ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1803 ++ bfqd->burst_size = 1;
1804 ++}
1805 ++
1806 ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
1807 ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1808 ++{
1809 ++ /* Increment burst size to take into account also bfqq */
1810 ++ bfqd->burst_size++;
1811 ++
1812 ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
1813 ++ struct bfq_queue *pos, *bfqq_item;
1814 ++ struct hlist_node *n;
1815 ++
1816 ++ /*
1817 ++ * Enough queues have been activated shortly after each
1818 ++ * other to consider this burst as large.
1819 ++ */
1820 ++ bfqd->large_burst = true;
1821 ++
1822 ++ /*
1823 ++ * We can now mark all queues in the burst list as
1824 ++ * belonging to a large burst.
1825 ++ */
1826 ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
1827 ++ burst_list_node)
1828 ++ bfq_mark_bfqq_in_large_burst(bfqq_item);
1829 ++ bfq_mark_bfqq_in_large_burst(bfqq);
1830 ++
1831 ++ /*
1832 ++ * From now on, and until the current burst finishes, any
1833 ++ * new queue being activated shortly after the last queue
1834 ++ * was inserted in the burst can be immediately marked as
1835 ++ * belonging to a large burst. So the burst list is not
1836 ++ * needed any more. Remove it.
1837 ++ */
1838 ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
1839 ++ burst_list_node)
1840 ++ hlist_del_init(&pos->burst_list_node);
1841 ++ } else /* burst not yet large: add bfqq to the burst list */
1842 ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1843 ++}
1844 ++
1845 ++/*
1846 ++ * If many queues happen to become active shortly after each other, then,
1847 ++ * to help the processes associated to these queues get their job done as
1848 ++ * soon as possible, it is usually better to not grant either weight-raising
1849 ++ * or device idling to these queues. In this comment we describe, firstly,
1850 ++ * the reasons why this fact holds, and, secondly, the next function, which
1851 ++ * implements the main steps needed to properly mark these queues so that
1852 ++ * they can then be treated in a different way.
1853 ++ *
1854 ++ * As for the terminology, we say that a queue becomes active, i.e.,
1855 ++ * switches from idle to backlogged, either when it is created (as a
1856 ++ * consequence of the arrival of an I/O request), or, if already existing,
1857 ++ * when a new request for the queue arrives while the queue is idle.
1858 ++ * Bursts of activations, i.e., activations of different queues occurring
1859 ++ * shortly after each other, are typically caused by services or applications
1860 ++ * that spawn or reactivate many parallel threads/processes. Examples are
1861 ++ * systemd during boot or git grep.
1862 ++ *
1863 ++ * These services or applications benefit mostly from a high throughput:
1864 ++ * the quicker the requests of the activated queues are cumulatively served,
1865 ++ * the sooner the target job of these queues gets completed. As a consequence,
1866 ++ * weight-raising any of these queues, which also implies idling the device
1867 ++ * for it, is almost always counterproductive: in most cases it just lowers
1868 ++ * throughput.
1869 ++ *
1870 ++ * On the other hand, a burst of activations may also be caused by the start
1871 ++ * of an application that does not consist of a lot of parallel I/O-bound
1872 ++ * threads. In fact, with a complex application, the burst may be just a
1873 ++ * consequence of the fact that several processes need to be executed to
1874 ++ * start up the application. To start an application as quickly as possible,
1875 ++ * the best thing to do is to privilege the I/O related to the application
1876 ++ * with respect to all other I/O. Therefore, the best strategy to start as
1877 ++ * quickly as possible an application that causes a burst of activations is
1878 ++ * to weight-raise all the queues activated during the burst. This is the
1879 ++ * exact opposite of the best strategy for the other type of bursts.
1880 ++ *
1881 ++ * In the end, to take the best action for each of the two cases, the two
1882 ++ * types of bursts need to be distinguished. Fortunately, this seems
1883 ++ * relatively easy to do, by looking at the sizes of the bursts. In
1884 ++ * particular, we found a threshold such that bursts with a larger size
1885 ++ * than that threshold are apparently caused only by services or commands
1886 ++ * such as systemd or git grep. For brevity, hereafter we call just 'large'
1887 ++ * these bursts. BFQ *does not* weight-raise queues whose activations occur
1888 ++ * in a large burst. In addition, for each of these queues BFQ performs or
1889 ++ * does not perform idling depending on which choice boosts the throughput
1890 ++ * most. The exact choice depends on the device and request pattern at
1891 ++ * hand.
1892 ++ *
1893 ++ * Turning back to the next function, it implements all the steps needed
1894 ++ * to detect the occurrence of a large burst and to properly mark all the
1895 ++ * queues belonging to it (so that they can then be treated in a different
1896 ++ * way). This goal is achieved by maintaining a special "burst list" that
1897 ++ * holds, temporarily, the queues that belong to the burst in progress. The
1898 ++ * list is then used to mark these queues as belonging to a large burst if
1899 ++ * the burst does become large. The main steps are the following.
1900 ++ *
1901 ++ * . when the very first queue is activated, the queue is inserted into the
1902 ++ * list (as it could be the first queue in a possible burst)
1903 ++ *
1904 ++ * . if the current burst has not yet become large, and a queue Q that does
1905 ++ * not yet belong to the burst is activated shortly after the last time
1906 ++ * at which a new queue entered the burst list, then the function appends
1907 ++ * Q to the burst list
1908 ++ *
1909 ++ * . if, as a consequence of the previous step, the burst size reaches
1910 ++ * the large-burst threshold, then
1911 ++ *
1912 ++ * . all the queues in the burst list are marked as belonging to a
1913 ++ * large burst
1914 ++ *
1915 ++ * . the burst list is deleted; in fact, the burst list already served
1916 ++ * its purpose (keeping temporarily track of the queues in a burst,
1917 ++ * so as to be able to mark them as belonging to a large burst in the
1918 ++ * previous sub-step), and now is not needed any more
1919 ++ *
1920 ++ * . the device enters a large-burst mode
1921 ++ *
1922 ++ * . if a queue Q that does not belong to the burst is activated while
1923 ++ * the device is in large-burst mode and shortly after the last time
1924 ++ * at which a queue either entered the burst list or was marked as
1925 ++ * belonging to the current large burst, then Q is immediately marked
1926 ++ * as belonging to a large burst.
1927 ++ *
1928 ++ * . if a queue Q that does not belong to the burst is activated a while
1929 ++ *   later, i.e., not shortly after the last time at which a queue
1930 ++ *   either entered the burst list or was marked as belonging to the
1931 ++ *   current large burst, then the current burst is deemed finished and:
1932 ++ *
1933 ++ * . the large-burst mode is reset if set
1934 ++ *
1935 ++ * . the burst list is emptied
1936 ++ *
1937 ++ * . Q is inserted in the burst list, as Q may be the first queue
1938 ++ * in a possible new burst (then the burst list contains just Q
1939 ++ * after this step).
1940 ++ */
1941 ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1942 ++ bool idle_for_long_time)
1943 ++{
1944 ++ /*
1945 ++ * If bfqq happened to be activated in a burst, but has been idle
1946 ++ * for at least as long as an interactive queue, then we assume
1947 ++ * that, in the overall I/O initiated in the burst, the I/O
1948 ++ * associated to bfqq is finished. So bfqq does not need to be
1949 ++ * treated as a queue belonging to a burst anymore. Accordingly,
1950 ++ * we reset bfqq's in_large_burst flag if set, and remove bfqq
1951 ++	 * from the burst list if it's there. We do not, however, decrement
1952 ++ * burst_size, because the fact that bfqq does not need to belong
1953 ++ * to the burst list any more does not invalidate the fact that
1954 ++ * bfqq may have been activated during the current burst.
1955 ++ */
1956 ++ if (idle_for_long_time) {
1957 ++ hlist_del_init(&bfqq->burst_list_node);
1958 ++ bfq_clear_bfqq_in_large_burst(bfqq);
1959 ++ }
1960 ++
1961 ++ /*
1962 ++ * If bfqq is already in the burst list or is part of a large
1963 ++ * burst, then there is nothing else to do.
1964 ++ */
1965 ++ if (!hlist_unhashed(&bfqq->burst_list_node) ||
1966 ++ bfq_bfqq_in_large_burst(bfqq))
1967 ++ return;
1968 ++
1969 ++ /*
1970 ++ * If bfqq's activation happens late enough, then the current
1971 ++ * burst is finished, and related data structures must be reset.
1972 ++ *
1973 ++ * In this respect, consider the special case where bfqq is the very
1974 ++ * first queue being activated. In this case, last_ins_in_burst is
1975 ++ * not yet significant when we get here. But it is easy to verify
1976 ++ * that, whether or not the following condition is true, bfqq will
1977 ++ * end up being inserted into the burst list. In particular the
1978 ++ * list will happen to contain only bfqq. And this is exactly what
1979 ++ * has to happen, as bfqq may be the first queue in a possible
1980 ++ * burst.
1981 ++ */
1982 ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst +
1983 ++ bfqd->bfq_burst_interval)) {
1984 ++ bfqd->large_burst = false;
1985 ++ bfq_reset_burst_list(bfqd, bfqq);
1986 ++ return;
1987 ++ }
1988 ++
1989 ++ /*
1990 ++ * If we get here, then bfqq is being activated shortly after the
1991 ++ * last queue. So, if the current burst is also large, we can mark
1992 ++ * bfqq as belonging to this large burst immediately.
1993 ++ */
1994 ++ if (bfqd->large_burst) {
1995 ++ bfq_mark_bfqq_in_large_burst(bfqq);
1996 ++ return;
1997 ++ }
1998 ++
1999 ++ /*
2000 ++ * If we get here, then a large-burst state has not yet been
2001 ++ * reached, but bfqq is being activated shortly after the last
2002 ++ * queue. Then we add bfqq to the burst.
2003 ++ */
2004 ++ bfq_add_to_burst(bfqd, bfqq);
2005 ++}
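/*
 * Illustrative sketch, not part of the BFQ patch: a much simplified,
 * stand-alone simulation of the burst classification performed by
 * bfq_handle_burst() above (the burst list itself is not modeled, only
 * burst_size and the large-burst flag). The interval and threshold
 * values, and the activation times, are hypothetical.
 */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	const unsigned long interval = 20, thresh = 3;
	unsigned long act[] = { 0, 5, 12, 15, 200 };	/* activation times */
	unsigned long last_ins = 0, burst_size = 0;
	bool large = false;

	for (int i = 0; i < 5; i++) {
		if (burst_size == 0 || act[i] > last_ins + interval) {
			/* first queue, or activated late: burst finished */
			large = false;
			burst_size = 1;
		} else if (large) {
			/* burst already large: queue marked immediately */
		} else if (++burst_size >= thresh) {
			large = true;	/* burst has just become large */
		}
		last_ins = act[i];
		printf("Q%d at t=%lu: burst_size=%lu large=%d\n",
		       i + 1, act[i], burst_size, large ? 1 : 0);
	}
	return 0;
}
/*
 * Expected outcome: Q1-Q3 form a burst that becomes large at Q3, Q4 is
 * marked as belonging to a large burst immediately, and Q5, arriving
 * much later, starts a new (small) burst.
 */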
2006 ++
2007 ++static void bfq_add_request(struct request *rq)
2008 ++{
2009 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2010 ++ struct bfq_entity *entity = &bfqq->entity;
2011 ++ struct bfq_data *bfqd = bfqq->bfqd;
2012 ++ struct request *next_rq, *prev;
2013 ++ unsigned long old_wr_coeff = bfqq->wr_coeff;
2014 ++ bool interactive = false;
2015 ++
2016 ++ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
2017 ++ bfqq->queued[rq_is_sync(rq)]++;
2018 ++ bfqd->queued++;
2019 ++
2020 ++ elv_rb_add(&bfqq->sort_list, rq);
2021 ++
2022 ++ /*
2023 ++ * Check if this request is a better next-serve candidate.
2024 ++ */
2025 ++ prev = bfqq->next_rq;
2026 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
2027 ++ BUG_ON(next_rq == NULL);
2028 ++ bfqq->next_rq = next_rq;
2029 ++
2030 ++ /*
2031 ++ * Adjust priority tree position, if next_rq changes.
2032 ++ */
2033 ++ if (prev != bfqq->next_rq)
2034 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2035 ++
2036 ++ if (!bfq_bfqq_busy(bfqq)) {
2037 ++ bool soft_rt,
2038 ++ idle_for_long_time = time_is_before_jiffies(
2039 ++ bfqq->budget_timeout +
2040 ++ bfqd->bfq_wr_min_idle_time);
2041 ++
2042 ++ if (bfq_bfqq_sync(bfqq)) {
2043 ++ bool already_in_burst =
2044 ++ !hlist_unhashed(&bfqq->burst_list_node) ||
2045 ++ bfq_bfqq_in_large_burst(bfqq);
2046 ++ bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
2047 ++ /*
2048 ++ * If bfqq was not already in the current burst,
2049 ++ * then, at this point, bfqq either has been
2050 ++ * added to the current burst or has caused the
2051 ++ * current burst to terminate. In particular, in
2052 ++ * the second case, bfqq has become the first
2053 ++ * queue in a possible new burst.
2054 ++ * In both cases last_ins_in_burst needs to be
2055 ++ * moved forward.
2056 ++ */
2057 ++ if (!already_in_burst)
2058 ++ bfqd->last_ins_in_burst = jiffies;
2059 ++ }
2060 ++
2061 ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
2062 ++ !bfq_bfqq_in_large_burst(bfqq) &&
2063 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
2064 ++ interactive = !bfq_bfqq_in_large_burst(bfqq) &&
2065 ++ idle_for_long_time;
2066 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
2067 ++ bfq_serv_to_charge(next_rq, bfqq));
2068 ++
2069 ++ if (!bfq_bfqq_IO_bound(bfqq)) {
2070 ++ if (time_before(jiffies,
2071 ++ RQ_BIC(rq)->ttime.last_end_request +
2072 ++ bfqd->bfq_slice_idle)) {
2073 ++ bfqq->requests_within_timer++;
2074 ++ if (bfqq->requests_within_timer >=
2075 ++ bfqd->bfq_requests_within_timer)
2076 ++ bfq_mark_bfqq_IO_bound(bfqq);
2077 ++ } else
2078 ++ bfqq->requests_within_timer = 0;
2079 ++ }
2080 ++
2081 ++ if (!bfqd->low_latency)
2082 ++ goto add_bfqq_busy;
2083 ++
2084 ++ /*
2085 ++ * If the queue is not being boosted and has been idle
2086 ++ * for enough time, start a weight-raising period
2087 ++ */
2088 ++ if (old_wr_coeff == 1 && (interactive || soft_rt)) {
2089 ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
2090 ++ if (interactive)
2091 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2092 ++ else
2093 ++ bfqq->wr_cur_max_time =
2094 ++ bfqd->bfq_wr_rt_max_time;
2095 ++ bfq_log_bfqq(bfqd, bfqq,
2096 ++ "wrais starting at %lu, rais_max_time %u",
2097 ++ jiffies,
2098 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
2099 ++ } else if (old_wr_coeff > 1) {
2100 ++ if (interactive)
2101 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2102 ++ else if (bfq_bfqq_in_large_burst(bfqq) ||
2103 ++ (bfqq->wr_cur_max_time ==
2104 ++ bfqd->bfq_wr_rt_max_time &&
2105 ++ !soft_rt)) {
2106 ++ bfqq->wr_coeff = 1;
2107 ++ bfq_log_bfqq(bfqd, bfqq,
2108 ++ "wrais ending at %lu, rais_max_time %u",
2109 ++ jiffies,
2110 ++ jiffies_to_msecs(bfqq->
2111 ++ wr_cur_max_time));
2112 ++ } else if (time_before(
2113 ++ bfqq->last_wr_start_finish +
2114 ++ bfqq->wr_cur_max_time,
2115 ++ jiffies +
2116 ++ bfqd->bfq_wr_rt_max_time) &&
2117 ++ soft_rt) {
2118 ++ /*
2119 ++ *
2120 ++ * The remaining weight-raising time is lower
2121 ++ * than bfqd->bfq_wr_rt_max_time, which
2122 ++ * means that the application is enjoying
2123 ++ * weight raising either because deemed soft-
2124 ++ * rt in the near past, or because deemed
2125 ++				 * interactive long ago. In both cases,
2126 ++ * resetting now the current remaining weight-
2127 ++ * raising time for the application to the
2128 ++ * weight-raising duration for soft rt
2129 ++ * applications would not cause any latency
2130 ++ * increase for the application (as the new
2131 ++ * duration would be higher than the remaining
2132 ++ * time).
2133 ++ *
2134 ++ * In addition, the application is now meeting
2135 ++ * the requirements for being deemed soft rt.
2136 ++ * In the end we can correctly and safely
2137 ++ * (re)charge the weight-raising duration for
2138 ++ * the application with the weight-raising
2139 ++ * duration for soft rt applications.
2140 ++ *
2141 ++ * In particular, doing this recharge now, i.e.,
2142 ++ * before the weight-raising period for the
2143 ++ * application finishes, reduces the probability
2144 ++ * of the following negative scenario:
2145 ++ * 1) the weight of a soft rt application is
2146 ++ * raised at startup (as for any newly
2147 ++ * created application),
2148 ++ * 2) since the application is not interactive,
2149 ++ * at a certain time weight-raising is
2150 ++ * stopped for the application,
2151 ++ * 3) at that time the application happens to
2152 ++ * still have pending requests, and hence
2153 ++ * is destined to not have a chance to be
2154 ++ * deemed soft rt before these requests are
2155 ++ * completed (see the comments to the
2156 ++ * function bfq_bfqq_softrt_next_start()
2157 ++ * for details on soft rt detection),
2158 ++ * 4) these pending requests experience a high
2159 ++ * latency because the application is not
2160 ++ * weight-raised while they are pending.
2161 ++ */
2162 ++ bfqq->last_wr_start_finish = jiffies;
2163 ++ bfqq->wr_cur_max_time =
2164 ++ bfqd->bfq_wr_rt_max_time;
2165 ++ }
2166 ++ }
2167 ++ if (old_wr_coeff != bfqq->wr_coeff)
2168 ++ entity->ioprio_changed = 1;
2169 ++add_bfqq_busy:
2170 ++ bfqq->last_idle_bklogged = jiffies;
2171 ++ bfqq->service_from_backlogged = 0;
2172 ++ bfq_clear_bfqq_softrt_update(bfqq);
2173 ++ bfq_add_bfqq_busy(bfqd, bfqq);
2174 ++ } else {
2175 ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
2176 ++ time_is_before_jiffies(
2177 ++ bfqq->last_wr_start_finish +
2178 ++ bfqd->bfq_wr_min_inter_arr_async)) {
2179 ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
2180 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2181 ++
2182 ++ bfqd->wr_busy_queues++;
2183 ++ entity->ioprio_changed = 1;
2184 ++ bfq_log_bfqq(bfqd, bfqq,
2185 ++ "non-idle wrais starting at %lu, rais_max_time %u",
2186 ++ jiffies,
2187 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
2188 ++ }
2189 ++ if (prev != bfqq->next_rq)
2190 ++ bfq_updated_next_req(bfqd, bfqq);
2191 ++ }
2192 ++
2193 ++ if (bfqd->low_latency &&
2194 ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
2195 ++ bfqq->last_wr_start_finish = jiffies;
2196 ++}
2197 ++
2198 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
2199 ++ struct bio *bio)
2200 ++{
2201 ++ struct task_struct *tsk = current;
2202 ++ struct bfq_io_cq *bic;
2203 ++ struct bfq_queue *bfqq;
2204 ++
2205 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
2206 ++ if (bic == NULL)
2207 ++ return NULL;
2208 ++
2209 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2210 ++ if (bfqq != NULL)
2211 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
2212 ++
2213 ++ return NULL;
2214 ++}
2215 ++
2216 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
2217 ++{
2218 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2219 ++
2220 ++ bfqd->rq_in_driver++;
2221 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2222 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
2223 ++ (long long unsigned)bfqd->last_position);
2224 ++}
2225 ++
2226 ++static inline void bfq_deactivate_request(struct request_queue *q,
2227 ++ struct request *rq)
2228 ++{
2229 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2230 ++
2231 ++ BUG_ON(bfqd->rq_in_driver == 0);
2232 ++ bfqd->rq_in_driver--;
2233 ++}
2234 ++
2235 ++static void bfq_remove_request(struct request *rq)
2236 ++{
2237 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2238 ++ struct bfq_data *bfqd = bfqq->bfqd;
2239 ++ const int sync = rq_is_sync(rq);
2240 ++
2241 ++ if (bfqq->next_rq == rq) {
2242 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
2243 ++ bfq_updated_next_req(bfqd, bfqq);
2244 ++ }
2245 ++
2246 ++ if (rq->queuelist.prev != &rq->queuelist)
2247 ++ list_del_init(&rq->queuelist);
2248 ++ BUG_ON(bfqq->queued[sync] == 0);
2249 ++ bfqq->queued[sync]--;
2250 ++ bfqd->queued--;
2251 ++ elv_rb_del(&bfqq->sort_list, rq);
2252 ++
2253 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2254 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
2255 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2256 ++ /*
2257 ++ * Remove queue from request-position tree as it is empty.
2258 ++ */
2259 ++ if (bfqq->pos_root != NULL) {
2260 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
2261 ++ bfqq->pos_root = NULL;
2262 ++ }
2263 ++ }
2264 ++
2265 ++ if (rq->cmd_flags & REQ_META) {
2266 ++ BUG_ON(bfqq->meta_pending == 0);
2267 ++ bfqq->meta_pending--;
2268 ++ }
2269 ++}
2270 ++
2271 ++static int bfq_merge(struct request_queue *q, struct request **req,
2272 ++ struct bio *bio)
2273 ++{
2274 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2275 ++ struct request *__rq;
2276 ++
2277 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
2278 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
2279 ++ *req = __rq;
2280 ++ return ELEVATOR_FRONT_MERGE;
2281 ++ }
2282 ++
2283 ++ return ELEVATOR_NO_MERGE;
2284 ++}
2285 ++
2286 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
2287 ++ int type)
2288 ++{
2289 ++ if (type == ELEVATOR_FRONT_MERGE &&
2290 ++ rb_prev(&req->rb_node) &&
2291 ++ blk_rq_pos(req) <
2292 ++ blk_rq_pos(container_of(rb_prev(&req->rb_node),
2293 ++ struct request, rb_node))) {
2294 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
2295 ++ struct bfq_data *bfqd = bfqq->bfqd;
2296 ++ struct request *prev, *next_rq;
2297 ++
2298 ++ /* Reposition request in its sort_list */
2299 ++ elv_rb_del(&bfqq->sort_list, req);
2300 ++ elv_rb_add(&bfqq->sort_list, req);
2301 ++ /* Choose next request to be served for bfqq */
2302 ++ prev = bfqq->next_rq;
2303 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
2304 ++ bfqd->last_position);
2305 ++ BUG_ON(next_rq == NULL);
2306 ++ bfqq->next_rq = next_rq;
2307 ++ /*
2308 ++ * If next_rq changes, update both the queue's budget to
2309 ++ * fit the new request and the queue's position in its
2310 ++ * rq_pos_tree.
2311 ++ */
2312 ++ if (prev != bfqq->next_rq) {
2313 ++ bfq_updated_next_req(bfqd, bfqq);
2314 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2315 ++ }
2316 ++ }
2317 ++}
2318 ++
2319 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
2320 ++ struct request *next)
2321 ++{
2322 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
2323 ++
2324 ++ /*
2325 ++ * If next and rq belong to the same bfq_queue and next is older
2326 ++ * than rq, then reposition rq in the fifo (by substituting next
2327 ++ * with rq). Otherwise, if next and rq belong to different
2328 ++ * bfq_queues, never reposition rq: in fact, we would have to
2329 ++ * reposition it with respect to next's position in its own fifo,
2330 ++ * which would most certainly be too expensive with respect to
2331 ++ * the benefits.
2332 ++ */
2333 ++ if (bfqq == next_bfqq &&
2334 ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2335 ++ time_before(next->fifo_time, rq->fifo_time)) {
2336 ++ list_del_init(&rq->queuelist);
2337 ++ list_replace_init(&next->queuelist, &rq->queuelist);
2338 ++ rq->fifo_time = next->fifo_time;
2339 ++ }
2340 ++
2341 ++ if (bfqq->next_rq == next)
2342 ++ bfqq->next_rq = rq;
2343 ++
2344 ++ bfq_remove_request(next);
2345 ++}
2346 ++
2347 ++/* Must be called with bfqq != NULL */
2348 ++static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
2349 ++{
2350 ++ BUG_ON(bfqq == NULL);
2351 ++ if (bfq_bfqq_busy(bfqq))
2352 ++ bfqq->bfqd->wr_busy_queues--;
2353 ++ bfqq->wr_coeff = 1;
2354 ++ bfqq->wr_cur_max_time = 0;
2355 ++ /* Trigger a weight change on the next activation of the queue */
2356 ++ bfqq->entity.ioprio_changed = 1;
2357 ++}
2358 ++
2359 ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
2360 ++ struct bfq_group *bfqg)
2361 ++{
2362 ++ int i, j;
2363 ++
2364 ++ for (i = 0; i < 2; i++)
2365 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
2366 ++ if (bfqg->async_bfqq[i][j] != NULL)
2367 ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
2368 ++ if (bfqg->async_idle_bfqq != NULL)
2369 ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
2370 ++}
2371 ++
2372 ++static void bfq_end_wr(struct bfq_data *bfqd)
2373 ++{
2374 ++ struct bfq_queue *bfqq;
2375 ++
2376 ++ spin_lock_irq(bfqd->queue->queue_lock);
2377 ++
2378 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
2379 ++ bfq_bfqq_end_wr(bfqq);
2380 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2381 ++ bfq_bfqq_end_wr(bfqq);
2382 ++ bfq_end_wr_async(bfqd);
2383 ++
2384 ++ spin_unlock_irq(bfqd->queue->queue_lock);
2385 ++}
2386 ++
2387 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
2388 ++ struct bio *bio)
2389 ++{
2390 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2391 ++ struct bfq_io_cq *bic;
2392 ++ struct bfq_queue *bfqq;
2393 ++
2394 ++ /*
2395 ++ * Disallow merge of a sync bio into an async request.
2396 ++ */
2397 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
2398 ++ return 0;
2399 ++
2400 ++ /*
2401 ++ * Lookup the bfqq that this bio will be queued with. Allow
2402 ++ * merge only if rq is queued there.
2403 ++ * Queue lock is held here.
2404 ++ */
2405 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
2406 ++ if (bic == NULL)
2407 ++ return 0;
2408 ++
2409 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2410 ++ return bfqq == RQ_BFQQ(rq);
2411 ++}
2412 ++
2413 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2414 ++ struct bfq_queue *bfqq)
2415 ++{
2416 ++ if (bfqq != NULL) {
2417 ++ bfq_mark_bfqq_must_alloc(bfqq);
2418 ++ bfq_mark_bfqq_budget_new(bfqq);
2419 ++ bfq_clear_bfqq_fifo_expire(bfqq);
2420 ++
2421 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
2422 ++
2423 ++ bfq_log_bfqq(bfqd, bfqq,
2424 ++ "set_in_service_queue, cur-budget = %lu",
2425 ++ bfqq->entity.budget);
2426 ++ }
2427 ++
2428 ++ bfqd->in_service_queue = bfqq;
2429 ++}
2430 ++
2431 ++/*
2432 ++ * Get and set a new queue for service.
2433 ++ */
2434 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
2435 ++ struct bfq_queue *bfqq)
2436 ++{
2437 ++ if (!bfqq)
2438 ++ bfqq = bfq_get_next_queue(bfqd);
2439 ++ else
2440 ++ bfq_get_next_queue_forced(bfqd, bfqq);
2441 ++
2442 ++ __bfq_set_in_service_queue(bfqd, bfqq);
2443 ++ return bfqq;
2444 ++}
2445 ++
2446 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2447 ++ struct request *rq)
2448 ++{
2449 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
2450 ++ return blk_rq_pos(rq) - bfqd->last_position;
2451 ++ else
2452 ++ return bfqd->last_position - blk_rq_pos(rq);
2453 ++}
2454 ++
2455 ++/*
2456 ++ * Return true if bfqq has no request pending and rq is close enough to
2457 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
2458 ++ * bfqq->next_rq
2459 ++ */
2460 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2461 ++{
2462 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2463 ++}
2464 ++
2465 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2466 ++{
2467 ++ struct rb_root *root = &bfqd->rq_pos_tree;
2468 ++ struct rb_node *parent, *node;
2469 ++ struct bfq_queue *__bfqq;
2470 ++ sector_t sector = bfqd->last_position;
2471 ++
2472 ++ if (RB_EMPTY_ROOT(root))
2473 ++ return NULL;
2474 ++
2475 ++ /*
2476 ++ * First, if we find a request starting at the end of the last
2477 ++ * request, choose it.
2478 ++ */
2479 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2480 ++ if (__bfqq != NULL)
2481 ++ return __bfqq;
2482 ++
2483 ++ /*
2484 ++ * If the exact sector wasn't found, the parent of the NULL leaf
2485 ++ * will contain the closest sector (rq_pos_tree sorted by
2486 ++ * next_request position).
2487 ++ */
2488 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2489 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2490 ++ return __bfqq;
2491 ++
2492 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
2493 ++ node = rb_next(&__bfqq->pos_node);
2494 ++ else
2495 ++ node = rb_prev(&__bfqq->pos_node);
2496 ++ if (node == NULL)
2497 ++ return NULL;
2498 ++
2499 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2500 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2501 ++ return __bfqq;
2502 ++
2503 ++ return NULL;
2504 ++}
2505 ++
2506 ++/*
2507 ++ * bfqd - obvious
2508 ++ * cur_bfqq - passed in so that we don't decide that the current queue
2509 ++ * is closely cooperating with itself.
2510 ++ *
2511 ++ * We are assuming that cur_bfqq has dispatched at least one request,
2512 ++ * and that bfqd->last_position reflects a position on the disk associated
2513 ++ * with the I/O issued by cur_bfqq.
2514 ++ */
2515 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2516 ++ struct bfq_queue *cur_bfqq)
2517 ++{
2518 ++ struct bfq_queue *bfqq;
2519 ++
2520 ++ if (bfq_class_idle(cur_bfqq))
2521 ++ return NULL;
2522 ++ if (!bfq_bfqq_sync(cur_bfqq))
2523 ++ return NULL;
2524 ++ if (BFQQ_SEEKY(cur_bfqq))
2525 ++ return NULL;
2526 ++
2527 ++ /* If device has only one backlogged bfq_queue, don't search. */
2528 ++ if (bfqd->busy_queues == 1)
2529 ++ return NULL;
2530 ++
2531 ++ /*
2532 ++ * We should notice if some of the queues are cooperating, e.g.
2533 ++ * working closely on the same area of the disk. In that case,
2534 ++ * we can group them together and don't waste time idling.
2535 ++ */
2536 ++ bfqq = bfqq_close(bfqd);
2537 ++ if (bfqq == NULL || bfqq == cur_bfqq)
2538 ++ return NULL;
2539 ++
2540 ++ /*
2541 ++ * Do not merge queues from different bfq_groups.
2542 ++ */
2543 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2544 ++ return NULL;
2545 ++
2546 ++ /*
2547 ++ * It only makes sense to merge sync queues.
2548 ++ */
2549 ++ if (!bfq_bfqq_sync(bfqq))
2550 ++ return NULL;
2551 ++ if (BFQQ_SEEKY(bfqq))
2552 ++ return NULL;
2553 ++
2554 ++ /*
2555 ++ * Do not merge queues of different priority classes.
2556 ++ */
2557 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2558 ++ return NULL;
2559 ++
2560 ++ return bfqq;
2561 ++}
2562 ++
2563 ++/*
2564 ++ * If enough samples have been computed, return the current max budget
2565 ++ * stored in bfqd, which is dynamically updated according to the
2566 ++ * estimated disk peak rate; otherwise return the default max budget
2567 ++ */
2568 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2569 ++{
2570 ++ if (bfqd->budgets_assigned < 194)
2571 ++ return bfq_default_max_budget;
2572 ++ else
2573 ++ return bfqd->bfq_max_budget;
2574 ++}
2575 ++
2576 ++/*
2577 ++ * Return min budget, which is a fraction of the current or default
2578 ++ * max budget (trying with 1/32)
2579 ++ */
2580 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2581 ++{
2582 ++ if (bfqd->budgets_assigned < 194)
2583 ++ return bfq_default_max_budget / 32;
2584 ++ else
2585 ++ return bfqd->bfq_max_budget / 32;
2586 ++}
2587 ++
2588 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2589 ++{
2590 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
2591 ++ struct bfq_io_cq *bic;
2592 ++ unsigned long sl;
2593 ++
2594 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2595 ++
2596 ++ /* Processes have exited, don't wait. */
2597 ++ bic = bfqd->in_service_bic;
2598 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2599 ++ return;
2600 ++
2601 ++ bfq_mark_bfqq_wait_request(bfqq);
2602 ++
2603 ++ /*
2604 ++ * We don't want to idle for seeks, but we do want to allow
2605 ++ * fair distribution of slice time for a process doing back-to-back
2606 ++	 * seeks. So allow a little bit of time for it to submit a new rq.
2607 ++ *
2608 ++ * To prevent processes with (partly) seeky workloads from
2609 ++ * being too ill-treated, grant them a small fraction of the
2610 ++ * assigned budget before reducing the waiting time to
2611 ++ * BFQ_MIN_TT. This happened to help reduce latency.
2612 ++ */
2613 ++ sl = bfqd->bfq_slice_idle;
2614 ++ /*
2615 ++ * Unless the queue is being weight-raised or the scenario is
2616 ++ * asymmetric, grant only minimum idle time if the queue either
2617 ++ * has been seeky for long enough or has already proved to be
2618 ++ * constantly seeky.
2619 ++ */
2620 ++ if (bfq_sample_valid(bfqq->seek_samples) &&
2621 ++ ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
2622 ++ bfq_max_budget(bfqq->bfqd) / 8) ||
2623 ++ bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
2624 ++ symmetric_scenario)
2625 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2626 ++ else if (bfqq->wr_coeff > 1)
2627 ++ sl = sl * 3;
2628 ++ bfqd->last_idling_start = ktime_get();
2629 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2630 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
2631 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2632 ++}
2633 ++
2634 ++/*
2635 ++ * Set the maximum time for the in-service queue to consume its
2636 ++ * budget. This prevents seeky processes from lowering the disk
2637 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
2638 ++ */
2639 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2640 ++{
2641 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
2642 ++ unsigned int timeout_coeff;
2643 ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
2644 ++ timeout_coeff = 1;
2645 ++ else
2646 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2647 ++
2648 ++ bfqd->last_budget_start = ktime_get();
2649 ++
2650 ++ bfq_clear_bfqq_budget_new(bfqq);
2651 ++ bfqq->budget_timeout = jiffies +
2652 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2653 ++
2654 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2655 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2656 ++ timeout_coeff));
2657 ++}
2658 ++
2659 ++/*
2660 ++ * Move request from internal lists to the request queue dispatch list.
2661 ++ */
2662 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2663 ++{
2664 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2665 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2666 ++
2667 ++ /*
2668 ++ * For consistency, the next instruction should have been executed
2669 ++ * after removing the request from the queue and dispatching it.
2670 ++	 * We instead execute this instruction before bfq_remove_request()
2671 ++	 * (and hence introduce a temporary inconsistency), for efficiency.
2672 ++	 * In fact, in a forced_dispatch, this prevents two counters related
2673 ++	 * to bfqq->dispatched from being uselessly decremented if bfqq
2674 ++	 * is not in service, and then incremented again after
2675 ++	 * incrementing bfqq->dispatched.
2676 ++ */
2677 ++ bfqq->dispatched++;
2678 ++ bfq_remove_request(rq);
2679 ++ elv_dispatch_sort(q, rq);
2680 ++
2681 ++ if (bfq_bfqq_sync(bfqq))
2682 ++ bfqd->sync_flight++;
2683 ++}
2684 ++
2685 ++/*
2686 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
2687 ++ */
2688 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2689 ++{
2690 ++ struct request *rq = NULL;
2691 ++
2692 ++ if (bfq_bfqq_fifo_expire(bfqq))
2693 ++ return NULL;
2694 ++
2695 ++ bfq_mark_bfqq_fifo_expire(bfqq);
2696 ++
2697 ++ if (list_empty(&bfqq->fifo))
2698 ++ return NULL;
2699 ++
2700 ++ rq = rq_entry_fifo(bfqq->fifo.next);
2701 ++
2702 ++ if (time_before(jiffies, rq->fifo_time))
2703 ++ return NULL;
2704 ++
2705 ++ return rq;
2706 ++}
2707 ++
2708 ++/* Must be called with the queue_lock held. */
2709 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
2710 ++{
2711 ++ int process_refs, io_refs;
2712 ++
2713 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2714 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2715 ++ BUG_ON(process_refs < 0);
2716 ++ return process_refs;
2717 ++}
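/*
 * Illustrative note, not part of the BFQ patch: with hypothetical
 * numbers, a queue whose total reference count is 5, with 2 + 1
 * references held by allocated read/write requests and 1 implied by the
 * entity being on a service tree (on_st), has
 * process_refs = 5 - 3 - 1 = 1, i.e. exactly one process still owns it.
 */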
2718 ++
2719 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2720 ++{
2721 ++ int process_refs, new_process_refs;
2722 ++ struct bfq_queue *__bfqq;
2723 ++
2724 ++ /*
2725 ++ * If there are no process references on the new_bfqq, then it is
2726 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2727 ++ * may have dropped their last reference (not just their last process
2728 ++ * reference).
2729 ++ */
2730 ++ if (!bfqq_process_refs(new_bfqq))
2731 ++ return;
2732 ++
2733 ++ /* Avoid a circular list and skip interim queue merges. */
2734 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
2735 ++ if (__bfqq == bfqq)
2736 ++ return;
2737 ++ new_bfqq = __bfqq;
2738 ++ }
2739 ++
2740 ++ process_refs = bfqq_process_refs(bfqq);
2741 ++ new_process_refs = bfqq_process_refs(new_bfqq);
2742 ++ /*
2743 ++ * If the process for the bfqq has gone away, there is no
2744 ++ * sense in merging the queues.
2745 ++ */
2746 ++ if (process_refs == 0 || new_process_refs == 0)
2747 ++ return;
2748 ++
2749 ++ /*
2750 ++ * Merge in the direction of the lesser amount of work.
2751 ++ */
2752 ++ if (new_process_refs >= process_refs) {
2753 ++ bfqq->new_bfqq = new_bfqq;
2754 ++ atomic_add(process_refs, &new_bfqq->ref);
2755 ++ } else {
2756 ++ new_bfqq->new_bfqq = bfqq;
2757 ++ atomic_add(new_process_refs, &bfqq->ref);
2758 ++ }
2759 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2760 ++ new_bfqq->pid);
2761 ++}
2762 ++
2763 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2764 ++{
2765 ++ struct bfq_entity *entity = &bfqq->entity;
2766 ++ return entity->budget - entity->service;
2767 ++}
2768 ++
2769 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2770 ++{
2771 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2772 ++
2773 ++ __bfq_bfqd_reset_in_service(bfqd);
2774 ++
2775 ++ /*
2776 ++ * If this bfqq is shared between multiple processes, check
2777 ++ * to make sure that those processes are still issuing I/Os
2778 ++ * within the mean seek distance. If not, it may be time to
2779 ++ * break the queues apart again.
2780 ++ */
2781 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2782 ++ bfq_mark_bfqq_split_coop(bfqq);
2783 ++
2784 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2785 ++ /*
2786 ++ * Overloading budget_timeout field to store the time
2787 ++ * at which the queue remains with no backlog; used by
2788 ++ * the weight-raising mechanism.
2789 ++ */
2790 ++ bfqq->budget_timeout = jiffies;
2791 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2792 ++ } else {
2793 ++ bfq_activate_bfqq(bfqd, bfqq);
2794 ++ /*
2795 ++ * Resort priority tree of potential close cooperators.
2796 ++ */
2797 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2798 ++ }
2799 ++}
2800 ++
2801 ++/**
2802 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2803 ++ * @bfqd: device data.
2804 ++ * @bfqq: queue to update.
2805 ++ * @reason: reason for expiration.
2806 ++ *
2807 ++ * Handle the feedback on @bfqq budget. See the body for detailed
2808 ++ * comments.
2809 ++ */
2810 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2811 ++ struct bfq_queue *bfqq,
2812 ++ enum bfqq_expiration reason)
2813 ++{
2814 ++ struct request *next_rq;
2815 ++ unsigned long budget, min_budget;
2816 ++
2817 ++ budget = bfqq->max_budget;
2818 ++ min_budget = bfq_min_budget(bfqd);
2819 ++
2820 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2821 ++
2822 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2823 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2824 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2825 ++ budget, bfq_min_budget(bfqd));
2826 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2827 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2828 ++
2829 ++ if (bfq_bfqq_sync(bfqq)) {
2830 ++ switch (reason) {
2831 ++ /*
2832 ++ * Caveat: in all the following cases we trade latency
2833 ++ * for throughput.
2834 ++ */
2835 ++ case BFQ_BFQQ_TOO_IDLE:
2836 ++ /*
2837 ++ * This is the only case where we may reduce
2838 ++ * the budget: if there is no request of the
2839 ++ * process still waiting for completion, then
2840 ++ * we assume (tentatively) that the timer has
2841 ++ * expired because the batch of requests of
2842 ++ * the process could have been served with a
2843 ++			 * smaller budget. Hence, betting that the
2844 ++ * process will behave in the same way when it
2845 ++ * becomes backlogged again, we reduce its
2846 ++ * next budget. As long as we guess right,
2847 ++ * this budget cut reduces the latency
2848 ++ * experienced by the process.
2849 ++ *
2850 ++ * However, if there are still outstanding
2851 ++ * requests, then the process may have not yet
2852 ++ * issued its next request just because it is
2853 ++ * still waiting for the completion of some of
2854 ++ * the still outstanding ones. So in this
2855 ++ * subcase we do not reduce its budget, on the
2856 ++ * contrary we increase it to possibly boost
2857 ++ * the throughput, as discussed in the
2858 ++ * comments to the BUDGET_TIMEOUT case.
2859 ++ */
2860 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
2861 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2862 ++ else {
2863 ++ if (budget > 5 * min_budget)
2864 ++ budget -= 4 * min_budget;
2865 ++ else
2866 ++ budget = min_budget;
2867 ++ }
2868 ++ break;
2869 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
2870 ++ /*
2871 ++ * We double the budget here because: 1) it
2872 ++ * gives the chance to boost the throughput if
2873 ++ * this is not a seeky process (which may have
2874 ++ * bumped into this timeout because of, e.g.,
2875 ++ * ZBR), 2) together with charge_full_budget
2876 ++ * it helps give seeky processes higher
2877 ++ * timestamps, and hence be served less
2878 ++ * frequently.
2879 ++ */
2880 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2881 ++ break;
2882 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2883 ++ /*
2884 ++ * The process still has backlog, and did not
2885 ++ * let either the budget timeout or the disk
2886 ++ * idling timeout expire. Hence it is not
2887 ++ * seeky, has a short thinktime and may be
2888 ++ * happy with a higher budget too. So
2889 ++ * definitely increase the budget of this good
2890 ++ * candidate to boost the disk throughput.
2891 ++ */
2892 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
2893 ++ break;
2894 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
2895 ++ /*
2896 ++ * Leave the budget unchanged.
2897 ++ */
2898 ++ default:
2899 ++ return;
2900 ++ }
2901 ++ } else /* async queue */
2902 ++		/* async queues always get the maximum possible budget
2903 ++ * (their ability to dispatch is limited by
2904 ++ * @bfqd->bfq_max_budget_async_rq).
2905 ++ */
2906 ++ budget = bfqd->bfq_max_budget;
2907 ++
2908 ++ bfqq->max_budget = budget;
2909 ++
2910 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2911 ++ bfqq->max_budget > bfqd->bfq_max_budget)
2912 ++ bfqq->max_budget = bfqd->bfq_max_budget;
2913 ++
2914 ++ /*
2915 ++ * Make sure that we have enough budget for the next request.
2916 ++ * Since the finish time of the bfqq must be kept in sync with
2917 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
2918 ++ * update.
2919 ++ */
2920 ++ next_rq = bfqq->next_rq;
2921 ++ if (next_rq != NULL)
2922 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2923 ++ bfq_serv_to_charge(next_rq, bfqq));
2924 ++ else
2925 ++ bfqq->entity.budget = bfqq->max_budget;
2926 ++
2927 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2928 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2929 ++ bfqq->entity.budget);
2930 ++}
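/*
 * Illustrative note, not part of the BFQ patch: the budget feedback
 * above applied to plain numbers. Hypothetical values:
 * bfq_max_budget = 16384, min_budget = 16384 / 32 = 512, and a queue
 * whose current max_budget is 4096.
 *
 *   TOO_IDLE, no request outstanding:  4096 - 4 * 512        = 2048
 *   TOO_IDLE, requests outstanding:    min(4096 * 2, 16384)  = 8192
 *   BUDGET_TIMEOUT:                    min(4096 * 2, 16384)  = 8192
 *   BUDGET_EXHAUSTED:                  min(4096 * 4, 16384)  = 16384
 *   NO_MORE_REQUESTS:                  4096 (unchanged)
 */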
2931 ++
2932 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2933 ++{
2934 ++ unsigned long max_budget;
2935 ++
2936 ++ /*
2937 ++ * The max_budget calculated when autotuning is equal to the
2938 ++	 * number of sectors transferred in timeout_sync at the
2939 ++ * estimated peak rate.
2940 ++ */
2941 ++ max_budget = (unsigned long)(peak_rate * 1000 *
2942 ++ timeout >> BFQ_RATE_SHIFT);
2943 ++
2944 ++ return max_budget;
2945 ++}
2946 ++
2947 ++/*
2948 ++ * In addition to updating the peak rate, checks whether the process
2949 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
2950 ++ * to the budget timeout, to reduce the amount of service provided to
2951 ++ * seeky processes, and hence reduce their chances of lowering the
2952 ++ * throughput. See the code for more details.
2953 ++ */
2954 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2955 ++ int compensate, enum bfqq_expiration reason)
2956 ++{
2957 ++ u64 bw, usecs, expected, timeout;
2958 ++ ktime_t delta;
2959 ++ int update = 0;
2960 ++
2961 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2962 ++ return 0;
2963 ++
2964 ++ if (compensate)
2965 ++ delta = bfqd->last_idling_start;
2966 ++ else
2967 ++ delta = ktime_get();
2968 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
2969 ++ usecs = ktime_to_us(delta);
2970 ++
2971 ++ /* Don't trust short/unrealistic values. */
2972 ++ if (usecs < 100 || usecs >= LONG_MAX)
2973 ++ return 0;
2974 ++
2975 ++ /*
2976 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
2977 ++ * value to store the peak rate, in sectors per usec in fixed
2978 ++ * point math. We do so to have enough precision in the estimate
2979 ++ * and to avoid overflows.
2980 ++ */
2981 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2982 ++ do_div(bw, (unsigned long)usecs);
2983 ++
2984 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2985 ++
2986 ++ /*
2987 ++ * Use only long (> 20ms) intervals to filter out spikes for
2988 ++ * the peak rate estimation.
2989 ++ */
2990 ++ if (usecs > 20000) {
2991 ++ if (bw > bfqd->peak_rate ||
2992 ++ (!BFQQ_SEEKY(bfqq) &&
2993 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2994 ++ bfq_log(bfqd, "measured bw =%llu", bw);
2995 ++ /*
2996 ++ * To smooth oscillations use a low-pass filter with
2997 ++ * alpha=7/8, i.e.,
2998 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
2999 ++ */
3000 ++ do_div(bw, 8);
3001 ++ if (bw == 0)
3002 ++ return 0;
3003 ++ bfqd->peak_rate *= 7;
3004 ++ do_div(bfqd->peak_rate, 8);
3005 ++ bfqd->peak_rate += bw;
3006 ++ update = 1;
3007 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
3008 ++ }
3009 ++
3010 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
3011 ++
3012 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
3013 ++ bfqd->peak_rate_samples++;
3014 ++
3015 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
3016 ++ update) {
3017 ++ int dev_type = blk_queue_nonrot(bfqd->queue);
3018 ++ if (bfqd->bfq_user_max_budget == 0) {
3019 ++ bfqd->bfq_max_budget =
3020 ++ bfq_calc_max_budget(bfqd->peak_rate,
3021 ++ timeout);
3022 ++ bfq_log(bfqd, "new max_budget=%lu",
3023 ++ bfqd->bfq_max_budget);
3024 ++ }
3025 ++ if (bfqd->device_speed == BFQ_BFQD_FAST &&
3026 ++ bfqd->peak_rate < device_speed_thresh[dev_type]) {
3027 ++ bfqd->device_speed = BFQ_BFQD_SLOW;
3028 ++ bfqd->RT_prod = R_slow[dev_type] *
3029 ++ T_slow[dev_type];
3030 ++ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
3031 ++ bfqd->peak_rate > device_speed_thresh[dev_type]) {
3032 ++ bfqd->device_speed = BFQ_BFQD_FAST;
3033 ++ bfqd->RT_prod = R_fast[dev_type] *
3034 ++ T_fast[dev_type];
3035 ++ }
3036 ++ }
3037 ++ }
3038 ++
3039 ++ /*
3040 ++	 * If the process has been served for too short a time
3041 ++	 * interval to let its possible sequential accesses prevail over
3042 ++	 * the initial seek time needed to move the disk head to the
3043 ++	 * first sector it requested, then give the process a chance
3044 ++ * and for the moment return false.
3045 ++ */
3046 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
3047 ++ return 0;
3048 ++
3049 ++ /*
3050 ++ * A process is considered ``slow'' (i.e., seeky, so that we
3051 ++ * cannot treat it fairly in the service domain, as it would
3052 ++	 * slow down the other processes too much) if, when a slice
3053 ++ * ends for whatever reason, it has received service at a
3054 ++ * rate that would not be high enough to complete the budget
3055 ++ * before the budget timeout expiration.
3056 ++ */
3057 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
3058 ++
3059 ++ /*
3060 ++ * Caveat: processes doing IO in the slower disk zones will
3061 ++ * tend to be slow(er) even if not seeky. And the estimated
3062 ++ * peak rate will actually be an average over the disk
3063 ++ * surface. Hence, to not be too harsh with unlucky processes,
3064 ++ * we keep a budget/3 margin of safety before declaring a
3065 ++ * process slow.
3066 ++ */
3067 ++ return expected > (4 * bfqq->entity.budget) / 3;
3068 ++}
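/*
 * Illustrative sketch, not part of the BFQ patch: the low-pass filter
 * used above for the peak-rate estimate, on plain integers and with the
 * same integer divisions as in the patch:
 * new_rate = (7/8) * old_rate + (1/8) * sample. Values are hypothetical.
 */
#include <stdio.h>

static unsigned long long filter_peak_rate(unsigned long long old_rate,
					   unsigned long long sample)
{
	sample /= 8;			/* (1/8) * sample */
	if (sample == 0)
		return old_rate;	/* discard negligible samples */
	return old_rate * 7 / 8 + sample;
}

int main(void)
{
	/* old estimate 800, new measurement 1600: 700 + 200 = 900 */
	printf("%llu\n", filter_peak_rate(800, 1600));
	return 0;
}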
3069 ++
3070 ++/*
3071 ++ * To be deemed as soft real-time, an application must meet two
3072 ++ * requirements. First, the application must not require an average
3073 ++ * bandwidth higher than the approximate bandwidth required to play back or
3074 ++ * record a compressed high-definition video.
3075 ++ * The next function is invoked on the completion of the last request of a
3076 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
3077 ++ * that, if the next request of the application does not arrive before
3078 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
3079 ++ *
3080 ++ * The second requirement is that the request pattern of the application is
3081 ++ * isochronous, i.e., that, after issuing a request or a batch of requests,
3082 ++ * the application stops issuing new requests until all its pending requests
3083 ++ * have been completed. After that, the application may issue a new batch,
3084 ++ * and so on.
3085 ++ * For this reason the next function is invoked to compute
3086 ++ * soft_rt_next_start only for applications that meet this requirement,
3087 ++ * whereas soft_rt_next_start is set to infinity for applications that do
3088 ++ * not.
3089 ++ *
3090 ++ * Unfortunately, even a greedy application may happen to behave in an
3091 ++ * isochronous way if the CPU load is high. In fact, the application may
3092 ++ * stop issuing requests while the CPUs are busy serving other processes,
3093 ++ * then restart, then stop again for a while, and so on. In addition, if
3094 ++ * the disk achieves a low enough throughput with the request pattern
3095 ++ * issued by the application (e.g., because the request pattern is random
3096 ++ * and/or the device is slow), then the application may meet the above
3097 ++ * bandwidth requirement too. To prevent such a greedy application from
3098 ++ * being deemed soft real-time, a further rule is used in the computation of
3099 ++ * soft_rt_next_start: soft_rt_next_start must be higher than the current
3100 ++ * time plus the maximum time the scheduler waits for the arrival of a new
3101 ++ * request when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
3102 ++ * This filters out greedy applications, as the latter issue instead their
3103 ++ * next request as soon as possible after the last one has been completed
3104 ++ * (in contrast, when a batch of requests is completed, a soft real-time
3105 ++ * application spends some time processing data).
3106 ++ *
3107 ++ * Unfortunately, the last filter may easily generate false positives if
3108 ++ * only bfqd->bfq_slice_idle is used as a reference time interval and one
3109 ++ * or both the following cases occur:
3110 ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
3111 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
3112 ++ * HZ=100.
3113 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
3114 ++ * for a while, then suddenly 'jump' by several units to recover the lost
3115 ++ * increments. This seems to happen, e.g., inside virtual machines.
3116 ++ * To address this issue, we do not use as a reference time interval just
3117 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
3118 ++ * particular we add the minimum number of jiffies for which the filter
3119 ++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
3120 ++ * machines.
3121 ++ */
3122 ++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
3123 ++ struct bfq_queue *bfqq)
3124 ++{
3125 ++ return max(bfqq->last_idle_bklogged +
3126 ++ HZ * bfqq->service_from_backlogged /
3127 ++ bfqd->bfq_wr_max_softrt_rate,
3128 ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
3129 ++}
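
As a concrete illustration of the formula in bfq_bfqq_softrt_next_start() above, the sketch below plugs in made-up numbers: HZ, the soft real-time rate cap, the idle slice and the jiffy values are all assumptions chosen only to show how the bandwidth bound and the greedy-application bound combine through max().

#include <stdio.h>

#define HZ 250UL	/* assumed tick rate for this example */

/*
 * Sketch of the soft_rt_next_start computation: the next batch must not
 * start before the instant at which the bandwidth consumed since the queue
 * last became idle-and-backlogged drops to the soft real-time threshold,
 * and never before "now + idle slice + a few ticks".
 */
static unsigned long softrt_next_start(unsigned long jiffies_now,
				       unsigned long last_idle_bklogged,
				       unsigned long service_sectors,
				       unsigned long max_softrt_rate, /* sectors/sec */
				       unsigned long slice_idle)      /* jiffies */
{
	unsigned long bw_bound = last_idle_bklogged +
		HZ * service_sectors / max_softrt_rate;
	unsigned long greedy_bound = jiffies_now + slice_idle + 4;

	return bw_bound > greedy_bound ? bw_bound : greedy_bound;
}

int main(void)
{
	/* Hypothetical numbers: 2000 sectors served since jiffy 10000,
	 * 7000 sectors/sec cap, 2-jiffy idle slice, now = jiffy 10020. */
	unsigned long next = softrt_next_start(10020, 10000, 2000, 7000, 2);

	/* 10000 + 250*2000/7000 = 10071 > 10020 + 2 + 4 = 10026 */
	printf("soft_rt_next_start = %lu\n", next);
	return 0;
}

With these invented numbers the bandwidth bound (jiffy 10071) dominates, so the application keeps its soft real-time status only if it stays quiet until then.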
3130 ++
3131 ++/*
3132 ++ * Return the largest-possible time instant such that, for as long as possible,
3133 ++ * the current time will be lower than this time instant according to the macro
3134 ++ * time_is_before_jiffies().
3135 ++ */
3136 ++static inline unsigned long bfq_infinity_from_now(unsigned long now)
3137 ++{
3138 ++ return now + ULONG_MAX / 2;
3139 ++}
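
bfq_infinity_from_now() relies on the kernel's wraparound-safe jiffies comparisons, which subtract the two values and look at the sign of the result; now + ULONG_MAX / 2 is therefore the farthest instant that still compares as being in the future. The user-space sketch below uses a simplified copy of the time_after() idea to show why that value behaves as infinity; like the kernel, it assumes the usual two's-complement conversion.

#include <stdio.h>
#include <limits.h>

/* Simplified form of the kernel's wraparound-safe comparison from
 * <linux/jiffies.h>: true if a is later than b, even across wraparound. */
#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long now = 123456UL;
	unsigned long inf = now + ULONG_MAX / 2;	/* bfq_infinity_from_now(now) */

	/* 'inf' keeps comparing as "still in the future" for almost
	 * ULONG_MAX / 2 ticks, which is why it works as infinity here. */
	printf("%d\n", time_after(now, inf));				/* 0 */
	printf("%d\n", time_after(now + ULONG_MAX / 4, inf));		/* 0 */
	printf("%d\n", time_after(now + ULONG_MAX / 2 + 1, inf));	/* 1 */
	return 0;
}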
3140 ++
3141 ++/**
3142 ++ * bfq_bfqq_expire - expire a queue.
3143 ++ * @bfqd: device owning the queue.
3144 ++ * @bfqq: the queue to expire.
3145 ++ * @compensate: if true, compensate for the time spent idling.
3146 ++ * @reason: the reason causing the expiration.
3147 ++ *
3149 ++ * If the process associated with the queue is slow (i.e., seeky), or in
3150 ++ * case of budget timeout, or, finally, if it is async, we
3151 ++ * artificially charge it an entire budget (independently of the
3152 ++ * actual service it received). As a consequence, the queue will get
3153 ++ * higher timestamps than the correct ones upon reactivation, and
3154 ++ * hence it will be rescheduled as if it had received more service
3155 ++ * than what it actually received. In the end, this class of processes
3156 ++ * will receive less service in proportion to how slowly they consume
3157 ++ * their budgets (and hence how seriously they tend to lower the
3158 ++ * throughput).
3159 ++ *
3160 ++ * In contrast, when a queue expires because it has been idling for
3161 ++ * too long or because it has exhausted its budget, we do not touch the
3162 ++ * amount of service it has received. Hence, when the queue is
3163 ++ * reactivated and its timestamps are updated, the latter will be in sync
3164 ++ * with the actual service received by the queue until expiration.
3165 ++ *
3166 ++ * Charging a full budget to the first type of queues and the exact
3167 ++ * service to the others has the effect of using the WF2Q+ policy to
3168 ++ * schedule the former on a timeslice basis, without violating the
3169 ++ * service domain guarantees of the latter.
3170 ++ */
3171 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
3172 ++ struct bfq_queue *bfqq,
3173 ++ int compensate,
3174 ++ enum bfqq_expiration reason)
3175 ++{
3176 ++ int slow;
3177 ++ BUG_ON(bfqq != bfqd->in_service_queue);
3178 ++
3179 ++ /* Update disk peak rate for autotuning and check whether the
3180 ++ * process is slow (see bfq_update_peak_rate).
3181 ++ */
3182 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
3183 ++
3184 ++ /*
3185 ++	 * As explained above, 'punish' slow (i.e., seeky), timed-out
3186 ++ * and async queues, to favor sequential sync workloads.
3187 ++ *
3188 ++ * Processes doing I/O in the slower disk zones will tend to be
3189 ++ * slow(er) even if not seeky. Hence, since the estimated peak
3190 ++ * rate is actually an average over the disk surface, these
3191 ++ * processes may timeout just for bad luck. To avoid punishing
3192 ++ * them we do not charge a full budget to a process that
3193 ++ * succeeded in consuming at least 2/3 of its budget.
3194 ++ */
3195 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
3196 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
3197 ++ bfq_bfqq_charge_full_budget(bfqq);
3198 ++
3199 ++ bfqq->service_from_backlogged += bfqq->entity.service;
3200 ++
3201 ++ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
3202 ++ !bfq_bfqq_constantly_seeky(bfqq)) {
3203 ++ bfq_mark_bfqq_constantly_seeky(bfqq);
3204 ++ if (!blk_queue_nonrot(bfqd->queue))
3205 ++ bfqd->const_seeky_busy_in_flight_queues++;
3206 ++ }
3207 ++
3208 ++ if (reason == BFQ_BFQQ_TOO_IDLE &&
3209 ++	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
3210 ++ bfq_clear_bfqq_IO_bound(bfqq);
3211 ++
3212 ++ if (bfqd->low_latency && bfqq->wr_coeff == 1)
3213 ++ bfqq->last_wr_start_finish = jiffies;
3214 ++
3215 ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
3216 ++ RB_EMPTY_ROOT(&bfqq->sort_list)) {
3217 ++ /*
3218 ++ * If we get here, and there are no outstanding requests,
3219 ++ * then the request pattern is isochronous (see the comments
3220 ++ * to the function bfq_bfqq_softrt_next_start()). Hence we
3221 ++ * can compute soft_rt_next_start. If, instead, the queue
3222 ++ * still has outstanding requests, then we have to wait
3223 ++ * for the completion of all the outstanding requests to
3224 ++ * discover whether the request pattern is actually
3225 ++ * isochronous.
3226 ++ */
3227 ++ if (bfqq->dispatched == 0)
3228 ++ bfqq->soft_rt_next_start =
3229 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3230 ++ else {
3231 ++ /*
3232 ++ * The application is still waiting for the
3233 ++ * completion of one or more requests:
3234 ++ * prevent it from possibly being incorrectly
3235 ++ * deemed as soft real-time by setting its
3236 ++ * soft_rt_next_start to infinity. In fact,
3237 ++ * without this assignment, the application
3238 ++ * would be incorrectly deemed as soft
3239 ++ * real-time if:
3240 ++ * 1) it issued a new request before the
3241 ++ * completion of all its in-flight
3242 ++ * requests, and
3243 ++ * 2) at that time, its soft_rt_next_start
3244 ++ * happened to be in the past.
3245 ++ */
3246 ++ bfqq->soft_rt_next_start =
3247 ++ bfq_infinity_from_now(jiffies);
3248 ++ /*
3249 ++ * Schedule an update of soft_rt_next_start to when
3250 ++ * the task may be discovered to be isochronous.
3251 ++ */
3252 ++ bfq_mark_bfqq_softrt_update(bfqq);
3253 ++ }
3254 ++ }
3255 ++
3256 ++ bfq_log_bfqq(bfqd, bfqq,
3257 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
3258 ++ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
3259 ++
3260 ++ /*
3261 ++ * Increase, decrease or leave budget unchanged according to
3262 ++ * reason.
3263 ++ */
3264 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
3265 ++ __bfq_bfqq_expire(bfqd, bfqq);
3266 ++}
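
The charging policy described in the comment above can be pictured with the basic WF2Q+ timestamping rule: a queue's virtual finish time advances by the charged service divided by its weight, so charging a full budget instead of the little service actually received pushes the queue further into the virtual future. The snippet below is only a schematic illustration of that effect, with invented numbers; it is not the patch's timestamp code.

#include <stdio.h>

struct queue_stamp {
	unsigned long start;	/* virtual start time */
	unsigned long finish;	/* virtual finish time */
	unsigned int weight;
};

/* Advance the finish timestamp by the charged service, weighted. */
static void charge(struct queue_stamp *q, unsigned long charged_service)
{
	q->finish = q->start + charged_service / q->weight;
}

int main(void)
{
	struct queue_stamp fair = { 0, 0, 10 };	/* charged what it consumed */
	struct queue_stamp slow = { 0, 0, 10 };	/* charged its full budget  */
	unsigned long budget = 8192, received = 1024;

	charge(&fair, received);	/* finish = 0 + 1024/10 = 102 */
	charge(&slow, budget);		/* finish = 0 + 8192/10 = 819 */

	printf("finish(actual service)=%lu finish(full budget)=%lu\n",
	       fair.finish, slow.finish);
	return 0;
}

In this toy example the slow queue's next activation lands roughly eight times further into the virtual future than the fairly charged one's, which is exactly the "less service in proportion to how slowly they consume their budgets" effect described above.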
3267 ++
3268 ++/*
3269 ++ * Budget timeout is not implemented through a dedicated timer, but
3270 ++ * just checked on request arrivals and completions, as well as on
3271 ++ * idle timer expirations.
3272 ++ */
3273 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
3274 ++{
3275 ++ if (bfq_bfqq_budget_new(bfqq) ||
3276 ++ time_before(jiffies, bfqq->budget_timeout))
3277 ++ return 0;
3278 ++ return 1;
3279 ++}
3280 ++
3281 ++/*
3282 ++ * If we expire a queue that is waiting for the arrival of a new
3283 ++ * request, we may prevent the fictitious timestamp back-shifting that
3284 ++ * allows the guarantees of the queue to be preserved (see [1] for
3285 ++ * this tricky aspect). Hence we return true only if this condition
3286 ++ * does not hold, or if the queue is slow enough to deserve only to be
3287 ++ * kicked off for preserving a high throughput.
3288 ++ */
3289 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
3290 ++{
3291 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
3292 ++ "may_budget_timeout: wait_request %d left %d timeout %d",
3293 ++ bfq_bfqq_wait_request(bfqq),
3294 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
3295 ++ bfq_bfqq_budget_timeout(bfqq));
3296 ++
3297 ++ return (!bfq_bfqq_wait_request(bfqq) ||
3298 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
3299 ++ &&
3300 ++ bfq_bfqq_budget_timeout(bfqq);
3301 ++}
3302 ++
3303 ++/*
3304 ++ * Device idling is allowed only for the queues for which this function
3305 ++ * returns true. For this reason, the return value of this function plays a
3306 ++ * critical role for both throughput boosting and service guarantees. The
3307 ++ * return value is computed through a logical expression. In this rather
3308 ++ * long comment, we try to briefly describe all the details and motivations
3309 ++ * behind the components of this logical expression.
3310 ++ *
3311 ++ * First, the expression is false if bfqq is not sync, or if: bfqq happened
3312 ++ * to become active during a large burst of queue activations, and the
3313 ++ * pattern of requests bfqq contains boosts the throughput if bfqq is
3314 ++ * expired. In fact, queues that became active during a large burst benefit
3315 ++ * only from throughput, as discussed in the comments to bfq_handle_burst.
3316 ++ * In this respect, expiring bfqq certainly boosts the throughput on NCQ-
3317 ++ * capable flash-based devices, whereas, on rotational devices, it boosts
3318 ++ * the throughput only if bfqq contains random requests.
3319 ++ *
3320 ++ * On the opposite end, if (a) bfqq is sync, (b) the above burst-related
3321 ++ * condition does not hold, and (c) bfqq is being weight-raised, then the
3322 ++ * expression always evaluates to true, as device idling is instrumental
3323 ++ * for preserving low-latency guarantees (see [1]). If, instead, conditions
3324 ++ * (a) and (b) do hold, but (c) does not, then the expression evaluates to
3325 ++ * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and
3326 ++ * (2) at least one of the following two conditions holds.
3327 ++ * The first condition is that the device is not performing NCQ, because
3328 ++ * idling the device most certainly boosts the throughput if this condition
3329 ++ * holds and bfqq is I/O-bound and has been granted a non-null idle window.
3330 ++ * The second compound condition is made of the logical AND of two components.
3331 ++ *
3332 ++ * The first component is true only if there is no weight-raised busy
3333 ++ * queue. This guarantees that the device is not idled for a sync non-
3334 ++ * weight-raised queue when there are busy weight-raised queues. The former
3335 ++ * is then expired immediately if empty. Combined with the timestamping
3336 ++ * rules of BFQ (see [1] for details), this causes sync non-weight-raised
3337 ++ * queues to get a lower number of requests served, and hence to ask for a
3338 ++ * lower number of requests from the request pool, before the busy weight-
3339 ++ * raised queues get served again.
3340 ++ *
3341 ++ * This is beneficial for the processes associated with weight-raised
3342 ++ * queues, when the request pool is saturated (e.g., in the presence of
3343 ++ * write hogs). In fact, if the processes associated with the other queues
3344 ++ * ask for requests at a lower rate, then weight-raised processes have a
3345 ++ * higher probability to get a request from the pool immediately (or at
3346 ++ * least soon) when they need one. Hence they have a higher probability to
3347 ++ * actually get a fraction of the disk throughput proportional to their
3348 ++ * high weight. This is especially true with NCQ-capable drives, which
3349 ++ * enqueue several requests in advance and further reorder internally-
3350 ++ * queued requests.
3351 ++ *
3352 ++ * In the end, mistreating non-weight-raised queues when there are busy
3353 ++ * weight-raised queues seems to mitigate starvation problems in the
3354 ++ * presence of heavy write workloads and NCQ, and hence to guarantee a
3355 ++ * higher application and system responsiveness in these hostile scenarios.
3356 ++ *
3357 ++ * If the first component of the compound condition is instead true, i.e.,
3358 ++ * there is no weight-raised busy queue, then the second component of the
3359 ++ * compound condition takes into account service-guarantee and throughput
3360 ++ * issues related to NCQ (recall that the compound condition is evaluated
3361 ++ * only if the device is detected as supporting NCQ).
3362 ++ *
3363 ++ * As for service guarantees, allowing the drive to enqueue more than one
3364 ++ * request at a time, and hence delegating de facto final scheduling
3365 ++ * decisions to the drive's internal scheduler, causes loss of control on
3366 ++ * the actual request service order. In this respect, when the drive is
3367 ++ * allowed to enqueue more than one request at a time, the service
3368 ++ * distribution enforced by the drive's internal scheduler is likely to
3369 ++ * coincide with the desired device-throughput distribution only in the
3370 ++ * following, perfectly symmetric, scenario:
3371 ++ * 1) all active queues have the same weight,
3372 ++ * 2) all active groups at the same level in the groups tree have the same
3373 ++ * weight,
3374 ++ * 3) all active groups at the same level in the groups tree have the same
3375 ++ * number of children.
3376 ++ *
3377 ++ * Even in such a scenario, sequential I/O may still receive a preferential
3378 ++ * treatment, but this is not likely to be a big issue with flash-based
3379 ++ * devices, because of their non-dramatic loss of throughput with random
3380 ++ * I/O. Things do differ with HDDs, for which additional care is taken, as
3381 ++ * explained after completing the discussion for flash-based devices.
3382 ++ *
3383 ++ * Unfortunately, keeping the necessary state for evaluating exactly the
3384 ++ * above symmetry conditions would be quite complex and time-consuming.
3385 ++ * Therefore BFQ evaluates instead the following stronger sub-conditions,
3386 ++ * for which it is much easier to maintain the needed state:
3387 ++ * 1) all active queues have the same weight,
3388 ++ * 2) all active groups have the same weight,
3389 ++ * 3) all active groups have at most one active child each.
3390 ++ * In particular, the last two conditions are always true if hierarchical
3391 ++ * support and the cgroups interface are not enabled, hence no state needs
3392 ++ * to be maintained in this case.
3393 ++ *
3394 ++ * According to the above considerations, the second component of the
3395 ++ * compound condition evaluates to true if any of the above symmetry
3396 ++ * sub-conditions does not hold, or the device is not flash-based. Therefore,
3397 ++ * if also the first component is true, then idling is allowed for a sync
3398 ++ * queue. These are the only sub-conditions considered if the device is
3399 ++ * flash-based, as, for such a device, it is sensible to force idling only
3400 ++ * for service-guarantee issues. In fact, as for throughput, idling
3401 ++ * NCQ-capable flash-based devices would not boost the throughput even
3402 ++ * with sequential I/O; rather it would lower the throughput in proportion
3403 ++ * to how fast the device is. In the end, (only) if all three
3404 ++ * sub-conditions hold and the device is flash-based, the compound
3405 ++ * condition evaluates to false and therefore no idling is performed.
3406 ++ *
3407 ++ * As already said, things change with a rotational device, where idling
3408 ++ * boosts the throughput with sequential I/O (even with NCQ). Hence, for
3409 ++ * such a device the second component of the compound condition evaluates
3410 ++ * to true also if the following additional sub-condition does not hold:
3411 ++ * the queue is constantly seeky. Unfortunately, this different behavior
3412 ++ * with respect to flash-based devices causes an additional asymmetry: if
3413 ++ * some sync queues enjoy idling and some other sync queues do not, then
3414 ++ * the latter get a low share of the device throughput, simply because the
3415 ++ * former get many requests served after being set as in service, whereas
3416 ++ * the latter do not. As a consequence, to guarantee the desired throughput
3417 ++ * distribution, on HDDs the compound expression evaluates to true (and
3418 ++ * hence device idling is performed) also if the following last symmetry
3419 ++ * condition does not hold: no other queue is benefiting from idling. Also
3420 ++ * this last condition is actually replaced with a simpler-to-maintain and
3421 ++ * stronger condition: there is no busy queue which is not constantly seeky
3422 ++ * (and hence may also benefit from idling).
3423 ++ *
3424 ++ * To sum up, when all the required symmetry and throughput-boosting
3425 ++ * sub-conditions hold, the second component of the compound condition
3426 ++ * evaluates to false, and hence no idling is performed. This helps to
3427 ++ * keep the drives' internal queues full on NCQ-capable devices, and hence
3428 ++ * to boost the throughput, without causing 'almost' any loss of service
3429 ++ * guarantees. The 'almost' follows from the fact that, if the internal
3430 ++ * queue of one such device is filled while all the sub-conditions hold,
3431 ++ * but at some point in time some sub-condition ceases to hold, then it may
3432 ++ * become impossible to let requests be served in the new desired order
3433 ++ * until all the requests already queued in the device have been served.
3434 ++ */
3435 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
3436 ++{
3437 ++ struct bfq_data *bfqd = bfqq->bfqd;
3438 ++#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \
3439 ++ bfqd->busy_in_flight_queues == \
3440 ++ bfqd->const_seeky_busy_in_flight_queues)
3441 ++
3442 ++#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \
3443 ++ bfqd->hw_tag && \
3444 ++ (blk_queue_nonrot(bfqd->queue) || \
3445 ++ bfq_bfqq_constantly_seeky(bfqq)))
3446 ++
3447 ++/*
3448 ++ * Condition for expiring a non-weight-raised queue (and hence not idling
3449 ++ * the device).
3450 ++ */
3451 ++#define cond_for_expiring_non_wr (bfqd->hw_tag && \
3452 ++ (bfqd->wr_busy_queues > 0 || \
3453 ++ (blk_queue_nonrot(bfqd->queue) || \
3454 ++ cond_for_seeky_on_ncq_hdd)))
3455 ++
3456 ++ return bfq_bfqq_sync(bfqq) &&
3457 ++ !cond_for_expiring_in_burst &&
3458 ++ (bfqq->wr_coeff > 1 || !symmetric_scenario ||
3459 ++ (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&
3460 ++ !cond_for_expiring_non_wr)
3461 ++ );
3462 ++}
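
Because the expression above is assembled from macros, its overall shape is easy to lose. The sketch below flattens it into a pure boolean function whose parameter names are illustrative restatements of the macro conditions (symmetric_scenario itself is computed elsewhere in the patch), and evaluates it for three representative scenarios.

#include <stdbool.h>
#include <stdio.h>

/*
 * Flattened restatement of the idling decision above: every input that the
 * real code reads from bfqd/bfqq state is passed in as a plain boolean.
 */
static bool must_not_expire(bool sync, bool in_large_burst_and_boosts_thr,
			    bool weight_raised, bool symmetric_scenario,
			    bool io_bound, bool idle_window,
			    bool expiring_non_wr_pays_off)
{
	if (!sync || in_large_burst_and_boosts_thr)
		return false;
	return weight_raised ||
	       !symmetric_scenario ||
	       (io_bound && idle_window && !expiring_non_wr_pays_off);
}

int main(void)
{
	/* Weight-raised sync queue: always idle for it. */
	printf("%d\n", must_not_expire(true, false, true, true, true, true, true));  /* 1 */
	/* Sync, not raised, fully symmetric scenario where expiring pays off:
	 * no idling. */
	printf("%d\n", must_not_expire(true, false, false, true, true, true, true)); /* 0 */
	/* Asymmetric scenario: idle to preserve service guarantees. */
	printf("%d\n", must_not_expire(true, false, false, false, true, true, true));/* 1 */
	return 0;
}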
3463 ++
3464 ++/*
3465 ++ * If the in-service queue is empty but sync, and the function
3466 ++ * bfq_bfqq_must_not_expire returns true, then:
3467 ++ * 1) the queue must remain in service and cannot be expired, and
3468 ++ * 2) the disk must be idled to wait for the possible arrival of a new
3469 ++ * request for the queue.
3470 ++ * See the comments to the function bfq_bfqq_must_not_expire for the reasons
3471 ++ * why performing device idling is the best choice to boost the throughput
3472 ++ * and preserve service guarantees when bfq_bfqq_must_not_expire itself
3473 ++ * returns true.
3474 ++ */
3475 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
3476 ++{
3477 ++ struct bfq_data *bfqd = bfqq->bfqd;
3478 ++
3479 ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
3480 ++ bfq_bfqq_must_not_expire(bfqq);
3481 ++}
3482 ++
3483 ++/*
3484 ++ * Select a queue for service. If we have a current queue in service,
3485 ++ * check whether to continue servicing it, or retrieve and set a new one.
3486 ++ */
3487 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3488 ++{
3489 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
3490 ++ struct request *next_rq;
3491 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3492 ++
3493 ++ bfqq = bfqd->in_service_queue;
3494 ++ if (bfqq == NULL)
3495 ++ goto new_queue;
3496 ++
3497 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3498 ++
3499 ++ /*
3500 ++ * If another queue has a request waiting within our mean seek
3501 ++ * distance, let it run. The expire code will check for close
3502 ++ * cooperators and put the close queue at the front of the
3503 ++ * service tree. If possible, merge the expiring queue with the
3504 ++ * new bfqq.
3505 ++ */
3506 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
3507 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
3508 ++ bfq_setup_merge(bfqq, new_bfqq);
3509 ++
3510 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
3511 ++ !timer_pending(&bfqd->idle_slice_timer) &&
3512 ++ !bfq_bfqq_must_idle(bfqq))
3513 ++ goto expire;
3514 ++
3515 ++ next_rq = bfqq->next_rq;
3516 ++ /*
3517 ++ * If bfqq has requests queued and it has enough budget left to
3518 ++ * serve them, keep the queue, otherwise expire it.
3519 ++ */
3520 ++ if (next_rq != NULL) {
3521 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
3522 ++ bfq_bfqq_budget_left(bfqq)) {
3523 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
3524 ++ goto expire;
3525 ++ } else {
3526 ++ /*
3527 ++ * The idle timer may be pending because we may
3528 ++ * not disable disk idling even when a new request
3529 ++ * arrives.
3530 ++ */
3531 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
3532 ++ /*
3533 ++ * If we get here: 1) at least a new request
3534 ++ * has arrived but we have not disabled the
3535 ++ * timer because the request was too small,
3536 ++ * 2) then the block layer has unplugged
3537 ++ * the device, causing the dispatch to be
3538 ++ * invoked.
3539 ++ *
3540 ++ * Since the device is unplugged, now the
3541 ++ * requests are probably large enough to
3542 ++ * provide a reasonable throughput.
3543 ++ * So we disable idling.
3544 ++ */
3545 ++ bfq_clear_bfqq_wait_request(bfqq);
3546 ++ del_timer(&bfqd->idle_slice_timer);
3547 ++ }
3548 ++ if (new_bfqq == NULL)
3549 ++ goto keep_queue;
3550 ++ else
3551 ++ goto expire;
3552 ++ }
3553 ++ }
3554 ++
3555 ++ /*
3556 ++ * No requests pending. However, if the in-service queue is idling
3557 ++ * for a new request, or has requests waiting for a completion and
3558 ++ * may idle after their completion, then keep it anyway.
3559 ++ */
3560 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
3561 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
3562 ++ bfqq = NULL;
3563 ++ goto keep_queue;
3564 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
3565 ++ /*
3566 ++ * Expiring the queue because there is a close cooperator,
3567 ++ * cancel timer.
3568 ++ */
3569 ++ bfq_clear_bfqq_wait_request(bfqq);
3570 ++ del_timer(&bfqd->idle_slice_timer);
3571 ++ }
3572 ++
3573 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
3574 ++expire:
3575 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
3576 ++new_queue:
3577 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
3578 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
3579 ++ bfqq != NULL ? bfqq->pid : 0);
3580 ++keep_queue:
3581 ++ return bfqq;
3582 ++}
3583 ++
3584 ++static void bfq_update_wr_data(struct bfq_data *bfqd,
3585 ++ struct bfq_queue *bfqq)
3586 ++{
3587 ++ if (bfqq->wr_coeff > 1) { /* queue is being boosted */
3588 ++ struct bfq_entity *entity = &bfqq->entity;
3589 ++
3590 ++ bfq_log_bfqq(bfqd, bfqq,
3591 ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
3592 ++ jiffies_to_msecs(jiffies -
3593 ++ bfqq->last_wr_start_finish),
3594 ++ jiffies_to_msecs(bfqq->wr_cur_max_time),
3595 ++ bfqq->wr_coeff,
3596 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
3597 ++
3598 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
3599 ++ entity->orig_weight * bfqq->wr_coeff);
3600 ++ if (entity->ioprio_changed)
3601 ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
3602 ++ /*
3603 ++ * If the queue was activated in a burst, or
3604 ++ * too much time has elapsed from the beginning
3605 ++ * of this weight-raising, then end weight raising.
3606 ++ */
3607 ++ if (bfq_bfqq_in_large_burst(bfqq) ||
3608 ++ time_is_before_jiffies(bfqq->last_wr_start_finish +
3609 ++ bfqq->wr_cur_max_time)) {
3610 ++ bfqq->last_wr_start_finish = jiffies;
3611 ++ bfq_log_bfqq(bfqd, bfqq,
3612 ++ "wrais ending at %lu, rais_max_time %u",
3613 ++ bfqq->last_wr_start_finish,
3614 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
3615 ++ bfq_bfqq_end_wr(bfqq);
3616 ++ __bfq_entity_update_weight_prio(
3617 ++ bfq_entity_service_tree(entity),
3618 ++ entity);
3619 ++ }
3620 ++ }
3621 ++}
3622 ++
3623 ++/*
3624 ++ * Dispatch one request from bfqq, moving it to the request queue
3625 ++ * dispatch list.
3626 ++ */
3627 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
3628 ++ struct bfq_queue *bfqq)
3629 ++{
3630 ++ int dispatched = 0;
3631 ++ struct request *rq;
3632 ++ unsigned long service_to_charge;
3633 ++
3634 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
3635 ++
3636 ++ /* Follow expired path, else get first next available. */
3637 ++ rq = bfq_check_fifo(bfqq);
3638 ++ if (rq == NULL)
3639 ++ rq = bfqq->next_rq;
3640 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
3641 ++
3642 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
3643 ++ /*
3644 ++ * This may happen if the next rq is chosen in fifo order
3645 ++ * instead of sector order. The budget is properly
3646 ++ * dimensioned to be always sufficient to serve the next
3647 ++ * request only if it is chosen in sector order. The reason
3648 ++		 * is that it would be quite inefficient, and of little use, to
3649 ++ * to always make sure that the budget is large enough to
3650 ++ * serve even the possible next rq in fifo order.
3651 ++ * In fact, requests are seldom served in fifo order.
3652 ++ *
3653 ++ * Expire the queue for budget exhaustion, and make sure
3654 ++ * that the next act_budget is enough to serve the next
3655 ++ * request, even if it comes from the fifo expired path.
3656 ++ */
3657 ++ bfqq->next_rq = rq;
3658 ++ /*
3659 ++		 * Since this dispatch has failed, make sure that
3660 ++ * a new one will be performed
3661 ++ */
3662 ++ if (!bfqd->rq_in_driver)
3663 ++ bfq_schedule_dispatch(bfqd);
3664 ++ goto expire;
3665 ++ }
3666 ++
3667 ++ /* Finally, insert request into driver dispatch list. */
3668 ++ bfq_bfqq_served(bfqq, service_to_charge);
3669 ++ bfq_dispatch_insert(bfqd->queue, rq);
3670 ++
3671 ++ bfq_update_wr_data(bfqd, bfqq);
3672 ++
3673 ++ bfq_log_bfqq(bfqd, bfqq,
3674 ++ "dispatched %u sec req (%llu), budg left %lu",
3675 ++ blk_rq_sectors(rq),
3676 ++ (long long unsigned)blk_rq_pos(rq),
3677 ++ bfq_bfqq_budget_left(bfqq));
3678 ++
3679 ++ dispatched++;
3680 ++
3681 ++ if (bfqd->in_service_bic == NULL) {
3682 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3683 ++ bfqd->in_service_bic = RQ_BIC(rq);
3684 ++ }
3685 ++
3686 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3687 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3688 ++ bfq_class_idle(bfqq)))
3689 ++ goto expire;
3690 ++
3691 ++ return dispatched;
3692 ++
3693 ++expire:
3694 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3695 ++ return dispatched;
3696 ++}
3697 ++
3698 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3699 ++{
3700 ++ int dispatched = 0;
3701 ++
3702 ++ while (bfqq->next_rq != NULL) {
3703 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3704 ++ dispatched++;
3705 ++ }
3706 ++
3707 ++ BUG_ON(!list_empty(&bfqq->fifo));
3708 ++ return dispatched;
3709 ++}
3710 ++
3711 ++/*
3712 ++ * Drain our current requests.
3713 ++ * Used for barriers and when switching io schedulers on-the-fly.
3714 ++ */
3715 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
3716 ++{
3717 ++ struct bfq_queue *bfqq, *n;
3718 ++ struct bfq_service_tree *st;
3719 ++ int dispatched = 0;
3720 ++
3721 ++ bfqq = bfqd->in_service_queue;
3722 ++ if (bfqq != NULL)
3723 ++ __bfq_bfqq_expire(bfqd, bfqq);
3724 ++
3725 ++ /*
3726 ++ * Loop through classes, and be careful to leave the scheduler
3727 ++ * in a consistent state, as feedback mechanisms and vtime
3728 ++ * updates cannot be disabled during the process.
3729 ++ */
3730 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3731 ++ st = bfq_entity_service_tree(&bfqq->entity);
3732 ++
3733 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3734 ++ bfqq->max_budget = bfq_max_budget(bfqd);
3735 ++
3736 ++ bfq_forget_idle(st);
3737 ++ }
3738 ++
3739 ++ BUG_ON(bfqd->busy_queues != 0);
3740 ++
3741 ++ return dispatched;
3742 ++}
3743 ++
3744 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
3745 ++{
3746 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3747 ++ struct bfq_queue *bfqq;
3748 ++ int max_dispatch;
3749 ++
3750 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3751 ++ if (bfqd->busy_queues == 0)
3752 ++ return 0;
3753 ++
3754 ++ if (unlikely(force))
3755 ++ return bfq_forced_dispatch(bfqd);
3756 ++
3757 ++ bfqq = bfq_select_queue(bfqd);
3758 ++ if (bfqq == NULL)
3759 ++ return 0;
3760 ++
3761 ++ if (bfq_class_idle(bfqq))
3762 ++ max_dispatch = 1;
3763 ++
3764 ++ if (!bfq_bfqq_sync(bfqq))
3765 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
3766 ++
3767 ++ if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {
3768 ++ if (bfqd->busy_queues > 1)
3769 ++ return 0;
3770 ++ if (bfqq->dispatched >= 4 * max_dispatch)
3771 ++ return 0;
3772 ++ }
3773 ++
3774 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3775 ++ return 0;
3776 ++
3777 ++ bfq_clear_bfqq_wait_request(bfqq);
3778 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3779 ++
3780 ++ if (!bfq_dispatch_request(bfqd, bfqq))
3781 ++ return 0;
3782 ++
3783 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
3784 ++ bfq_bfqq_sync(bfqq) ? "sync" : "async");
3785 ++
3786 ++ return 1;
3787 ++}
3788 ++
3789 ++/*
3790 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
3791 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
3792 ++ *
3793 ++ * Queue lock must be held here.
3794 ++ */
3795 ++static void bfq_put_queue(struct bfq_queue *bfqq)
3796 ++{
3797 ++ struct bfq_data *bfqd = bfqq->bfqd;
3798 ++
3799 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3800 ++
3801 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3802 ++ atomic_read(&bfqq->ref));
3803 ++ if (!atomic_dec_and_test(&bfqq->ref))
3804 ++ return;
3805 ++
3806 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3807 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3808 ++ BUG_ON(bfqq->entity.tree != NULL);
3809 ++ BUG_ON(bfq_bfqq_busy(bfqq));
3810 ++ BUG_ON(bfqd->in_service_queue == bfqq);
3811 ++
3812 ++ if (bfq_bfqq_sync(bfqq))
3813 ++ /*
3814 ++ * The fact that this queue is being destroyed does not
3815 ++ * invalidate the fact that this queue may have been
3816 ++ * activated during the current burst. As a consequence,
3817 ++	 * although the queue no longer exists, and hence
3818 ++	 * needs to be removed from the burst list if present there,
3819 ++	 * the burst size must not be decremented.
3820 ++ */
3821 ++ hlist_del_init(&bfqq->burst_list_node);
3822 ++
3823 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3824 ++
3825 ++ kmem_cache_free(bfq_pool, bfqq);
3826 ++}
3827 ++
3828 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
3829 ++{
3830 ++ struct bfq_queue *__bfqq, *next;
3831 ++
3832 ++ /*
3833 ++ * If this queue was scheduled to merge with another queue, be
3834 ++ * sure to drop the reference taken on that queue (and others in
3835 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3836 ++ */
3837 ++ __bfqq = bfqq->new_bfqq;
3838 ++ while (__bfqq) {
3839 ++ if (__bfqq == bfqq)
3840 ++ break;
3841 ++ next = __bfqq->new_bfqq;
3842 ++ bfq_put_queue(__bfqq);
3843 ++ __bfqq = next;
3844 ++ }
3845 ++}
3846 ++
3847 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3848 ++{
3849 ++ if (bfqq == bfqd->in_service_queue) {
3850 ++ __bfq_bfqq_expire(bfqd, bfqq);
3851 ++ bfq_schedule_dispatch(bfqd);
3852 ++ }
3853 ++
3854 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3855 ++ atomic_read(&bfqq->ref));
3856 ++
3857 ++ bfq_put_cooperator(bfqq);
3858 ++
3859 ++ bfq_put_queue(bfqq);
3860 ++}
3861 ++
3862 ++static inline void bfq_init_icq(struct io_cq *icq)
3863 ++{
3864 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3865 ++
3866 ++ bic->ttime.last_end_request = jiffies;
3867 ++}
3868 ++
3869 ++static void bfq_exit_icq(struct io_cq *icq)
3870 ++{
3871 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3872 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
3873 ++
3874 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
3875 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3876 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
3877 ++ }
3878 ++
3879 ++ if (bic->bfqq[BLK_RW_SYNC]) {
3880 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3881 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
3882 ++ }
3883 ++}
3884 ++
3885 ++/*
3886 ++ * Update the entity prio values; note that the new values will not
3887 ++ * be used until the next (re)activation.
3888 ++ */
3889 ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3890 ++{
3891 ++ struct task_struct *tsk = current;
3892 ++ int ioprio_class;
3893 ++
3894 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3895 ++ switch (ioprio_class) {
3896 ++ default:
3897 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3898 ++ "bfq: bad prio class %d\n", ioprio_class);
3899 ++ case IOPRIO_CLASS_NONE:
3900 ++ /*
3901 ++ * No prio set, inherit CPU scheduling settings.
3902 ++ */
3903 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3904 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3905 ++ break;
3906 ++ case IOPRIO_CLASS_RT:
3907 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3908 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3909 ++ break;
3910 ++ case IOPRIO_CLASS_BE:
3911 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3912 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3913 ++ break;
3914 ++ case IOPRIO_CLASS_IDLE:
3915 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3916 ++ bfqq->entity.new_ioprio = 7;
3917 ++ bfq_clear_bfqq_idle_window(bfqq);
3918 ++ break;
3919 ++ }
3920 ++
3921 ++ if (bfqq->entity.new_ioprio < 0 ||
3922 ++ bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {
3923 ++ printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",
3924 ++ bfqq->entity.new_ioprio);
3925 ++ BUG();
3926 ++ }
3927 ++
3928 ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio);
3929 ++ bfqq->entity.ioprio_changed = 1;
3930 ++}
3931 ++
3932 ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic)
3933 ++{
3934 ++ struct bfq_data *bfqd;
3935 ++ struct bfq_queue *bfqq, *new_bfqq;
3936 ++ struct bfq_group *bfqg;
3937 ++ unsigned long uninitialized_var(flags);
3938 ++ int ioprio = bic->icq.ioc->ioprio;
3939 ++
3940 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3941 ++ &flags);
3942 ++ /*
3943 ++ * This condition may trigger on a newly created bic, be sure to
3944 ++ * drop the lock before returning.
3945 ++ */
3946 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3947 ++ goto out;
3948 ++
3949 ++ bic->ioprio = ioprio;
3950 ++
3951 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
3952 ++ if (bfqq != NULL) {
3953 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3954 ++ sched_data);
3955 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3956 ++ GFP_ATOMIC);
3957 ++ if (new_bfqq != NULL) {
3958 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3959 ++ bfq_log_bfqq(bfqd, bfqq,
3960 ++ "check_ioprio_change: bfqq %p %d",
3961 ++ bfqq, atomic_read(&bfqq->ref));
3962 ++ bfq_put_queue(bfqq);
3963 ++ }
3964 ++ }
3965 ++
3966 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
3967 ++ if (bfqq != NULL)
3968 ++ bfq_set_next_ioprio_data(bfqq, bic);
3969 ++
3970 ++out:
3971 ++ bfq_put_bfqd_unlock(bfqd, &flags);
3972 ++}
3973 ++
3974 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3975 ++ struct bfq_io_cq *bic, pid_t pid, int is_sync)
3976 ++{
3977 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3978 ++ INIT_LIST_HEAD(&bfqq->fifo);
3979 ++ INIT_HLIST_NODE(&bfqq->burst_list_node);
3980 ++
3981 ++ atomic_set(&bfqq->ref, 0);
3982 ++ bfqq->bfqd = bfqd;
3983 ++
3984 ++ if (bic)
3985 ++ bfq_set_next_ioprio_data(bfqq, bic);
3986 ++
3987 ++ if (is_sync) {
3988 ++ if (!bfq_class_idle(bfqq))
3989 ++ bfq_mark_bfqq_idle_window(bfqq);
3990 ++ bfq_mark_bfqq_sync(bfqq);
3991 ++ }
3992 ++ bfq_mark_bfqq_IO_bound(bfqq);
3993 ++
3994 ++ /* Tentative initial value to trade off between thr and lat */
3995 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3996 ++ bfqq->pid = pid;
3997 ++
3998 ++ bfqq->wr_coeff = 1;
3999 ++ bfqq->last_wr_start_finish = 0;
4000 ++ /*
4001 ++ * Set to the value for which bfqq will not be deemed as
4002 ++ * soft rt when it becomes backlogged.
4003 ++ */
4004 ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
4005 ++}
4006 ++
4007 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
4008 ++ struct bfq_group *bfqg,
4009 ++ int is_sync,
4010 ++ struct bfq_io_cq *bic,
4011 ++ gfp_t gfp_mask)
4012 ++{
4013 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
4014 ++
4015 ++retry:
4016 ++ /* bic always exists here */
4017 ++ bfqq = bic_to_bfqq(bic, is_sync);
4018 ++
4019 ++ /*
4020 ++ * Always try a new alloc if we fall back to the OOM bfqq
4021 ++ * originally, since it should just be a temporary situation.
4022 ++ */
4023 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
4024 ++ bfqq = NULL;
4025 ++ if (new_bfqq != NULL) {
4026 ++ bfqq = new_bfqq;
4027 ++ new_bfqq = NULL;
4028 ++ } else if (gfp_mask & __GFP_WAIT) {
4029 ++ spin_unlock_irq(bfqd->queue->queue_lock);
4030 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
4031 ++ gfp_mask | __GFP_ZERO,
4032 ++ bfqd->queue->node);
4033 ++ spin_lock_irq(bfqd->queue->queue_lock);
4034 ++ if (new_bfqq != NULL)
4035 ++ goto retry;
4036 ++ } else {
4037 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
4038 ++ gfp_mask | __GFP_ZERO,
4039 ++ bfqd->queue->node);
4040 ++ }
4041 ++
4042 ++ if (bfqq != NULL) {
4043 ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
4044 ++ is_sync);
4045 ++ bfq_init_entity(&bfqq->entity, bfqg);
4046 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
4047 ++ } else {
4048 ++ bfqq = &bfqd->oom_bfqq;
4049 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
4050 ++ }
4051 ++ }
4052 ++
4053 ++ if (new_bfqq != NULL)
4054 ++ kmem_cache_free(bfq_pool, new_bfqq);
4055 ++
4056 ++ return bfqq;
4057 ++}
4058 ++
4059 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
4060 ++ struct bfq_group *bfqg,
4061 ++ int ioprio_class, int ioprio)
4062 ++{
4063 ++ switch (ioprio_class) {
4064 ++ case IOPRIO_CLASS_RT:
4065 ++ return &bfqg->async_bfqq[0][ioprio];
4066 ++ case IOPRIO_CLASS_NONE:
4067 ++ ioprio = IOPRIO_NORM;
4068 ++ /* fall through */
4069 ++ case IOPRIO_CLASS_BE:
4070 ++ return &bfqg->async_bfqq[1][ioprio];
4071 ++ case IOPRIO_CLASS_IDLE:
4072 ++ return &bfqg->async_idle_bfqq;
4073 ++ default:
4074 ++ BUG();
4075 ++ }
4076 ++}
4077 ++
4078 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
4079 ++ struct bfq_group *bfqg, int is_sync,
4080 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
4081 ++{
4082 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
4083 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
4084 ++ struct bfq_queue **async_bfqq = NULL;
4085 ++ struct bfq_queue *bfqq = NULL;
4086 ++
4087 ++ if (!is_sync) {
4088 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
4089 ++ ioprio);
4090 ++ bfqq = *async_bfqq;
4091 ++ }
4092 ++
4093 ++ if (bfqq == NULL)
4094 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
4095 ++
4096 ++ /*
4097 ++ * Pin the queue now that it's allocated, scheduler exit will
4098 ++ * prune it.
4099 ++ */
4100 ++ if (!is_sync && *async_bfqq == NULL) {
4101 ++ atomic_inc(&bfqq->ref);
4102 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
4103 ++ bfqq, atomic_read(&bfqq->ref));
4104 ++ *async_bfqq = bfqq;
4105 ++ }
4106 ++
4107 ++ atomic_inc(&bfqq->ref);
4108 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
4109 ++ atomic_read(&bfqq->ref));
4110 ++ return bfqq;
4111 ++}
4112 ++
4113 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
4114 ++ struct bfq_io_cq *bic)
4115 ++{
4116 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
4117 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
4118 ++
4119 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
4120 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
4121 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
4122 ++ bic->ttime.ttime_samples;
4123 ++}
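
The update above is a fixed-point exponentially weighted moving average: the sample count saturates towards 256, the running total keeps 7/8 of its previous value and adds the new think time scaled by 256, and the mean is the rounded ratio of the two. A self-contained sketch with invented think times (in jiffies):

#include <stdio.h>

struct ttime_stats {
	unsigned long samples;
	unsigned long total;
	unsigned long mean;
};

/* Same fixed-point EWMA update sequence as in the function above. */
static void update_thinktime(struct ttime_stats *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * ttime) / 8;
	t->mean    = (t->total + 128) / t->samples;
}

int main(void)
{
	struct ttime_stats t = { 0, 0, 0 };
	unsigned long think_times[] = { 2, 2, 2, 40, 2, 2 };
	unsigned int i;

	for (i = 0; i < sizeof(think_times) / sizeof(think_times[0]); i++) {
		update_thinktime(&t, think_times[i]);
		printf("sample %u: mean=%lu\n", i, t.mean);
	}
	return 0;
}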
4124 ++
4125 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
4126 ++ struct bfq_queue *bfqq,
4127 ++ struct request *rq)
4128 ++{
4129 ++ sector_t sdist;
4130 ++ u64 total;
4131 ++
4132 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
4133 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
4134 ++ else
4135 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
4136 ++
4137 ++ /*
4138 ++ * Don't allow the seek distance to get too large from the
4139 ++ * odd fragment, pagein, etc.
4140 ++ */
4141 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
4142 ++ sdist = 0;
4143 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
4144 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
4145 ++ else
4146 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
4147 ++
4148 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
4149 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
4150 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
4151 ++ do_div(total, bfqq->seek_samples);
4152 ++ bfqq->seek_mean = (sector_t)total;
4153 ++
4154 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
4155 ++ (u64)bfqq->seek_mean);
4156 ++}
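
The seek statistics use the same 7/8 weighting, with an extra clamp so that a single pathological jump (an odd fragment or page-in at the far end of the disk) cannot blow up the running mean. The sketch below mirrors that clamp-then-average sequence with invented request positions; the 2*1024*1024 and 2*1024*64 bounds are copied from the code above, and sector_t is redefined locally only to keep the sketch self-contained.

#include <stdio.h>

typedef unsigned long long sector_t;

struct seek_stats {
	unsigned long samples;
	unsigned long long total;
	sector_t mean;
};

/* Clamp the new seek distance, then fold it into the weighted mean. */
static void update_seek(struct seek_stats *s, sector_t last_pos, sector_t pos)
{
	sector_t sdist = pos > last_pos ? pos - last_pos : last_pos - pos;
	sector_t cap;

	if (s->samples == 0) {			/* first request: not a seek */
		sdist = 0;
	} else {
		cap = s->mean * 4 +
		      (s->samples <= 60 ? 2 * 1024 * 1024 : 2 * 1024 * 64);
		if (sdist > cap)
			sdist = cap;
	}

	s->samples = (7 * s->samples + 256) / 8;
	s->total   = (7 * s->total + 256ULL * sdist) / 8;
	s->mean    = (s->total + s->samples / 2) / s->samples;
}

int main(void)
{
	struct seek_stats s = { 0, 0, 0 };

	update_seek(&s, 0, 1000);		/* first request, distance forced to 0 */
	update_seek(&s, 1000, 1008);		/* nearly sequential */
	update_seek(&s, 1008, 900000000ULL);	/* huge jump, clamped */
	printf("seek mean = %llu sectors\n", (unsigned long long)s.mean);
	return 0;
}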
4157 ++
4158 ++/*
4159 ++ * Disable idle window if the process thinks too long or seeks so much that
4160 ++ * it doesn't matter.
4161 ++ */
4162 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
4163 ++ struct bfq_queue *bfqq,
4164 ++ struct bfq_io_cq *bic)
4165 ++{
4166 ++ int enable_idle;
4167 ++
4168 ++ /* Don't idle for async or idle io prio class. */
4169 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
4170 ++ return;
4171 ++
4172 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
4173 ++
4174 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
4175 ++ bfqd->bfq_slice_idle == 0 ||
4176 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
4177 ++ bfqq->wr_coeff == 1))
4178 ++ enable_idle = 0;
4179 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
4180 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
4181 ++ bfqq->wr_coeff == 1)
4182 ++ enable_idle = 0;
4183 ++ else
4184 ++ enable_idle = 1;
4185 ++ }
4186 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
4187 ++ enable_idle);
4188 ++
4189 ++ if (enable_idle)
4190 ++ bfq_mark_bfqq_idle_window(bfqq);
4191 ++ else
4192 ++ bfq_clear_bfqq_idle_window(bfqq);
4193 ++}
4194 ++
4195 ++/*
4196 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
4197 ++ * something we should do about it.
4198 ++ */
4199 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4200 ++ struct request *rq)
4201 ++{
4202 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
4203 ++
4204 ++ if (rq->cmd_flags & REQ_META)
4205 ++ bfqq->meta_pending++;
4206 ++
4207 ++ bfq_update_io_thinktime(bfqd, bic);
4208 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
4209 ++ if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
4210 ++ bfq_clear_bfqq_constantly_seeky(bfqq);
4211 ++ if (!blk_queue_nonrot(bfqd->queue)) {
4212 ++ BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);
4213 ++ bfqd->const_seeky_busy_in_flight_queues--;
4214 ++ }
4215 ++ }
4216 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
4217 ++ !BFQQ_SEEKY(bfqq))
4218 ++ bfq_update_idle_window(bfqd, bfqq, bic);
4219 ++
4220 ++ bfq_log_bfqq(bfqd, bfqq,
4221 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
4222 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
4223 ++ (long long unsigned)bfqq->seek_mean);
4224 ++
4225 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4226 ++
4227 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
4228 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
4229 ++ blk_rq_sectors(rq) < 32;
4230 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
4231 ++
4232 ++ /*
4233 ++ * There is just this request queued: if the request
4234 ++ * is small and the queue is not to be expired, then
4235 ++ * just exit.
4236 ++ *
4237 ++ * In this way, if the disk is being idled to wait for
4238 ++ * a new request from the in-service queue, we avoid
4239 ++ * unplugging the device and committing the disk to serve
4240 ++ * just a small request. On the contrary, we wait for
4241 ++ * the block layer to decide when to unplug the device:
4242 ++ * hopefully, new requests will be merged to this one
4243 ++ * quickly, then the device will be unplugged and
4244 ++ * larger requests will be dispatched.
4245 ++ */
4246 ++ if (small_req && !budget_timeout)
4247 ++ return;
4248 ++
4249 ++ /*
4250 ++ * A large enough request arrived, or the queue is to
4251 ++ * be expired: in both cases disk idling is to be
4252 ++ * stopped, so clear wait_request flag and reset
4253 ++ * timer.
4254 ++ */
4255 ++ bfq_clear_bfqq_wait_request(bfqq);
4256 ++ del_timer(&bfqd->idle_slice_timer);
4257 ++
4258 ++ /*
4259 ++ * The queue is not empty, because a new request just
4260 ++ * arrived. Hence we can safely expire the queue, in
4261 ++ * case of budget timeout, without risking that the
4262 ++ * timestamps of the queue are not updated correctly.
4263 ++ * See [1] for more details.
4264 ++ */
4265 ++ if (budget_timeout)
4266 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
4267 ++
4268 ++ /*
4269 ++ * Let the request rip immediately, or let a new queue be
4270 ++ * selected if bfqq has just been expired.
4271 ++ */
4272 ++ __blk_run_queue(bfqd->queue);
4273 ++ }
4274 ++}
4275 ++
4276 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
4277 ++{
4278 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
4279 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
4280 ++
4281 ++ assert_spin_locked(bfqd->queue->queue_lock);
4282 ++
4283 ++ bfq_add_request(rq);
4284 ++
4285 ++ rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
4286 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
4287 ++
4288 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
4289 ++}
4290 ++
4291 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
4292 ++{
4293 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
4294 ++ bfqd->rq_in_driver);
4295 ++
4296 ++ if (bfqd->hw_tag == 1)
4297 ++ return;
4298 ++
4299 ++ /*
4300 ++ * This sample is valid if the number of outstanding requests
4301 ++ * is large enough to allow a queueing behavior. Note that the
4302 ++ * sum is not exact, as it's not taking into account deactivated
4303 ++ * requests.
4304 ++ */
4305 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
4306 ++ return;
4307 ++
4308 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
4309 ++ return;
4310 ++
4311 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
4312 ++ bfqd->max_rq_in_driver = 0;
4313 ++ bfqd->hw_tag_samples = 0;
4314 ++}
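
The NCQ-detection heuristic above only counts a sample when there is enough outstanding work to keep a queueing-capable device busy, and after a fixed number of such samples decides whether the driver ever held more than the threshold number of requests at once. A user-space sketch follows, with placeholder values for the two constants (the real BFQ_HW_QUEUE_THRESHOLD and BFQ_HW_QUEUE_SAMPLES are defined earlier in the patch):

#include <stdio.h>

/* Placeholder constants, not the patch's definitions. */
#define HW_QUEUE_THRESHOLD	4
#define HW_QUEUE_SAMPLES	32

struct hwtag_state {
	int hw_tag;		/* -1 unknown, 0 no queueing, 1 queueing detected */
	int max_in_driver;
	int samples;
};

/* Record load peaks while enough work is queued; decide after enough samples. */
static void update_hw_tag(struct hwtag_state *h, int in_driver, int queued)
{
	if (in_driver > h->max_in_driver)
		h->max_in_driver = in_driver;

	if (h->hw_tag == 1)
		return;

	if (in_driver + queued < HW_QUEUE_THRESHOLD)
		return;				/* not enough load: sample not valid */

	if (h->samples++ < HW_QUEUE_SAMPLES)
		return;

	h->hw_tag = h->max_in_driver > HW_QUEUE_THRESHOLD;
	h->max_in_driver = 0;
	h->samples = 0;
}

int main(void)
{
	struct hwtag_state h = { -1, 0, 0 };
	int i;

	for (i = 0; i <= HW_QUEUE_SAMPLES; i++)
		update_hw_tag(&h, 8, 4);	/* device keeps 8 requests in flight */
	printf("hw_tag = %d\n", h.hw_tag);	/* 1: looks queueing-capable */
	return 0;
}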
4315 ++
4316 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
4317 ++{
4318 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
4319 ++ struct bfq_data *bfqd = bfqq->bfqd;
4320 ++ bool sync = bfq_bfqq_sync(bfqq);
4321 ++
4322 ++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",
4323 ++ blk_rq_sectors(rq), sync);
4324 ++
4325 ++ bfq_update_hw_tag(bfqd);
4326 ++
4327 ++ BUG_ON(!bfqd->rq_in_driver);
4328 ++ BUG_ON(!bfqq->dispatched);
4329 ++ bfqd->rq_in_driver--;
4330 ++ bfqq->dispatched--;
4331 ++
4332 ++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
4333 ++ bfq_weights_tree_remove(bfqd, &bfqq->entity,
4334 ++ &bfqd->queue_weights_tree);
4335 ++ if (!blk_queue_nonrot(bfqd->queue)) {
4336 ++ BUG_ON(!bfqd->busy_in_flight_queues);
4337 ++ bfqd->busy_in_flight_queues--;
4338 ++ if (bfq_bfqq_constantly_seeky(bfqq)) {
4339 ++ BUG_ON(!bfqd->
4340 ++ const_seeky_busy_in_flight_queues);
4341 ++ bfqd->const_seeky_busy_in_flight_queues--;
4342 ++ }
4343 ++ }
4344 ++ }
4345 ++
4346 ++ if (sync) {
4347 ++ bfqd->sync_flight--;
4348 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
4349 ++ }
4350 ++
4351 ++ /*
4352 ++ * If we are waiting to discover whether the request pattern of the
4353 ++ * task associated with the queue is actually isochronous, and
4354 ++ * both requisites for this condition to hold are satisfied, then
4355 ++ * compute soft_rt_next_start (see the comments to the function
4356 ++ * bfq_bfqq_softrt_next_start()).
4357 ++ */
4358 ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
4359 ++ RB_EMPTY_ROOT(&bfqq->sort_list))
4360 ++ bfqq->soft_rt_next_start =
4361 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
4362 ++
4363 ++ /*
4364 ++ * If this is the in-service queue, check if it needs to be expired,
4365 ++ * or if we want to idle in case it has no pending requests.
4366 ++ */
4367 ++ if (bfqd->in_service_queue == bfqq) {
4368 ++ if (bfq_bfqq_budget_new(bfqq))
4369 ++ bfq_set_budget_timeout(bfqd);
4370 ++
4371 ++ if (bfq_bfqq_must_idle(bfqq)) {
4372 ++ bfq_arm_slice_timer(bfqd);
4373 ++ goto out;
4374 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
4375 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
4376 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
4377 ++ (bfqq->dispatched == 0 ||
4378 ++ !bfq_bfqq_must_not_expire(bfqq)))
4379 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
4380 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
4381 ++ }
4382 ++
4383 ++ if (!bfqd->rq_in_driver)
4384 ++ bfq_schedule_dispatch(bfqd);
4385 ++
4386 ++out:
4387 ++ return;
4388 ++}
4389 ++
4390 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
4391 ++{
4392 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
4393 ++ bfq_clear_bfqq_must_alloc(bfqq);
4394 ++ return ELV_MQUEUE_MUST;
4395 ++ }
4396 ++
4397 ++ return ELV_MQUEUE_MAY;
4398 ++}
4399 ++
4400 ++static int bfq_may_queue(struct request_queue *q, int rw)
4401 ++{
4402 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
4403 ++ struct task_struct *tsk = current;
4404 ++ struct bfq_io_cq *bic;
4405 ++ struct bfq_queue *bfqq;
4406 ++
4407 ++ /*
4408 ++ * Don't force setup of a queue from here, as a call to may_queue
4409 ++ * does not necessarily imply that a request actually will be
4410 ++ * queued. So just lookup a possibly existing queue, or return
4411 ++ * 'may queue' if that fails.
4412 ++ */
4413 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
4414 ++ if (bic == NULL)
4415 ++ return ELV_MQUEUE_MAY;
4416 ++
4417 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
4418 ++ if (bfqq != NULL)
4419 ++ return __bfq_may_queue(bfqq);
4420 ++
4421 ++ return ELV_MQUEUE_MAY;
4422 ++}
4423 ++
4424 ++/*
4425 ++ * Queue lock held here.
4426 ++ */
4427 ++static void bfq_put_request(struct request *rq)
4428 ++{
4429 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
4430 ++
4431 ++ if (bfqq != NULL) {
4432 ++ const int rw = rq_data_dir(rq);
4433 ++
4434 ++ BUG_ON(!bfqq->allocated[rw]);
4435 ++ bfqq->allocated[rw]--;
4436 ++
4437 ++ rq->elv.priv[0] = NULL;
4438 ++ rq->elv.priv[1] = NULL;
4439 ++
4440 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
4441 ++ bfqq, atomic_read(&bfqq->ref));
4442 ++ bfq_put_queue(bfqq);
4443 ++ }
4444 ++}
4445 ++
4446 ++static struct bfq_queue *
4447 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
4448 ++ struct bfq_queue *bfqq)
4449 ++{
4450 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
4451 ++ (long unsigned)bfqq->new_bfqq->pid);
4452 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
4453 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
4454 ++ bfq_put_queue(bfqq);
4455 ++ return bic_to_bfqq(bic, 1);
4456 ++}
4457 ++
4458 ++/*
4459 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
4460 ++ * was the last process referring to said bfqq.
4461 ++ */
4462 ++static struct bfq_queue *
4463 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
4464 ++{
4465 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
4466 ++ if (bfqq_process_refs(bfqq) == 1) {
4467 ++ bfqq->pid = current->pid;
4468 ++ bfq_clear_bfqq_coop(bfqq);
4469 ++ bfq_clear_bfqq_split_coop(bfqq);
4470 ++ return bfqq;
4471 ++ }
4472 ++
4473 ++ bic_set_bfqq(bic, NULL, 1);
4474 ++
4475 ++ bfq_put_cooperator(bfqq);
4476 ++
4477 ++ bfq_put_queue(bfqq);
4478 ++ return NULL;
4479 ++}
4480 ++
4481 ++/*
4482 ++ * Allocate bfq data structures associated with this request.
4483 ++ */
4484 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
4485 ++ struct bio *bio, gfp_t gfp_mask)
4486 ++{
4487 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
4488 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
4489 ++ const int rw = rq_data_dir(rq);
4490 ++ const int is_sync = rq_is_sync(rq);
4491 ++ struct bfq_queue *bfqq;
4492 ++ struct bfq_group *bfqg;
4493 ++ unsigned long flags;
4494 ++
4495 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
4496 ++
4497 ++ bfq_check_ioprio_change(bic);
4498 ++
4499 ++ spin_lock_irqsave(q->queue_lock, flags);
4500 ++
4501 ++ if (bic == NULL)
4502 ++ goto queue_fail;
4503 ++
4504 ++ bfqg = bfq_bic_update_cgroup(bic);
4505 ++
4506 ++new_queue:
4507 ++ bfqq = bic_to_bfqq(bic, is_sync);
4508 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
4509 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
4510 ++ bic_set_bfqq(bic, bfqq, is_sync);
4511 ++ } else {
4512 ++ /*
4513 ++ * If the queue was seeky for too long, break it apart.
4514 ++ */
4515 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4516 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4517 ++ bfqq = bfq_split_bfqq(bic, bfqq);
4518 ++ if (!bfqq)
4519 ++ goto new_queue;
4520 ++ }
4521 ++
4522 ++ /*
4523 ++ * Check to see if this queue is scheduled to merge with
4524 ++ * another closely cooperating queue. The merging of queues
4525 ++ * happens here as it must be done in process context.
4526 ++ * The reference on new_bfqq was taken in merge_bfqqs.
4527 ++ */
4528 ++ if (bfqq->new_bfqq != NULL)
4529 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
4530 ++ }
4531 ++
4532 ++ bfqq->allocated[rw]++;
4533 ++ atomic_inc(&bfqq->ref);
4534 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
4535 ++ atomic_read(&bfqq->ref));
4536 ++
4537 ++ rq->elv.priv[0] = bic;
4538 ++ rq->elv.priv[1] = bfqq;
4539 ++
4540 ++ spin_unlock_irqrestore(q->queue_lock, flags);
4541 ++
4542 ++ return 0;
4543 ++
4544 ++queue_fail:
4545 ++ bfq_schedule_dispatch(bfqd);
4546 ++ spin_unlock_irqrestore(q->queue_lock, flags);
4547 ++
4548 ++ return 1;
4549 ++}
4550 ++
4551 ++static void bfq_kick_queue(struct work_struct *work)
4552 ++{
4553 ++ struct bfq_data *bfqd =
4554 ++ container_of(work, struct bfq_data, unplug_work);
4555 ++ struct request_queue *q = bfqd->queue;
4556 ++
4557 ++ spin_lock_irq(q->queue_lock);
4558 ++ __blk_run_queue(q);
4559 ++ spin_unlock_irq(q->queue_lock);
4560 ++}
4561 ++
4562 ++/*
4563 ++ * Handler of the expiration of the timer running if the in-service queue
4564 ++ * is idling inside its time slice.
4565 ++ */
4566 ++static void bfq_idle_slice_timer(unsigned long data)
4567 ++{
4568 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
4569 ++ struct bfq_queue *bfqq;
4570 ++ unsigned long flags;
4571 ++ enum bfqq_expiration reason;
4572 ++
4573 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
4574 ++
4575 ++ bfqq = bfqd->in_service_queue;
4576 ++ /*
4577 ++ * Theoretical race here: the in-service queue can be NULL or
4578 ++ * different from the queue that was idling if the timer handler
4579 ++	 * spins on the queue_lock, a new request arrives for the
4580 ++	 * current queue, and a full dispatch cycle changes
4581 ++	 * the in-service queue. This is unlikely, but in the worst
4582 ++ * case we just expire a queue too early.
4583 ++ */
4584 ++ if (bfqq != NULL) {
4585 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
4586 ++ if (bfq_bfqq_budget_timeout(bfqq))
4587 ++ /*
4588 ++			 * Here, too, the queue can be safely expired
4589 ++			 * for budget timeout without wasting
4590 ++			 * guarantees.
4591 ++ */
4592 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
4593 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4594 ++ /*
4595 ++ * The queue may not be empty upon timer expiration,
4596 ++ * because we may not disable the timer when the
4597 ++ * first request of the in-service queue arrives
4598 ++ * during disk idling.
4599 ++ */
4600 ++ reason = BFQ_BFQQ_TOO_IDLE;
4601 ++ else
4602 ++ goto schedule_dispatch;
4603 ++
4604 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
4605 ++ }
4606 ++
4607 ++schedule_dispatch:
4608 ++ bfq_schedule_dispatch(bfqd);
4609 ++
4610 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
4611 ++}
4612 ++
4613 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
4614 ++{
4615 ++ del_timer_sync(&bfqd->idle_slice_timer);
4616 ++ cancel_work_sync(&bfqd->unplug_work);
4617 ++}
4618 ++
4619 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4620 ++ struct bfq_queue **bfqq_ptr)
4621 ++{
4622 ++ struct bfq_group *root_group = bfqd->root_group;
4623 ++ struct bfq_queue *bfqq = *bfqq_ptr;
4624 ++
4625 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4626 ++ if (bfqq != NULL) {
4627 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
4628 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4629 ++ bfqq, atomic_read(&bfqq->ref));
4630 ++ bfq_put_queue(bfqq);
4631 ++ *bfqq_ptr = NULL;
4632 ++ }
4633 ++}
4634 ++
4635 ++/*
4636 ++ * Release all the bfqg references to its async queues. If we are
4637 ++ * deallocating the group these queues may still contain requests, so
4638 ++ * we reparent them to the root cgroup (i.e., the only one that will
4639 ++ * exist for sure until all the requests on a device are gone).
4640 ++ */
4641 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4642 ++{
4643 ++ int i, j;
4644 ++
4645 ++ for (i = 0; i < 2; i++)
4646 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
4647 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4648 ++
4649 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4650 ++}
4651 ++
4652 ++static void bfq_exit_queue(struct elevator_queue *e)
4653 ++{
4654 ++ struct bfq_data *bfqd = e->elevator_data;
4655 ++ struct request_queue *q = bfqd->queue;
4656 ++ struct bfq_queue *bfqq, *n;
4657 ++
4658 ++ bfq_shutdown_timer_wq(bfqd);
4659 ++
4660 ++ spin_lock_irq(q->queue_lock);
4661 ++
4662 ++ BUG_ON(bfqd->in_service_queue != NULL);
4663 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4664 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
4665 ++
4666 ++ bfq_disconnect_groups(bfqd);
4667 ++ spin_unlock_irq(q->queue_lock);
4668 ++
4669 ++ bfq_shutdown_timer_wq(bfqd);
4670 ++
4671 ++ synchronize_rcu();
4672 ++
4673 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
4674 ++
4675 ++ bfq_free_root_group(bfqd);
4676 ++ kfree(bfqd);
4677 ++}
4678 ++
4679 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4680 ++{
4681 ++ struct bfq_group *bfqg;
4682 ++ struct bfq_data *bfqd;
4683 ++ struct elevator_queue *eq;
4684 ++
4685 ++ eq = elevator_alloc(q, e);
4686 ++ if (eq == NULL)
4687 ++ return -ENOMEM;
4688 ++
4689 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4690 ++ if (bfqd == NULL) {
4691 ++ kobject_put(&eq->kobj);
4692 ++ return -ENOMEM;
4693 ++ }
4694 ++ eq->elevator_data = bfqd;
4695 ++
4696 ++ /*
4697 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4698 ++ * Grab a permanent reference to it, so that the normal code flow
4699 ++ * will not attempt to free it.
4700 ++ */
4701 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
4702 ++ atomic_inc(&bfqd->oom_bfqq.ref);
4703 ++ bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
4704 ++ bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;
4705 ++ bfqd->oom_bfqq.entity.new_weight =
4706 ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio);
4707 ++ /*
4708 ++ * Trigger weight initialization, according to ioprio, at the
4709 ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
4710 ++ * class won't be changed any more.
4711 ++ */
4712 ++ bfqd->oom_bfqq.entity.ioprio_changed = 1;
4713 ++
4714 ++ bfqd->queue = q;
4715 ++
4716 ++ spin_lock_irq(q->queue_lock);
4717 ++ q->elevator = eq;
4718 ++ spin_unlock_irq(q->queue_lock);
4719 ++
4720 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
4721 ++ if (bfqg == NULL) {
4722 ++ kfree(bfqd);
4723 ++ kobject_put(&eq->kobj);
4724 ++ return -ENOMEM;
4725 ++ }
4726 ++
4727 ++ bfqd->root_group = bfqg;
4728 ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
4729 ++#ifdef CONFIG_CGROUP_BFQIO
4730 ++ bfqd->active_numerous_groups = 0;
4731 ++#endif
4732 ++
4733 ++ init_timer(&bfqd->idle_slice_timer);
4734 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4735 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4736 ++
4737 ++ bfqd->rq_pos_tree = RB_ROOT;
4738 ++ bfqd->queue_weights_tree = RB_ROOT;
4739 ++ bfqd->group_weights_tree = RB_ROOT;
4740 ++
4741 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4742 ++
4743 ++ INIT_LIST_HEAD(&bfqd->active_list);
4744 ++ INIT_LIST_HEAD(&bfqd->idle_list);
4745 ++ INIT_HLIST_HEAD(&bfqd->burst_list);
4746 ++
4747 ++ bfqd->hw_tag = -1;
4748 ++
4749 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
4750 ++
4751 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4752 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4753 ++ bfqd->bfq_back_max = bfq_back_max;
4754 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
4755 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
4756 ++ bfqd->bfq_class_idle_last_service = 0;
4757 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4758 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4759 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4760 ++
4761 ++ bfqd->bfq_coop_thresh = 2;
4762 ++ bfqd->bfq_failed_cooperations = 7000;
4763 ++ bfqd->bfq_requests_within_timer = 120;
4764 ++
4765 ++ bfqd->bfq_large_burst_thresh = 11;
4766 ++ bfqd->bfq_burst_interval = msecs_to_jiffies(500);
4767 ++
4768 ++ bfqd->low_latency = true;
4769 ++
4770 ++ bfqd->bfq_wr_coeff = 20;
4771 ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
4772 ++ bfqd->bfq_wr_max_time = 0;
4773 ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
4774 ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
4775 ++ bfqd->bfq_wr_max_softrt_rate = 7000; /*
4776 ++ * Approximate rate required
4777 ++ * to playback or record a
4778 ++ * high-definition compressed
4779 ++ * video.
4780 ++ */
4781 ++ bfqd->wr_busy_queues = 0;
4782 ++ bfqd->busy_in_flight_queues = 0;
4783 ++ bfqd->const_seeky_busy_in_flight_queues = 0;
4784 ++
4785 ++ /*
4786 ++ * Begin by assuming, optimistically, that the device peak rate is
4787 ++ * equal to the highest reference rate.
4788 ++ */
4789 ++ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
4790 ++ T_fast[blk_queue_nonrot(bfqd->queue)];
4791 ++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];
4792 ++ bfqd->device_speed = BFQ_BFQD_FAST;
4793 ++
4794 ++ return 0;
4795 ++}
4796 ++
4797 ++static void bfq_slab_kill(void)
4798 ++{
4799 ++ if (bfq_pool != NULL)
4800 ++ kmem_cache_destroy(bfq_pool);
4801 ++}
4802 ++
4803 ++static int __init bfq_slab_setup(void)
4804 ++{
4805 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4806 ++ if (bfq_pool == NULL)
4807 ++ return -ENOMEM;
4808 ++ return 0;
4809 ++}
4810 ++
4811 ++static ssize_t bfq_var_show(unsigned int var, char *page)
4812 ++{
4813 ++ return sprintf(page, "%d\n", var);
4814 ++}
4815 ++
4816 ++static ssize_t bfq_var_store(unsigned long *var, const char *page,
4817 ++ size_t count)
4818 ++{
4819 ++ unsigned long new_val;
4820 ++ int ret = kstrtoul(page, 10, &new_val);
4821 ++
4822 ++ if (ret == 0)
4823 ++ *var = new_val;
4824 ++
4825 ++ return count;
4826 ++}
4827 ++
4828 ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
4829 ++{
4830 ++ struct bfq_data *bfqd = e->elevator_data;
4831 ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
4832 ++ jiffies_to_msecs(bfqd->bfq_wr_max_time) :
4833 ++ jiffies_to_msecs(bfq_wr_duration(bfqd)));
4834 ++}
4835 ++
4836 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4837 ++{
4838 ++ struct bfq_queue *bfqq;
4839 ++ struct bfq_data *bfqd = e->elevator_data;
4840 ++ ssize_t num_char = 0;
4841 ++
4842 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4843 ++ bfqd->queued);
4844 ++
4845 ++ spin_lock_irq(bfqd->queue->queue_lock);
4846 ++
4847 ++ num_char += sprintf(page + num_char, "Active:\n");
4848 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4849 ++ num_char += sprintf(page + num_char,
4850 ++ "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",
4851 ++ bfqq->pid,
4852 ++ bfqq->entity.weight,
4853 ++ bfqq->queued[0],
4854 ++ bfqq->queued[1],
4855 ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
4856 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
4857 ++ }
4858 ++
4859 ++ num_char += sprintf(page + num_char, "Idle:\n");
4860 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4861 ++ num_char += sprintf(page + num_char,
4862 ++ "pid%d: weight %hu, dur %d/%u\n",
4863 ++ bfqq->pid,
4864 ++ bfqq->entity.weight,
4865 ++ jiffies_to_msecs(jiffies -
4866 ++ bfqq->last_wr_start_finish),
4867 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
4868 ++ }
4869 ++
4870 ++ spin_unlock_irq(bfqd->queue->queue_lock);
4871 ++
4872 ++ return num_char;
4873 ++}
4874 ++
4875 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4876 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4877 ++{ \
4878 ++ struct bfq_data *bfqd = e->elevator_data; \
4879 ++ unsigned int __data = __VAR; \
4880 ++ if (__CONV) \
4881 ++ __data = jiffies_to_msecs(__data); \
4882 ++ return bfq_var_show(__data, (page)); \
4883 ++}
4884 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4885 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4886 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4887 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4888 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4889 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4890 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show,
4891 ++ bfqd->bfq_max_budget_async_rq, 0);
4892 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4893 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4894 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4895 ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
4896 ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
4897 ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
4898 ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
4899 ++ 1);
4900 ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
4901 ++#undef SHOW_FUNCTION
4902 ++
4903 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4904 ++static ssize_t \
4905 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4906 ++{ \
4907 ++ struct bfq_data *bfqd = e->elevator_data; \
4908 ++ unsigned long uninitialized_var(__data); \
4909 ++ int ret = bfq_var_store(&__data, (page), count); \
4910 ++ if (__data < (MIN)) \
4911 ++ __data = (MIN); \
4912 ++ else if (__data > (MAX)) \
4913 ++ __data = (MAX); \
4914 ++ if (__CONV) \
4915 ++ *(__PTR) = msecs_to_jiffies(__data); \
4916 ++ else \
4917 ++ *(__PTR) = __data; \
4918 ++ return ret; \
4919 ++}
4920 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4921 ++ INT_MAX, 1);
4922 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4923 ++ INT_MAX, 1);
4924 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4925 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4926 ++ INT_MAX, 0);
4927 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4928 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4929 ++ 1, INT_MAX, 0);
4930 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4931 ++ INT_MAX, 1);
4932 ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
4933 ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
4934 ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
4935 ++ 1);
4936 ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
4937 ++ INT_MAX, 1);
4938 ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
4939 ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
4940 ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
4941 ++ INT_MAX, 0);
4942 ++#undef STORE_FUNCTION
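++
++/*
++ * A rough, illustrative sketch of what one expansion of the macro above
++ * produces (the uninitialized_var() wrapper and the no-op lower-bound
++ * check for MIN == 0 are dropped for readability). For
++ * STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1)
++ * the generated function is roughly:
++ *
++ *	static ssize_t bfq_slice_idle_store(struct elevator_queue *e,
++ *					    const char *page, size_t count)
++ *	{
++ *		struct bfq_data *bfqd = e->elevator_data;
++ *		unsigned long __data;
++ *		int ret = bfq_var_store(&__data, page, count);
++ *
++ *		if (__data > INT_MAX)
++ *			__data = INT_MAX;
++ *		bfqd->bfq_slice_idle = msecs_to_jiffies(__data);
++ *		return ret;
++ *	}
++ */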
4943 ++
4944 ++/* do nothing for the moment */
4945 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
4946 ++ const char *page, size_t count)
4947 ++{
4948 ++ return count;
4949 ++}
4950 ++
4951 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4952 ++{
4953 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4954 ++
4955 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4956 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4957 ++ else
4958 ++ return bfq_default_max_budget;
4959 ++}
4960 ++
4961 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4962 ++ const char *page, size_t count)
4963 ++{
4964 ++ struct bfq_data *bfqd = e->elevator_data;
4965 ++ unsigned long uninitialized_var(__data);
4966 ++ int ret = bfq_var_store(&__data, (page), count);
4967 ++
4968 ++ if (__data == 0)
4969 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4970 ++ else {
4971 ++ if (__data > INT_MAX)
4972 ++ __data = INT_MAX;
4973 ++ bfqd->bfq_max_budget = __data;
4974 ++ }
4975 ++
4976 ++ bfqd->bfq_user_max_budget = __data;
4977 ++
4978 ++ return ret;
4979 ++}
4980 ++
4981 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4982 ++ const char *page, size_t count)
4983 ++{
4984 ++ struct bfq_data *bfqd = e->elevator_data;
4985 ++ unsigned long uninitialized_var(__data);
4986 ++ int ret = bfq_var_store(&__data, (page), count);
4987 ++
4988 ++ if (__data < 1)
4989 ++ __data = 1;
4990 ++ else if (__data > INT_MAX)
4991 ++ __data = INT_MAX;
4992 ++
4993 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4994 ++ if (bfqd->bfq_user_max_budget == 0)
4995 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4996 ++
4997 ++ return ret;
4998 ++}
4999 ++
5000 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
5001 ++ const char *page, size_t count)
5002 ++{
5003 ++ struct bfq_data *bfqd = e->elevator_data;
5004 ++ unsigned long uninitialized_var(__data);
5005 ++ int ret = bfq_var_store(&__data, (page), count);
5006 ++
5007 ++ if (__data > 1)
5008 ++ __data = 1;
5009 ++ if (__data == 0 && bfqd->low_latency != 0)
5010 ++ bfq_end_wr(bfqd);
5011 ++ bfqd->low_latency = __data;
5012 ++
5013 ++ return ret;
5014 ++}
5015 ++
5016 ++#define BFQ_ATTR(name) \
5017 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
5018 ++
5019 ++static struct elv_fs_entry bfq_attrs[] = {
5020 ++ BFQ_ATTR(fifo_expire_sync),
5021 ++ BFQ_ATTR(fifo_expire_async),
5022 ++ BFQ_ATTR(back_seek_max),
5023 ++ BFQ_ATTR(back_seek_penalty),
5024 ++ BFQ_ATTR(slice_idle),
5025 ++ BFQ_ATTR(max_budget),
5026 ++ BFQ_ATTR(max_budget_async_rq),
5027 ++ BFQ_ATTR(timeout_sync),
5028 ++ BFQ_ATTR(timeout_async),
5029 ++ BFQ_ATTR(low_latency),
5030 ++ BFQ_ATTR(wr_coeff),
5031 ++ BFQ_ATTR(wr_max_time),
5032 ++ BFQ_ATTR(wr_rt_max_time),
5033 ++ BFQ_ATTR(wr_min_idle_time),
5034 ++ BFQ_ATTR(wr_min_inter_arr_async),
5035 ++ BFQ_ATTR(wr_max_softrt_rate),
5036 ++ BFQ_ATTR(weights),
5037 ++ __ATTR_NULL
5038 ++};
5039 ++
5040 ++static struct elevator_type iosched_bfq = {
5041 ++ .ops = {
5042 ++ .elevator_merge_fn = bfq_merge,
5043 ++ .elevator_merged_fn = bfq_merged_request,
5044 ++ .elevator_merge_req_fn = bfq_merged_requests,
5045 ++ .elevator_allow_merge_fn = bfq_allow_merge,
5046 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
5047 ++ .elevator_add_req_fn = bfq_insert_request,
5048 ++ .elevator_activate_req_fn = bfq_activate_request,
5049 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
5050 ++ .elevator_completed_req_fn = bfq_completed_request,
5051 ++ .elevator_former_req_fn = elv_rb_former_request,
5052 ++ .elevator_latter_req_fn = elv_rb_latter_request,
5053 ++ .elevator_init_icq_fn = bfq_init_icq,
5054 ++ .elevator_exit_icq_fn = bfq_exit_icq,
5055 ++ .elevator_set_req_fn = bfq_set_request,
5056 ++ .elevator_put_req_fn = bfq_put_request,
5057 ++ .elevator_may_queue_fn = bfq_may_queue,
5058 ++ .elevator_init_fn = bfq_init_queue,
5059 ++ .elevator_exit_fn = bfq_exit_queue,
5060 ++ },
5061 ++ .icq_size = sizeof(struct bfq_io_cq),
5062 ++ .icq_align = __alignof__(struct bfq_io_cq),
5063 ++ .elevator_attrs = bfq_attrs,
5064 ++ .elevator_name = "bfq",
5065 ++ .elevator_owner = THIS_MODULE,
5066 ++};
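++
++/*
++ * Usage sketch (assuming CONFIG_IOSCHED_BFQ is enabled and the usual
++ * legacy single-queue sysfs layout): once registered, the elevator above
++ * can be selected per device, and the tunables exported through bfq_attrs
++ * appear under the iosched directory, e.g.
++ *
++ *	echo bfq > /sys/block/sda/queue/scheduler
++ *	cat /sys/block/sda/queue/iosched/slice_idle
++ *	echo 0 > /sys/block/sda/queue/iosched/low_latency
++ *
++ * Reads go through the SHOW_FUNCTION() helpers and writes through the
++ * STORE_FUNCTION() helpers defined earlier in this file.
++ */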
5067 ++
5068 ++static int __init bfq_init(void)
5069 ++{
5070 ++ /*
5071 ++ * Can be 0 on HZ < 1000 setups.
5072 ++ */
5073 ++ if (bfq_slice_idle == 0)
5074 ++ bfq_slice_idle = 1;
5075 ++
5076 ++ if (bfq_timeout_async == 0)
5077 ++ bfq_timeout_async = 1;
5078 ++
5079 ++ if (bfq_slab_setup())
5080 ++ return -ENOMEM;
5081 ++
5082 ++ /*
5083 ++ * Times to load large popular applications for the typical systems
5084 ++ * installed on the reference devices (see the comments before the
5085 ++ * definitions of the two arrays).
5086 ++ */
5087 ++ T_slow[0] = msecs_to_jiffies(2600);
5088 ++ T_slow[1] = msecs_to_jiffies(1000);
5089 ++ T_fast[0] = msecs_to_jiffies(5500);
5090 ++ T_fast[1] = msecs_to_jiffies(2000);
5091 ++
5092 ++ /*
5093 ++ * Thresholds that determine the switch between speed classes (see
5094 ++ * the comments before the definition of the array).
5095 ++ */
5096 ++ device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
5097 ++ device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
5098 ++
5099 ++ elv_register(&iosched_bfq);
5100 ++	pr_info("BFQ I/O-scheduler: v7r8\n");
5101 ++
5102 ++ return 0;
5103 ++}
5104 ++
5105 ++static void __exit bfq_exit(void)
5106 ++{
5107 ++ elv_unregister(&iosched_bfq);
5108 ++ bfq_slab_kill();
5109 ++}
5110 ++
5111 ++module_init(bfq_init);
5112 ++module_exit(bfq_exit);
5113 ++
5114 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
5115 ++MODULE_LICENSE("GPL");
5116 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
5117 +new file mode 100644
5118 +index 0000000..c343099
5119 +--- /dev/null
5120 ++++ b/block/bfq-sched.c
5121 +@@ -0,0 +1,1208 @@
5122 ++/*
5123 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
5124 ++ *
5125 ++ * Based on ideas and code from CFQ:
5126 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5127 ++ *
5128 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5129 ++ * Paolo Valente <paolo.valente@×××××××.it>
5130 ++ *
5131 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5132 ++ */
5133 ++
5134 ++#ifdef CONFIG_CGROUP_BFQIO
5135 ++#define for_each_entity(entity) \
5136 ++ for (; entity != NULL; entity = entity->parent)
5137 ++
5138 ++#define for_each_entity_safe(entity, parent) \
5139 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
5140 ++
5141 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5142 ++ int extract,
5143 ++ struct bfq_data *bfqd);
5144 ++
5145 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
5146 ++{
5147 ++ struct bfq_entity *bfqg_entity;
5148 ++ struct bfq_group *bfqg;
5149 ++ struct bfq_sched_data *group_sd;
5150 ++
5151 ++ BUG_ON(next_in_service == NULL);
5152 ++
5153 ++ group_sd = next_in_service->sched_data;
5154 ++
5155 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
5156 ++ /*
5157 ++ * bfq_group's my_entity field is not NULL only if the group
5158 ++ * is not the root group. We must not touch the root entity
5159 ++ * as it must never become an in-service entity.
5160 ++ */
5161 ++ bfqg_entity = bfqg->my_entity;
5162 ++ if (bfqg_entity != NULL)
5163 ++ bfqg_entity->budget = next_in_service->budget;
5164 ++}
5165 ++
5166 ++static int bfq_update_next_in_service(struct bfq_sched_data *sd)
5167 ++{
5168 ++ struct bfq_entity *next_in_service;
5169 ++
5170 ++ if (sd->in_service_entity != NULL)
5171 ++ /* will update/requeue at the end of service */
5172 ++ return 0;
5173 ++
5174 ++ /*
5175 ++ * NOTE: this can be improved in many ways, such as returning
5176 ++ * 1 (and thus propagating upwards the update) only when the
5177 ++ * budget changes, or caching the bfqq that will be scheduled
5178 ++	 * next from this subtree. For now we worry more about
5179 ++ * correctness than about performance...
5180 ++ */
5181 ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
5182 ++ sd->next_in_service = next_in_service;
5183 ++
5184 ++ if (next_in_service != NULL)
5185 ++ bfq_update_budget(next_in_service);
5186 ++
5187 ++ return 1;
5188 ++}
5189 ++
5190 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
5191 ++ struct bfq_entity *entity)
5192 ++{
5193 ++ BUG_ON(sd->next_in_service != entity);
5194 ++}
5195 ++#else
5196 ++#define for_each_entity(entity) \
5197 ++ for (; entity != NULL; entity = NULL)
5198 ++
5199 ++#define for_each_entity_safe(entity, parent) \
5200 ++ for (parent = NULL; entity != NULL; entity = parent)
5201 ++
5202 ++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
5203 ++{
5204 ++ return 0;
5205 ++}
5206 ++
5207 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
5208 ++ struct bfq_entity *entity)
5209 ++{
5210 ++}
5211 ++
5212 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
5213 ++{
5214 ++}
5215 ++#endif
5216 ++
5217 ++/*
5218 ++ * Shift for timestamp calculations. This actually limits the maximum
5219 ++ * service allowed in one timestamp delta (small shift values increase it),
5220 ++ * the maximum total weight that can be used for the queues in the system
5221 ++ * (big shift values increase it), and the period of virtual time
5222 ++ * wraparounds.
5223 ++ */
5224 ++#define WFQ_SERVICE_SHIFT 22
5225 ++
5226 ++/**
5227 ++ * bfq_gt - compare two timestamps.
5228 ++ * @a: first ts.
5229 ++ * @b: second ts.
5230 ++ *
5231 ++ * Return @a > @b, dealing with wrapping correctly.
5232 ++ */
5233 ++static inline int bfq_gt(u64 a, u64 b)
5234 ++{
5235 ++ return (s64)(a - b) > 0;
5236 ++}
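++
++/*
++ * Quick illustrative check of the wraparound handling (made-up values):
++ * with a = 5 and b = ULLONG_MAX - 2, the unsigned difference a - b is 8,
++ * which is positive when reinterpreted as s64, so bfq_gt(a, b) correctly
++ * reports that a comes after b in the wrapped timeline, even though a < b
++ * as a plain u64. A plain "a > b" comparison would get this wrong as soon
++ * as the virtual time wraps around.
++ */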
5237 ++
5238 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
5239 ++{
5240 ++ struct bfq_queue *bfqq = NULL;
5241 ++
5242 ++ BUG_ON(entity == NULL);
5243 ++
5244 ++ if (entity->my_sched_data == NULL)
5245 ++ bfqq = container_of(entity, struct bfq_queue, entity);
5246 ++
5247 ++ return bfqq;
5248 ++}
5249 ++
5250 ++
5251 ++/**
5252 ++ * bfq_delta - map service into the virtual time domain.
5253 ++ * @service: amount of service.
5254 ++ * @weight: scale factor (weight of an entity or weight sum).
5255 ++ */
5256 ++static inline u64 bfq_delta(unsigned long service,
5257 ++ unsigned long weight)
5258 ++{
5259 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
5260 ++
5261 ++ do_div(d, weight);
5262 ++ return d;
5263 ++}
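++
++/*
++ * Numeric sanity check (illustrative values only): with
++ * WFQ_SERVICE_SHIFT == 22, bfq_delta(1000, 10) = (1000 << 22) / 10 =
++ * 419430400, while bfq_delta(1000, 20) = 209715200. Doubling the weight
++ * halves the virtual-time advance charged for the same amount of service,
++ * which is how entities with larger weights end up receiving a
++ * proportionally larger share of the device throughput.
++ */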
5264 ++
5265 ++/**
5266 ++ * bfq_calc_finish - assign the finish time to an entity.
5267 ++ * @entity: the entity to act upon.
5268 ++ * @service: the service to be charged to the entity.
5269 ++ */
5270 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
5271 ++ unsigned long service)
5272 ++{
5273 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5274 ++
5275 ++ BUG_ON(entity->weight == 0);
5276 ++
5277 ++ entity->finish = entity->start +
5278 ++ bfq_delta(service, entity->weight);
5279 ++
5280 ++ if (bfqq != NULL) {
5281 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
5282 ++ "calc_finish: serv %lu, w %d",
5283 ++ service, entity->weight);
5284 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
5285 ++ "calc_finish: start %llu, finish %llu, delta %llu",
5286 ++ entity->start, entity->finish,
5287 ++ bfq_delta(service, entity->weight));
5288 ++ }
5289 ++}
5290 ++
5291 ++/**
5292 ++ * bfq_entity_of - get an entity from a node.
5293 ++ * @node: the node field of the entity.
5294 ++ *
5295 ++ * Convert a node pointer to the corresponding entity. This is used only
5296 ++ * to simplify the logic of some functions and not as the generic
5297 ++ * conversion mechanism because, e.g., in the tree walking functions,
5298 ++ * the check for a %NULL value would be redundant.
5299 ++ */
5300 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
5301 ++{
5302 ++ struct bfq_entity *entity = NULL;
5303 ++
5304 ++ if (node != NULL)
5305 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
5306 ++
5307 ++ return entity;
5308 ++}
5309 ++
5310 ++/**
5311 ++ * bfq_extract - remove an entity from a tree.
5312 ++ * @root: the tree root.
5313 ++ * @entity: the entity to remove.
5314 ++ */
5315 ++static inline void bfq_extract(struct rb_root *root,
5316 ++ struct bfq_entity *entity)
5317 ++{
5318 ++ BUG_ON(entity->tree != root);
5319 ++
5320 ++ entity->tree = NULL;
5321 ++ rb_erase(&entity->rb_node, root);
5322 ++}
5323 ++
5324 ++/**
5325 ++ * bfq_idle_extract - extract an entity from the idle tree.
5326 ++ * @st: the service tree of the owning @entity.
5327 ++ * @entity: the entity being removed.
5328 ++ */
5329 ++static void bfq_idle_extract(struct bfq_service_tree *st,
5330 ++ struct bfq_entity *entity)
5331 ++{
5332 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5333 ++ struct rb_node *next;
5334 ++
5335 ++ BUG_ON(entity->tree != &st->idle);
5336 ++
5337 ++ if (entity == st->first_idle) {
5338 ++ next = rb_next(&entity->rb_node);
5339 ++ st->first_idle = bfq_entity_of(next);
5340 ++ }
5341 ++
5342 ++ if (entity == st->last_idle) {
5343 ++ next = rb_prev(&entity->rb_node);
5344 ++ st->last_idle = bfq_entity_of(next);
5345 ++ }
5346 ++
5347 ++ bfq_extract(&st->idle, entity);
5348 ++
5349 ++ if (bfqq != NULL)
5350 ++ list_del(&bfqq->bfqq_list);
5351 ++}
5352 ++
5353 ++/**
5354 ++ * bfq_insert - generic tree insertion.
5355 ++ * @root: tree root.
5356 ++ * @entity: entity to insert.
5357 ++ *
5358 ++ * This is used for the idle and the active tree, since they are both
5359 ++ * ordered by finish time.
5360 ++ */
5361 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
5362 ++{
5363 ++ struct bfq_entity *entry;
5364 ++ struct rb_node **node = &root->rb_node;
5365 ++ struct rb_node *parent = NULL;
5366 ++
5367 ++ BUG_ON(entity->tree != NULL);
5368 ++
5369 ++ while (*node != NULL) {
5370 ++ parent = *node;
5371 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
5372 ++
5373 ++ if (bfq_gt(entry->finish, entity->finish))
5374 ++ node = &parent->rb_left;
5375 ++ else
5376 ++ node = &parent->rb_right;
5377 ++ }
5378 ++
5379 ++ rb_link_node(&entity->rb_node, parent, node);
5380 ++ rb_insert_color(&entity->rb_node, root);
5381 ++
5382 ++ entity->tree = root;
5383 ++}
5384 ++
5385 ++/**
5386 ++ * bfq_update_min - update the min_start field of a entity.
5387 ++ * @entity: the entity to update.
5388 ++ * @node: one of its children.
5389 ++ *
5390 ++ * This function is called when @entity may store an invalid value for
5391 ++ * min_start due to updates to the active tree. The function assumes
5392 ++ * that the subtree rooted at @node (which may be its left or its right
5393 ++ * child) has a valid min_start value.
5394 ++ */
5395 ++static inline void bfq_update_min(struct bfq_entity *entity,
5396 ++ struct rb_node *node)
5397 ++{
5398 ++ struct bfq_entity *child;
5399 ++
5400 ++ if (node != NULL) {
5401 ++ child = rb_entry(node, struct bfq_entity, rb_node);
5402 ++ if (bfq_gt(entity->min_start, child->min_start))
5403 ++ entity->min_start = child->min_start;
5404 ++ }
5405 ++}
5406 ++
5407 ++/**
5408 ++ * bfq_update_active_node - recalculate min_start.
5409 ++ * @node: the node to update.
5410 ++ *
5411 ++ * @node may have changed position or one of its children may have moved;
5412 ++ * this function updates its min_start value. The left and right subtrees
5413 ++ * are assumed to hold a correct min_start value.
5414 ++ */
5415 ++static inline void bfq_update_active_node(struct rb_node *node)
5416 ++{
5417 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
5418 ++
5419 ++ entity->min_start = entity->start;
5420 ++ bfq_update_min(entity, node->rb_right);
5421 ++ bfq_update_min(entity, node->rb_left);
5422 ++}
5423 ++
5424 ++/**
5425 ++ * bfq_update_active_tree - update min_start for the whole active tree.
5426 ++ * @node: the starting node.
5427 ++ *
5428 ++ * @node must be the deepest modified node after an update. This function
5429 ++ * updates its min_start using the values held by its children, assuming
5430 ++ * that they did not change, and then updates all the nodes that may have
5431 ++ * changed in the path to the root. The only nodes that may have changed
5432 ++ * are the ones in the path or their siblings.
5433 ++ */
5434 ++static void bfq_update_active_tree(struct rb_node *node)
5435 ++{
5436 ++ struct rb_node *parent;
5437 ++
5438 ++up:
5439 ++ bfq_update_active_node(node);
5440 ++
5441 ++ parent = rb_parent(node);
5442 ++ if (parent == NULL)
5443 ++ return;
5444 ++
5445 ++ if (node == parent->rb_left && parent->rb_right != NULL)
5446 ++ bfq_update_active_node(parent->rb_right);
5447 ++ else if (parent->rb_left != NULL)
5448 ++ bfq_update_active_node(parent->rb_left);
5449 ++
5450 ++ node = parent;
5451 ++ goto up;
5452 ++}
5453 ++
5454 ++static void bfq_weights_tree_add(struct bfq_data *bfqd,
5455 ++ struct bfq_entity *entity,
5456 ++ struct rb_root *root);
5457 ++
5458 ++static void bfq_weights_tree_remove(struct bfq_data *bfqd,
5459 ++ struct bfq_entity *entity,
5460 ++ struct rb_root *root);
5461 ++
5462 ++
5463 ++/**
5464 ++ * bfq_active_insert - insert an entity in the active tree of its
5465 ++ * group/device.
5466 ++ * @st: the service tree of the entity.
5467 ++ * @entity: the entity being inserted.
5468 ++ *
5469 ++ * The active tree is ordered by finish time, but an extra key is kept
5470 ++ * in each node, containing the minimum value for the start times of
5471 ++ * its children (and the node itself), so it's possible to search for
5472 ++ * the eligible node with the lowest finish time in logarithmic time.
5473 ++ */
5474 ++static void bfq_active_insert(struct bfq_service_tree *st,
5475 ++ struct bfq_entity *entity)
5476 ++{
5477 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5478 ++ struct rb_node *node = &entity->rb_node;
5479 ++#ifdef CONFIG_CGROUP_BFQIO
5480 ++ struct bfq_sched_data *sd = NULL;
5481 ++ struct bfq_group *bfqg = NULL;
5482 ++ struct bfq_data *bfqd = NULL;
5483 ++#endif
5484 ++
5485 ++ bfq_insert(&st->active, entity);
5486 ++
5487 ++ if (node->rb_left != NULL)
5488 ++ node = node->rb_left;
5489 ++ else if (node->rb_right != NULL)
5490 ++ node = node->rb_right;
5491 ++
5492 ++ bfq_update_active_tree(node);
5493 ++
5494 ++#ifdef CONFIG_CGROUP_BFQIO
5495 ++ sd = entity->sched_data;
5496 ++ bfqg = container_of(sd, struct bfq_group, sched_data);
5497 ++ BUG_ON(!bfqg);
5498 ++ bfqd = (struct bfq_data *)bfqg->bfqd;
5499 ++#endif
5500 ++ if (bfqq != NULL)
5501 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
5502 ++#ifdef CONFIG_CGROUP_BFQIO
5503 ++ else { /* bfq_group */
5504 ++ BUG_ON(!bfqd);
5505 ++ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
5506 ++ }
5507 ++ if (bfqg != bfqd->root_group) {
5508 ++ BUG_ON(!bfqg);
5509 ++ BUG_ON(!bfqd);
5510 ++ bfqg->active_entities++;
5511 ++ if (bfqg->active_entities == 2)
5512 ++ bfqd->active_numerous_groups++;
5513 ++ }
5514 ++#endif
5515 ++}
5516 ++
5517 ++/**
5518 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
5519 ++ * @ioprio: the ioprio value to convert.
5520 ++ */
5521 ++static inline unsigned short bfq_ioprio_to_weight(int ioprio)
5522 ++{
5523 ++ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
5524 ++ return IOPRIO_BE_NR - ioprio;
5525 ++}
5526 ++
5527 ++/**
5528 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
5529 ++ * @weight: the weight value to convert.
5530 ++ *
5531 ++ * To preserve as much as possible the old only-ioprio user interface,
5532 ++ * 0 is used as an escape ioprio value for weights (numerically) equal to
5533 ++ * or larger than IOPRIO_BE_NR.
5534 ++ */
5535 ++static inline unsigned short bfq_weight_to_ioprio(int weight)
5536 ++{
5537 ++ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
5538 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
5539 ++}
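++
++/*
++ * Example of the two mappings above, assuming the mainline value
++ * IOPRIO_BE_NR == 8: ioprio 0 maps to weight 8, ioprio 4 to weight 4 and
++ * ioprio 7 to weight 1, i.e. numerically lower (higher-priority) ioprios
++ * get larger weights. In the opposite direction, any weight >= 8 collapses
++ * to the escape ioprio 0, e.g. bfq_weight_to_ioprio(100) == 0.
++ */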
5540 ++
5541 ++static inline void bfq_get_entity(struct bfq_entity *entity)
5542 ++{
5543 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5544 ++
5545 ++ if (bfqq != NULL) {
5546 ++ atomic_inc(&bfqq->ref);
5547 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
5548 ++ bfqq, atomic_read(&bfqq->ref));
5549 ++ }
5550 ++}
5551 ++
5552 ++/**
5553 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
5554 ++ * @node: the node being removed.
5555 ++ *
5556 ++ * Do the first step of an extraction in an rb tree, looking for the
5557 ++ * node that will replace @node, and returning the deepest node that
5558 ++ * the following modifications to the tree can touch. If @node is the
5559 ++ * last node in the tree return %NULL.
5560 ++ */
5561 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
5562 ++{
5563 ++ struct rb_node *deepest;
5564 ++
5565 ++ if (node->rb_right == NULL && node->rb_left == NULL)
5566 ++ deepest = rb_parent(node);
5567 ++ else if (node->rb_right == NULL)
5568 ++ deepest = node->rb_left;
5569 ++ else if (node->rb_left == NULL)
5570 ++ deepest = node->rb_right;
5571 ++ else {
5572 ++ deepest = rb_next(node);
5573 ++ if (deepest->rb_right != NULL)
5574 ++ deepest = deepest->rb_right;
5575 ++ else if (rb_parent(deepest) != node)
5576 ++ deepest = rb_parent(deepest);
5577 ++ }
5578 ++
5579 ++ return deepest;
5580 ++}
5581 ++
5582 ++/**
5583 ++ * bfq_active_extract - remove an entity from the active tree.
5584 ++ * @st: the service_tree containing the tree.
5585 ++ * @entity: the entity being removed.
5586 ++ */
5587 ++static void bfq_active_extract(struct bfq_service_tree *st,
5588 ++ struct bfq_entity *entity)
5589 ++{
5590 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5591 ++ struct rb_node *node;
5592 ++#ifdef CONFIG_CGROUP_BFQIO
5593 ++ struct bfq_sched_data *sd = NULL;
5594 ++ struct bfq_group *bfqg = NULL;
5595 ++ struct bfq_data *bfqd = NULL;
5596 ++#endif
5597 ++
5598 ++ node = bfq_find_deepest(&entity->rb_node);
5599 ++ bfq_extract(&st->active, entity);
5600 ++
5601 ++ if (node != NULL)
5602 ++ bfq_update_active_tree(node);
5603 ++
5604 ++#ifdef CONFIG_CGROUP_BFQIO
5605 ++ sd = entity->sched_data;
5606 ++ bfqg = container_of(sd, struct bfq_group, sched_data);
5607 ++ BUG_ON(!bfqg);
5608 ++ bfqd = (struct bfq_data *)bfqg->bfqd;
5609 ++#endif
5610 ++ if (bfqq != NULL)
5611 ++ list_del(&bfqq->bfqq_list);
5612 ++#ifdef CONFIG_CGROUP_BFQIO
5613 ++ else { /* bfq_group */
5614 ++ BUG_ON(!bfqd);
5615 ++ bfq_weights_tree_remove(bfqd, entity,
5616 ++ &bfqd->group_weights_tree);
5617 ++ }
5618 ++ if (bfqg != bfqd->root_group) {
5619 ++ BUG_ON(!bfqg);
5620 ++ BUG_ON(!bfqd);
5621 ++ BUG_ON(!bfqg->active_entities);
5622 ++ bfqg->active_entities--;
5623 ++ if (bfqg->active_entities == 1) {
5624 ++ BUG_ON(!bfqd->active_numerous_groups);
5625 ++ bfqd->active_numerous_groups--;
5626 ++ }
5627 ++ }
5628 ++#endif
5629 ++}
5630 ++
5631 ++/**
5632 ++ * bfq_idle_insert - insert an entity into the idle tree.
5633 ++ * @st: the service tree containing the tree.
5634 ++ * @entity: the entity to insert.
5635 ++ */
5636 ++static void bfq_idle_insert(struct bfq_service_tree *st,
5637 ++ struct bfq_entity *entity)
5638 ++{
5639 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5640 ++ struct bfq_entity *first_idle = st->first_idle;
5641 ++ struct bfq_entity *last_idle = st->last_idle;
5642 ++
5643 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
5644 ++ st->first_idle = entity;
5645 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
5646 ++ st->last_idle = entity;
5647 ++
5648 ++ bfq_insert(&st->idle, entity);
5649 ++
5650 ++ if (bfqq != NULL)
5651 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
5652 ++}
5653 ++
5654 ++/**
5655 ++ * bfq_forget_entity - remove an entity from the wfq trees.
5656 ++ * @st: the service tree.
5657 ++ * @entity: the entity being removed.
5658 ++ *
5659 ++ * Update the device status and forget everything about @entity, releasing
5660 ++ * the device's reference to it if it is a queue. Entities belonging to
5661 ++ * groups are not refcounted.
5662 ++ */
5663 ++static void bfq_forget_entity(struct bfq_service_tree *st,
5664 ++ struct bfq_entity *entity)
5665 ++{
5666 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5667 ++ struct bfq_sched_data *sd;
5668 ++
5669 ++ BUG_ON(!entity->on_st);
5670 ++
5671 ++ entity->on_st = 0;
5672 ++ st->wsum -= entity->weight;
5673 ++ if (bfqq != NULL) {
5674 ++ sd = entity->sched_data;
5675 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
5676 ++ bfqq, atomic_read(&bfqq->ref));
5677 ++ bfq_put_queue(bfqq);
5678 ++ }
5679 ++}
5680 ++
5681 ++/**
5682 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
5683 ++ * @st: service tree for the entity.
5684 ++ * @entity: the entity being released.
5685 ++ */
5686 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
5687 ++ struct bfq_entity *entity)
5688 ++{
5689 ++ bfq_idle_extract(st, entity);
5690 ++ bfq_forget_entity(st, entity);
5691 ++}
5692 ++
5693 ++/**
5694 ++ * bfq_forget_idle - update the idle tree if necessary.
5695 ++ * @st: the service tree to act upon.
5696 ++ *
5697 ++ * To preserve the global O(log N) complexity we only remove one entry here;
5698 ++ * as the idle tree will not grow indefinitely this can be done safely.
5699 ++ */
5700 ++static void bfq_forget_idle(struct bfq_service_tree *st)
5701 ++{
5702 ++ struct bfq_entity *first_idle = st->first_idle;
5703 ++ struct bfq_entity *last_idle = st->last_idle;
5704 ++
5705 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
5706 ++ !bfq_gt(last_idle->finish, st->vtime)) {
5707 ++ /*
5708 ++ * Forget the whole idle tree, increasing the vtime past
5709 ++ * the last finish time of idle entities.
5710 ++ */
5711 ++ st->vtime = last_idle->finish;
5712 ++ }
5713 ++
5714 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
5715 ++ bfq_put_idle_entity(st, first_idle);
5716 ++}
5717 ++
5718 ++static struct bfq_service_tree *
5719 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
5720 ++ struct bfq_entity *entity)
5721 ++{
5722 ++ struct bfq_service_tree *new_st = old_st;
5723 ++
5724 ++ if (entity->ioprio_changed) {
5725 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5726 ++ unsigned short prev_weight, new_weight;
5727 ++ struct bfq_data *bfqd = NULL;
5728 ++ struct rb_root *root;
5729 ++#ifdef CONFIG_CGROUP_BFQIO
5730 ++ struct bfq_sched_data *sd;
5731 ++ struct bfq_group *bfqg;
5732 ++#endif
5733 ++
5734 ++ if (bfqq != NULL)
5735 ++ bfqd = bfqq->bfqd;
5736 ++#ifdef CONFIG_CGROUP_BFQIO
5737 ++ else {
5738 ++ sd = entity->my_sched_data;
5739 ++ bfqg = container_of(sd, struct bfq_group, sched_data);
5740 ++ BUG_ON(!bfqg);
5741 ++ bfqd = (struct bfq_data *)bfqg->bfqd;
5742 ++ BUG_ON(!bfqd);
5743 ++ }
5744 ++#endif
5745 ++
5746 ++ BUG_ON(old_st->wsum < entity->weight);
5747 ++ old_st->wsum -= entity->weight;
5748 ++
5749 ++ if (entity->new_weight != entity->orig_weight) {
5750 ++ if (entity->new_weight < BFQ_MIN_WEIGHT ||
5751 ++ entity->new_weight > BFQ_MAX_WEIGHT) {
5752 ++ printk(KERN_CRIT "update_weight_prio: "
5753 ++ "new_weight %d\n",
5754 ++ entity->new_weight);
5755 ++ BUG();
5756 ++ }
5757 ++ entity->orig_weight = entity->new_weight;
5758 ++ entity->ioprio =
5759 ++ bfq_weight_to_ioprio(entity->orig_weight);
5760 ++ }
5761 ++
5762 ++ entity->ioprio_class = entity->new_ioprio_class;
5763 ++ entity->ioprio_changed = 0;
5764 ++
5765 ++ /*
5766 ++ * NOTE: here we may be changing the weight too early,
5767 ++		 * which will cause unfairness. The correct approach
5768 ++ * would have required additional complexity to defer
5769 ++ * weight changes to the proper time instants (i.e.,
5770 ++ * when entity->finish <= old_st->vtime).
5771 ++ */
5772 ++ new_st = bfq_entity_service_tree(entity);
5773 ++
5774 ++ prev_weight = entity->weight;
5775 ++ new_weight = entity->orig_weight *
5776 ++ (bfqq != NULL ? bfqq->wr_coeff : 1);
5777 ++ /*
5778 ++ * If the weight of the entity changes, remove the entity
5779 ++ * from its old weight counter (if there is a counter
5780 ++ * associated with the entity), and add it to the counter
5781 ++ * associated with its new weight.
5782 ++ */
5783 ++ if (prev_weight != new_weight) {
5784 ++ root = bfqq ? &bfqd->queue_weights_tree :
5785 ++ &bfqd->group_weights_tree;
5786 ++ bfq_weights_tree_remove(bfqd, entity, root);
5787 ++ }
5788 ++ entity->weight = new_weight;
5789 ++ /*
5790 ++ * Add the entity to its weights tree only if it is
5791 ++ * not associated with a weight-raised queue.
5792 ++ */
5793 ++ if (prev_weight != new_weight &&
5794 ++ (bfqq ? bfqq->wr_coeff == 1 : 1))
5795 ++ /* If we get here, root has been initialized. */
5796 ++ bfq_weights_tree_add(bfqd, entity, root);
5797 ++
5798 ++ new_st->wsum += entity->weight;
5799 ++
5800 ++ if (new_st != old_st)
5801 ++ entity->start = new_st->vtime;
5802 ++ }
5803 ++
5804 ++ return new_st;
5805 ++}
5806 ++
5807 ++/**
5808 ++ * bfq_bfqq_served - update the scheduler status after selection for
5809 ++ * service.
5810 ++ * @bfqq: the queue being served.
5811 ++ * @served: bytes to transfer.
5812 ++ *
5813 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
5814 ++ * are synchronized every time a new bfqq is selected for service. For now,
5815 ++ * we keep it to better check consistency.
5816 ++ */
5817 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
5818 ++{
5819 ++ struct bfq_entity *entity = &bfqq->entity;
5820 ++ struct bfq_service_tree *st;
5821 ++
5822 ++ for_each_entity(entity) {
5823 ++ st = bfq_entity_service_tree(entity);
5824 ++
5825 ++ entity->service += served;
5826 ++ BUG_ON(entity->service > entity->budget);
5827 ++ BUG_ON(st->wsum == 0);
5828 ++
5829 ++ st->vtime += bfq_delta(served, st->wsum);
5830 ++ bfq_forget_idle(st);
5831 ++ }
5832 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
5833 ++}
5834 ++
5835 ++/**
5836 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
5837 ++ * @bfqq: the queue that needs a service update.
5838 ++ *
5839 ++ * When it's not possible to be fair in the service domain, because
5840 ++ * a queue is not consuming its budget fast enough (the meaning of
5841 ++ * fast depends on the timeout parameter), we charge it a full
5842 ++ * budget. In this way we should obtain a sort of time-domain
5843 ++ * fairness among all the seeky/slow queues.
5844 ++ */
5845 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5846 ++{
5847 ++ struct bfq_entity *entity = &bfqq->entity;
5848 ++
5849 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5850 ++
5851 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5852 ++}
5853 ++
5854 ++/**
5855 ++ * __bfq_activate_entity - activate an entity.
5856 ++ * @entity: the entity being activated.
5857 ++ *
5858 ++ * Called whenever an entity is activated, i.e., it is not active and one
5859 ++ * of its children receives a new request, or has to be reactivated due to
5860 ++ * budget exhaustion. It uses the current budget of the entity (and the
5861 ++ * service received if @entity is active) of the queue to calculate its
5862 ++ * timestamps.
5863 ++ */
5864 ++static void __bfq_activate_entity(struct bfq_entity *entity)
5865 ++{
5866 ++ struct bfq_sched_data *sd = entity->sched_data;
5867 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5868 ++
5869 ++ if (entity == sd->in_service_entity) {
5870 ++ BUG_ON(entity->tree != NULL);
5871 ++ /*
5872 ++ * If we are requeueing the current entity we have
5873 ++ * to take care of not charging to it service it has
5874 ++ * not received.
5875 ++ */
5876 ++ bfq_calc_finish(entity, entity->service);
5877 ++ entity->start = entity->finish;
5878 ++ sd->in_service_entity = NULL;
5879 ++ } else if (entity->tree == &st->active) {
5880 ++ /*
5881 ++ * Requeueing an entity due to a change of some
5882 ++ * next_in_service entity below it. We reuse the
5883 ++ * old start time.
5884 ++ */
5885 ++ bfq_active_extract(st, entity);
5886 ++ } else if (entity->tree == &st->idle) {
5887 ++ /*
5888 ++ * Must be on the idle tree, bfq_idle_extract() will
5889 ++ * check for that.
5890 ++ */
5891 ++ bfq_idle_extract(st, entity);
5892 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
5893 ++ st->vtime : entity->finish;
5894 ++ } else {
5895 ++ /*
5896 ++ * The finish time of the entity may be invalid, and
5897 ++ * it is in the past for sure, otherwise the queue
5898 ++ * would have been on the idle tree.
5899 ++ */
5900 ++ entity->start = st->vtime;
5901 ++ st->wsum += entity->weight;
5902 ++ bfq_get_entity(entity);
5903 ++
5904 ++ BUG_ON(entity->on_st);
5905 ++ entity->on_st = 1;
5906 ++ }
5907 ++
5908 ++ st = __bfq_entity_update_weight_prio(st, entity);
5909 ++ bfq_calc_finish(entity, entity->budget);
5910 ++ bfq_active_insert(st, entity);
5911 ++}
5912 ++
5913 ++/**
5914 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5915 ++ * @entity: the entity to activate.
5916 ++ *
5917 ++ * Activate @entity and all the entities on the path from it to the root.
5918 ++ */
5919 ++static void bfq_activate_entity(struct bfq_entity *entity)
5920 ++{
5921 ++ struct bfq_sched_data *sd;
5922 ++
5923 ++ for_each_entity(entity) {
5924 ++ __bfq_activate_entity(entity);
5925 ++
5926 ++ sd = entity->sched_data;
5927 ++ if (!bfq_update_next_in_service(sd))
5928 ++ /*
5929 ++ * No need to propagate the activation to the
5930 ++ * upper entities, as they will be updated when
5931 ++ * the in-service entity is rescheduled.
5932 ++ */
5933 ++ break;
5934 ++ }
5935 ++}
5936 ++
5937 ++/**
5938 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5939 ++ * @entity: the entity to deactivate.
5940 ++ * @requeue: if false, the entity will not be put into the idle tree.
5941 ++ *
5942 ++ * Deactivate an entity, independently of its previous state. If the
5943 ++ * entity was not on a service tree just return; otherwise, if it is on
5944 ++ * any scheduler tree, extract it from that tree and, if the caller
5945 ++ * specified @requeue and it is necessary, put it on the idle tree.
5946 ++ *
5947 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
5948 ++ * if the entity was in service or if it was the next_in_service for
5949 ++ * its sched_data; return %0 otherwise.
5950 ++ */
5951 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5952 ++{
5953 ++ struct bfq_sched_data *sd = entity->sched_data;
5954 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5955 ++ int was_in_service = entity == sd->in_service_entity;
5956 ++ int ret = 0;
5957 ++
5958 ++ if (!entity->on_st)
5959 ++ return 0;
5960 ++
5961 ++ BUG_ON(was_in_service && entity->tree != NULL);
5962 ++
5963 ++ if (was_in_service) {
5964 ++ bfq_calc_finish(entity, entity->service);
5965 ++ sd->in_service_entity = NULL;
5966 ++ } else if (entity->tree == &st->active)
5967 ++ bfq_active_extract(st, entity);
5968 ++ else if (entity->tree == &st->idle)
5969 ++ bfq_idle_extract(st, entity);
5970 ++ else if (entity->tree != NULL)
5971 ++ BUG();
5972 ++
5973 ++ if (was_in_service || sd->next_in_service == entity)
5974 ++ ret = bfq_update_next_in_service(sd);
5975 ++
5976 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5977 ++ bfq_forget_entity(st, entity);
5978 ++ else
5979 ++ bfq_idle_insert(st, entity);
5980 ++
5981 ++ BUG_ON(sd->in_service_entity == entity);
5982 ++ BUG_ON(sd->next_in_service == entity);
5983 ++
5984 ++ return ret;
5985 ++}
5986 ++
5987 ++/**
5988 ++ * bfq_deactivate_entity - deactivate an entity.
5989 ++ * @entity: the entity to deactivate.
5990 ++ * @requeue: true if the entity can be put on the idle tree
5991 ++ */
5992 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5993 ++{
5994 ++ struct bfq_sched_data *sd;
5995 ++ struct bfq_entity *parent;
5996 ++
5997 ++ for_each_entity_safe(entity, parent) {
5998 ++ sd = entity->sched_data;
5999 ++
6000 ++ if (!__bfq_deactivate_entity(entity, requeue))
6001 ++ /*
6002 ++ * The parent entity is still backlogged, and
6003 ++ * we don't need to update it as it is still
6004 ++ * in service.
6005 ++ */
6006 ++ break;
6007 ++
6008 ++ if (sd->next_in_service != NULL)
6009 ++ /*
6010 ++ * The parent entity is still backlogged and
6011 ++ * the budgets on the path towards the root
6012 ++ * need to be updated.
6013 ++ */
6014 ++ goto update;
6015 ++
6016 ++ /*
6017 ++	 * If we get here, the parent is no longer backlogged and
6018 ++ * we want to propagate the dequeue upwards.
6019 ++ */
6020 ++ requeue = 1;
6021 ++ }
6022 ++
6023 ++ return;
6024 ++
6025 ++update:
6026 ++ entity = parent;
6027 ++ for_each_entity(entity) {
6028 ++ __bfq_activate_entity(entity);
6029 ++
6030 ++ sd = entity->sched_data;
6031 ++ if (!bfq_update_next_in_service(sd))
6032 ++ break;
6033 ++ }
6034 ++}
6035 ++
6036 ++/**
6037 ++ * bfq_update_vtime - update vtime if necessary.
6038 ++ * @st: the service tree to act upon.
6039 ++ *
6040 ++ * If necessary update the service tree vtime to have at least one
6041 ++ * eligible entity, skipping to its start time. Assumes that the
6042 ++ * active tree of the device is not empty.
6043 ++ *
6044 ++ * NOTE: this hierarchical implementation updates vtimes quite often,
6045 ++ * we may end up with reactivated processes getting timestamps after a
6046 ++ * vtime skip done because we needed a ->first_active entity on some
6047 ++ * intermediate node.
6048 ++ */
6049 ++static void bfq_update_vtime(struct bfq_service_tree *st)
6050 ++{
6051 ++ struct bfq_entity *entry;
6052 ++ struct rb_node *node = st->active.rb_node;
6053 ++
6054 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
6055 ++ if (bfq_gt(entry->min_start, st->vtime)) {
6056 ++ st->vtime = entry->min_start;
6057 ++ bfq_forget_idle(st);
6058 ++ }
6059 ++}
6060 ++
6061 ++/**
6062 ++ * bfq_first_active_entity - find the eligible entity with
6063 ++ * the smallest finish time
6064 ++ * @st: the service tree to select from.
6065 ++ *
6066 ++ * This function searches for the first schedulable entity, starting from
6067 ++ * the root of the tree and going left whenever the left subtree contains
6068 ++ * at least one eligible (start <= vtime) entity. The path on
6069 ++ * the right is followed only if a) the left subtree contains no eligible
6070 ++ * entities and b) no eligible entity has been found yet.
6071 ++ */
6072 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
6073 ++{
6074 ++ struct bfq_entity *entry, *first = NULL;
6075 ++ struct rb_node *node = st->active.rb_node;
6076 ++
6077 ++ while (node != NULL) {
6078 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
6079 ++left:
6080 ++ if (!bfq_gt(entry->start, st->vtime))
6081 ++ first = entry;
6082 ++
6083 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
6084 ++
6085 ++ if (node->rb_left != NULL) {
6086 ++ entry = rb_entry(node->rb_left,
6087 ++ struct bfq_entity, rb_node);
6088 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
6089 ++ node = node->rb_left;
6090 ++ goto left;
6091 ++ }
6092 ++ }
6093 ++ if (first != NULL)
6094 ++ break;
6095 ++ node = node->rb_right;
6096 ++ }
6097 ++
6098 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
6099 ++ return first;
6100 ++}
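++
++/*
++ * Worked example of the min_start augmentation used by the lookup above
++ * (made-up timestamps). Suppose the active tree, ordered by finish time,
++ * is:
++ *
++ *                A (start 30, finish 50)
++ *               /                       \
++ *   B (start 10, finish 40)       C (start 60, finish 70)
++ *
++ * Then min_start(B) = 10, min_start(C) = 60 and
++ * min_start(A) = min(30, 10, 60) = 10. With vtime = 20 the root A is not
++ * eligible (start 30 > 20), but its left child advertises
++ * min_start 10 <= 20, so the search descends to B and returns it, while
++ * the whole right subtree is skipped because min_start(C) = 60 > 20.
++ * This is what keeps the eligible-entity lookup logarithmic.
++ */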
6101 ++
6102 ++/**
6103 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
6104 ++ * @st: the service tree.
6105 ++ *
6106 ++ * Update the virtual time in @st and return the first eligible entity
6107 ++ * it contains.
6108 ++ */
6109 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
6110 ++ bool force)
6111 ++{
6112 ++ struct bfq_entity *entity, *new_next_in_service = NULL;
6113 ++
6114 ++ if (RB_EMPTY_ROOT(&st->active))
6115 ++ return NULL;
6116 ++
6117 ++ bfq_update_vtime(st);
6118 ++ entity = bfq_first_active_entity(st);
6119 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
6120 ++
6121 ++ /*
6122 ++	 * If the chosen entity does not match the sched_data's
6123 ++	 * next_in_service and we are forcibly serving the IDLE priority
6124 ++	 * class tree, bubble up the budget update.
6125 ++ */
6126 ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
6127 ++ new_next_in_service = entity;
6128 ++ for_each_entity(new_next_in_service)
6129 ++ bfq_update_budget(new_next_in_service);
6130 ++ }
6131 ++
6132 ++ return entity;
6133 ++}
6134 ++
6135 ++/**
6136 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
6137 ++ * @sd: the sched_data.
6138 ++ * @extract: if true the returned entity will be also extracted from @sd.
6139 ++ *
6140 ++ * NOTE: since we cache the next_in_service entity at each level of the
6141 ++ * hierarchy, the complexity of the lookup can be decreased with
6142 ++ * absolutely no effort by just returning the cached next_in_service value;
6143 ++ * we prefer to do full lookups to test the consistency of the data
6144 ++ * structures.
6145 ++ */
6146 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
6147 ++ int extract,
6148 ++ struct bfq_data *bfqd)
6149 ++{
6150 ++ struct bfq_service_tree *st = sd->service_tree;
6151 ++ struct bfq_entity *entity;
6152 ++ int i = 0;
6153 ++
6154 ++ BUG_ON(sd->in_service_entity != NULL);
6155 ++
6156 ++ if (bfqd != NULL &&
6157 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
6158 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
6159 ++ true);
6160 ++ if (entity != NULL) {
6161 ++ i = BFQ_IOPRIO_CLASSES - 1;
6162 ++ bfqd->bfq_class_idle_last_service = jiffies;
6163 ++ sd->next_in_service = entity;
6164 ++ }
6165 ++ }
6166 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
6167 ++ entity = __bfq_lookup_next_entity(st + i, false);
6168 ++ if (entity != NULL) {
6169 ++ if (extract) {
6170 ++ bfq_check_next_in_service(sd, entity);
6171 ++ bfq_active_extract(st + i, entity);
6172 ++ sd->in_service_entity = entity;
6173 ++ sd->next_in_service = NULL;
6174 ++ }
6175 ++ break;
6176 ++ }
6177 ++ }
6178 ++
6179 ++ return entity;
6180 ++}
6181 ++
6182 ++/*
6183 ++ * Get next queue for service.
6184 ++ */
6185 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
6186 ++{
6187 ++ struct bfq_entity *entity = NULL;
6188 ++ struct bfq_sched_data *sd;
6189 ++ struct bfq_queue *bfqq;
6190 ++
6191 ++ BUG_ON(bfqd->in_service_queue != NULL);
6192 ++
6193 ++ if (bfqd->busy_queues == 0)
6194 ++ return NULL;
6195 ++
6196 ++ sd = &bfqd->root_group->sched_data;
6197 ++ for (; sd != NULL; sd = entity->my_sched_data) {
6198 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
6199 ++ BUG_ON(entity == NULL);
6200 ++ entity->service = 0;
6201 ++ }
6202 ++
6203 ++ bfqq = bfq_entity_to_bfqq(entity);
6204 ++ BUG_ON(bfqq == NULL);
6205 ++
6206 ++ return bfqq;
6207 ++}
6208 ++
6209 ++/*
6210 ++ * Forced extraction of the given queue.
6211 ++ */
6212 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
6213 ++ struct bfq_queue *bfqq)
6214 ++{
6215 ++ struct bfq_entity *entity;
6216 ++ struct bfq_sched_data *sd;
6217 ++
6218 ++ BUG_ON(bfqd->in_service_queue != NULL);
6219 ++
6220 ++ entity = &bfqq->entity;
6221 ++ /*
6222 ++ * Bubble up extraction/update from the leaf to the root.
6223 ++ */
6224 ++ for_each_entity(entity) {
6225 ++ sd = entity->sched_data;
6226 ++ bfq_update_budget(entity);
6227 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
6228 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
6229 ++ sd->in_service_entity = entity;
6230 ++ sd->next_in_service = NULL;
6231 ++ entity->service = 0;
6232 ++ }
6233 ++
6234 ++ return;
6235 ++}
6236 ++
6237 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
6238 ++{
6239 ++ if (bfqd->in_service_bic != NULL) {
6240 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
6241 ++ bfqd->in_service_bic = NULL;
6242 ++ }
6243 ++
6244 ++ bfqd->in_service_queue = NULL;
6245 ++ del_timer(&bfqd->idle_slice_timer);
6246 ++}
6247 ++
6248 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6249 ++ int requeue)
6250 ++{
6251 ++ struct bfq_entity *entity = &bfqq->entity;
6252 ++
6253 ++ if (bfqq == bfqd->in_service_queue)
6254 ++ __bfq_bfqd_reset_in_service(bfqd);
6255 ++
6256 ++ bfq_deactivate_entity(entity, requeue);
6257 ++}
6258 ++
6259 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6260 ++{
6261 ++ struct bfq_entity *entity = &bfqq->entity;
6262 ++
6263 ++ bfq_activate_entity(entity);
6264 ++}
6265 ++
6266 ++/*
6267 ++ * Called when the bfqq no longer has requests pending; remove it from
6268 ++ * the service tree.
6269 ++ */
6270 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6271 ++ int requeue)
6272 ++{
6273 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
6274 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
6275 ++
6276 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
6277 ++
6278 ++ bfq_clear_bfqq_busy(bfqq);
6279 ++
6280 ++ BUG_ON(bfqd->busy_queues == 0);
6281 ++ bfqd->busy_queues--;
6282 ++
6283 ++ if (!bfqq->dispatched) {
6284 ++ bfq_weights_tree_remove(bfqd, &bfqq->entity,
6285 ++ &bfqd->queue_weights_tree);
6286 ++ if (!blk_queue_nonrot(bfqd->queue)) {
6287 ++ BUG_ON(!bfqd->busy_in_flight_queues);
6288 ++ bfqd->busy_in_flight_queues--;
6289 ++ if (bfq_bfqq_constantly_seeky(bfqq)) {
6290 ++ BUG_ON(!bfqd->
6291 ++ const_seeky_busy_in_flight_queues);
6292 ++ bfqd->const_seeky_busy_in_flight_queues--;
6293 ++ }
6294 ++ }
6295 ++ }
6296 ++ if (bfqq->wr_coeff > 1)
6297 ++ bfqd->wr_busy_queues--;
6298 ++
6299 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
6300 ++}
6301 ++
6302 ++/*
6303 ++ * Called when an inactive queue receives a new request.
6304 ++ */
6305 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6306 ++{
6307 ++ BUG_ON(bfq_bfqq_busy(bfqq));
6308 ++ BUG_ON(bfqq == bfqd->in_service_queue);
6309 ++
6310 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
6311 ++
6312 ++ bfq_activate_bfqq(bfqd, bfqq);
6313 ++
6314 ++ bfq_mark_bfqq_busy(bfqq);
6315 ++ bfqd->busy_queues++;
6316 ++
6317 ++ if (!bfqq->dispatched) {
6318 ++ if (bfqq->wr_coeff == 1)
6319 ++ bfq_weights_tree_add(bfqd, &bfqq->entity,
6320 ++ &bfqd->queue_weights_tree);
6321 ++ if (!blk_queue_nonrot(bfqd->queue)) {
6322 ++ bfqd->busy_in_flight_queues++;
6323 ++ if (bfq_bfqq_constantly_seeky(bfqq))
6324 ++ bfqd->const_seeky_busy_in_flight_queues++;
6325 ++ }
6326 ++ }
6327 ++ if (bfqq->wr_coeff > 1)
6328 ++ bfqd->wr_busy_queues++;
6329 ++}
6330 +diff --git a/block/bfq.h b/block/bfq.h
6331 +new file mode 100644
6332 +index 0000000..e350b5f
6333 +--- /dev/null
6334 ++++ b/block/bfq.h
6335 +@@ -0,0 +1,771 @@
6336 ++/*
6337 ++ * BFQ-v7r8 for 4.3.0: data structures and common function prototypes.
6338 ++ *
6339 ++ * Based on ideas and code from CFQ:
6340 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6341 ++ *
6342 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6343 ++ * Paolo Valente <paolo.valente@×××××××.it>
6344 ++ *
6345 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6346 ++ */
6347 ++
6348 ++#ifndef _BFQ_H
6349 ++#define _BFQ_H
6350 ++
6351 ++#include <linux/blktrace_api.h>
6352 ++#include <linux/hrtimer.h>
6353 ++#include <linux/ioprio.h>
6354 ++#include <linux/rbtree.h>
6355 ++
6356 ++#define BFQ_IOPRIO_CLASSES 3
6357 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
6358 ++
6359 ++#define BFQ_MIN_WEIGHT 1
6360 ++#define BFQ_MAX_WEIGHT 1000
6361 ++
6362 ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4
6363 ++
6364 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
6365 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
6366 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
6367 ++
6368 ++struct bfq_entity;
6369 ++
6370 ++/**
6371 ++ * struct bfq_service_tree - per ioprio_class service tree.
6372 ++ * @active: tree for active entities (i.e., those backlogged).
6373 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
6374 ++ * @first_idle: idle entity with minimum F_i.
6375 ++ * @last_idle: idle entity with maximum F_i.
6376 ++ * @vtime: scheduler virtual time.
6377 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
6378 ++ *
6379 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
6380 ++ * ioprio_class has its own independent scheduler, and so its own
6381 ++ * bfq_service_tree. All the fields are protected by the queue lock
6382 ++ * of the containing bfqd.
6383 ++ */
6384 ++struct bfq_service_tree {
6385 ++ struct rb_root active;
6386 ++ struct rb_root idle;
6387 ++
6388 ++ struct bfq_entity *first_idle;
6389 ++ struct bfq_entity *last_idle;
6390 ++
6391 ++ u64 vtime;
6392 ++ unsigned long wsum;
6393 ++};
6394 ++
6395 ++/**
6396 ++ * struct bfq_sched_data - multi-class scheduler.
6397 ++ * @in_service_entity: entity in service.
6398 ++ * @next_in_service: head-of-the-line entity in the scheduler.
6399 ++ * @service_tree: array of service trees, one per ioprio_class.
6400 ++ *
6401 ++ * bfq_sched_data is the basic scheduler queue. It supports three
6402 ++ * ioprio_classes, and can be used either as a toplevel queue or as
6403 ++ * an intermediate queue on a hierarchical setup.
6404 ++ * @next_in_service points to the active entity of the sched_data
6405 ++ * service trees that will be scheduled next.
6406 ++ *
6407 ++ * The supported ioprio_classes are the same as in CFQ, in descending
6408 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
6409 ++ * Requests from higher priority queues are served before all the
6410 ++ * requests from lower priority queues; among requests of the same
6411 ++ * queue requests are served according to B-WF2Q+.
6412 ++ * All the fields are protected by the queue lock of the containing bfqd.
6413 ++ */
6414 ++struct bfq_sched_data {
6415 ++ struct bfq_entity *in_service_entity;
6416 ++ struct bfq_entity *next_in_service;
6417 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
6418 ++};
6419 ++
6420 ++/**
6421 ++ * struct bfq_weight_counter - counter of the number of all active entities
6422 ++ * with a given weight.
6423 ++ * @weight: weight of the entities that this counter refers to.
6424 ++ * @num_active: number of active entities with this weight.
6425 ++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree
6426 ++ * and @group_weights_tree).
6427 ++ */
6428 ++struct bfq_weight_counter {
6429 ++ short int weight;
6430 ++ unsigned int num_active;
6431 ++ struct rb_node weights_node;
6432 ++};
6433 ++
6434 ++/**
6435 ++ * struct bfq_entity - schedulable entity.
6436 ++ * @rb_node: service_tree member.
6437 ++ * @weight_counter: pointer to the weight counter associated with this entity.
6438 ++ * @on_st: flag, true if the entity is on a tree (either the active or
6439 ++ * the idle one of its service_tree).
6440 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
6441 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
6442 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
6443 ++ * @min_start: minimum start time of the (active) subtree rooted at
6444 ++ * this entity; used for O(log N) lookups into active trees.
6445 ++ * @service: service received during the last round of service.
6446 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
6447 ++ * @weight: weight of the queue
6448 ++ * @parent: parent entity, for hierarchical scheduling.
6449 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
6450 ++ * associated scheduler queue, %NULL on leaf nodes.
6451 ++ * @sched_data: the scheduler queue this entity belongs to.
6452 ++ * @ioprio: the ioprio in use.
6453 ++ * @new_weight: when a weight change is requested, the new weight value.
6454 ++ * @orig_weight: original weight, used to implement weight boosting
6455 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
6456 ++ * @ioprio_class: the ioprio_class in use.
6457 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
6458 ++ * ioprio_class value.
6459 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
6460 ++ * ioprio_class change.
6461 ++ *
6462 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
6463 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
6464 ++ * entity belongs to the sched_data of the parent group in the cgroup
6465 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
6466 ++ * in @my_sched_data.
6467 ++ *
6468 ++ * Each entity stores independently its priority values; this would
6469 ++ * allow different weights on different devices, but this
6470 ++ * functionality is not exported to userspace as of now. Priorities and
6471 ++ * weights are updated lazily, first storing the new values into the
6472 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
6473 ++ * there is a transition in the entity state that allows the priority
6474 ++ * update to take place the effective and the requested priority
6475 ++ * values are synchronized.
6476 ++ *
6477 ++ * Unless cgroups are used, the weight value is calculated from the
6478 ++ * ioprio to export the same interface as CFQ. When dealing with
6479 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
6480 ++ * time to consume their budget and have true sequential behavior, and
6481 ++ * when there are no external factors breaking anticipation) the
6482 ++ * relative weights at each level of the cgroups hierarchy should be
6483 ++ * guaranteed. All the fields are protected by the queue lock of the
6484 ++ * containing bfqd.
6485 ++ */
6486 ++struct bfq_entity {
6487 ++ struct rb_node rb_node;
6488 ++ struct bfq_weight_counter *weight_counter;
6489 ++
6490 ++ int on_st;
6491 ++
6492 ++ u64 finish;
6493 ++ u64 start;
6494 ++
6495 ++ struct rb_root *tree;
6496 ++
6497 ++ u64 min_start;
6498 ++
6499 ++ unsigned long service, budget;
6500 ++ unsigned short weight, new_weight;
6501 ++ unsigned short orig_weight;
6502 ++
6503 ++ struct bfq_entity *parent;
6504 ++
6505 ++ struct bfq_sched_data *my_sched_data;
6506 ++ struct bfq_sched_data *sched_data;
6507 ++
6508 ++ unsigned short ioprio, new_ioprio;
6509 ++ unsigned short ioprio_class, new_ioprio_class;
6510 ++
6511 ++ int ioprio_changed;
6512 ++};
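As the comment block above describes, weight and ioprio changes are applied lazily. A minimal editorial sketch of the writer side of that protocol follows (illustration only, not part of the patch; the code that later folds the new_* fields into the effective ones lives elsewhere in the scheduler):

	/* request a weight change: only record the new value for now */
	entity->new_weight = 300;	/* 300 is an arbitrary example value */
	entity->ioprio_changed = 1;
	/*
	 * entity->weight itself is synchronized later, on a state
	 * transition of the entity (e.g., its next activation).
	 */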
6513 ++
6514 ++struct bfq_group;
6515 ++
6516 ++/**
6517 ++ * struct bfq_queue - leaf schedulable entity.
6518 ++ * @ref: reference counter.
6519 ++ * @bfqd: parent bfq_data.
6520 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
6521 ++ * one or more other queues.
6522 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
6523 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
6524 ++ * @sort_list: sorted list of pending requests.
6525 ++ * @next_rq: if fifo isn't expired, next request to serve.
6526 ++ * @queued: nr of requests queued in @sort_list.
6527 ++ * @allocated: currently allocated requests.
6528 ++ * @meta_pending: pending metadata requests.
6529 ++ * @fifo: fifo list of requests in sort_list.
6530 ++ * @entity: entity representing this queue in the scheduler.
6531 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
6532 ++ * @budget_timeout: budget expiration (in jiffies).
6533 ++ * @dispatched: number of requests on the dispatch list or inside driver.
6534 ++ * @flags: status flags.
6535 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
6536 ++ * @burst_list_node: node for the device's burst list.
6537 ++ * @seek_samples: number of seeks sampled
6538 ++ * @seek_total: sum of the distances of the seeks sampled
6539 ++ * @seek_mean: mean seek distance
6540 ++ * @last_request_pos: position of the last request enqueued
6541 ++ * @requests_within_timer: number of consecutive pairs of request completion
6542 ++ * and arrival, such that the queue becomes idle
6543 ++ * after the completion, but the next request arrives
6544 ++ * within an idle time slice; used only if the queue's
6545 ++ * IO_bound has been cleared.
6546 ++ * @pid: pid of the process owning the queue, used for logging purposes.
6547 ++ * @last_wr_start_finish: start time of the current weight-raising period if
6548 ++ * the @bfq_queue is being weight-raised, otherwise
6549 ++ * finish time of the last weight-raising period
6550 ++ * @wr_cur_max_time: current max raising time for this queue
6551 ++ * @soft_rt_next_start: minimum time instant such that, only if a new
6552 ++ * request is enqueued after this time instant in an
6553 ++ * idle @bfq_queue with no outstanding requests, then
6554 ++ * the task associated with the queue is deemed
6555 ++ * soft real-time (see the comments to the function
6556 ++ * bfq_bfqq_softrt_next_start()).
6557 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
6558 ++ * idle to backlogged
6559 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
6560 ++ * since the last transition from idle to
6561 ++ * backlogged
6562 ++ *
6563 ++ * A bfq_queue is a leaf request queue; it can be associated with one or more
6564 ++ * io_contexts, if it is async or shared between cooperating processes. @cgroup
6565 ++ * holds a reference to the cgroup, to be sure that it does not disappear while
6566 ++ * a bfqq still references it (mostly to avoid races between request issuing and
6567 ++ * task migration followed by cgroup destruction).
6568 ++ * All the fields are protected by the queue lock of the containing bfqd.
6569 ++ */
6570 ++struct bfq_queue {
6571 ++ atomic_t ref;
6572 ++ struct bfq_data *bfqd;
6573 ++
6574 ++ /* fields for cooperating queues handling */
6575 ++ struct bfq_queue *new_bfqq;
6576 ++ struct rb_node pos_node;
6577 ++ struct rb_root *pos_root;
6578 ++
6579 ++ struct rb_root sort_list;
6580 ++ struct request *next_rq;
6581 ++ int queued[2];
6582 ++ int allocated[2];
6583 ++ int meta_pending;
6584 ++ struct list_head fifo;
6585 ++
6586 ++ struct bfq_entity entity;
6587 ++
6588 ++ unsigned long max_budget;
6589 ++ unsigned long budget_timeout;
6590 ++
6591 ++ int dispatched;
6592 ++
6593 ++ unsigned int flags;
6594 ++
6595 ++ struct list_head bfqq_list;
6596 ++
6597 ++ struct hlist_node burst_list_node;
6598 ++
6599 ++ unsigned int seek_samples;
6600 ++ u64 seek_total;
6601 ++ sector_t seek_mean;
6602 ++ sector_t last_request_pos;
6603 ++
6604 ++ unsigned int requests_within_timer;
6605 ++
6606 ++ pid_t pid;
6607 ++
6608 ++ /* weight-raising fields */
6609 ++ unsigned long wr_cur_max_time;
6610 ++ unsigned long soft_rt_next_start;
6611 ++ unsigned long last_wr_start_finish;
6612 ++ unsigned int wr_coeff;
6613 ++ unsigned long last_idle_bklogged;
6614 ++ unsigned long service_from_backlogged;
6615 ++};
6616 ++
6617 ++/**
6618 ++ * struct bfq_ttime - per process thinktime stats.
6619 ++ * @ttime_total: total process thinktime
6620 ++ * @ttime_samples: number of thinktime samples
6621 ++ * @ttime_mean: average process thinktime
6622 ++ */
6623 ++struct bfq_ttime {
6624 ++ unsigned long last_end_request;
6625 ++
6626 ++ unsigned long ttime_total;
6627 ++ unsigned long ttime_samples;
6628 ++ unsigned long ttime_mean;
6629 ++};
6630 ++
6631 ++/**
6632 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
6633 ++ * @icq: associated io_cq structure
6634 ++ * @bfqq: array of two process queues, the sync and the async
6635 ++ * @ttime: associated @bfq_ttime struct
6636 ++ */
6637 ++struct bfq_io_cq {
6638 ++ struct io_cq icq; /* must be the first member */
6639 ++ struct bfq_queue *bfqq[2];
6640 ++ struct bfq_ttime ttime;
6641 ++ int ioprio;
6642 ++};
6643 ++
6644 ++enum bfq_device_speed {
6645 ++ BFQ_BFQD_FAST,
6646 ++ BFQ_BFQD_SLOW,
6647 ++};
6648 ++
6649 ++/**
6650 ++ * struct bfq_data - per device data structure.
6651 ++ * @queue: request queue for the managed device.
6652 ++ * @root_group: root bfq_group for the device.
6653 ++ * @rq_pos_tree: rbtree sorted by next_request position, used when
6654 ++ * determining if two or more queues have interleaving
6655 ++ * requests (see bfq_close_cooperator()).
6656 ++ * @active_numerous_groups: number of bfq_groups containing more than one
6657 ++ * active @bfq_entity.
6658 ++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
6659 ++ * weight. Used to keep track of whether all @bfq_queues
6660 ++ * have the same weight. The tree contains one counter
6661 ++ * for each distinct weight associated to some active
6662 ++ * and not weight-raised @bfq_queue (see the comments to
6663 ++ * the functions bfq_weights_tree_[add|remove] for
6664 ++ * further details).
6665 ++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted
6666 ++ * by weight. Used to keep track of whether all
6667 ++ * @bfq_groups have the same weight. The tree contains
6668 ++ * one counter for each distinct weight associated to
6669 ++ * some active @bfq_group (see the comments to the
6670 ++ * functions bfq_weights_tree_[add|remove] for further
6671 ++ * details).
6672 ++ * @busy_queues: number of bfq_queues containing requests (including the
6673 ++ * queue in service, even if it is idling).
6674 ++ * @busy_in_flight_queues: number of @bfq_queues containing pending or
6675 ++ * in-flight requests, plus the @bfq_queue in
6676 ++ * service, even if idle but waiting for the
6677 ++ * possible arrival of its next sync request. This
6678 ++ * field is updated only if the device is rotational,
6679 ++ * but used only if the device is also NCQ-capable.
6680 ++ * The reason why the field is updated also for non-
6681 ++ * NCQ-capable rotational devices is related to the
6682 ++ * fact that the value of @hw_tag may be set also
6683 ++ * later than when busy_in_flight_queues may need to
6684 ++ * be incremented for the first time(s). Taking also
6685 ++ * this possibility into account, to avoid unbalanced
6686 ++ * increments/decrements, would imply more overhead
6687 ++ * than just updating busy_in_flight_queues
6688 ++ * regardless of the value of @hw_tag.
6689 ++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
6690 ++ * (that is, seeky queues that expired
6691 ++ * for budget timeout at least once)
6692 ++ * containing pending or in-flight
6693 ++ * requests, including the in-service
6694 ++ * @bfq_queue if constantly seeky. This
6695 ++ * field is updated only if the device
6696 ++ * is rotational, but used only if the
6697 ++ * device is also NCQ-capable (see the
6698 ++ * comments to @busy_in_flight_queues).
6699 ++ * @wr_busy_queues: number of weight-raised busy @bfq_queues.
6700 ++ * @queued: number of queued requests.
6701 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
6702 ++ * @sync_flight: number of sync requests in the driver.
6703 ++ * @max_rq_in_driver: max number of reqs in driver in the last
6704 ++ * @hw_tag_samples completed requests.
6705 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
6706 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
6707 ++ * @budgets_assigned: number of budgets assigned.
6708 ++ * @idle_slice_timer: timer set when idling for the next sequential request
6709 ++ * from the queue in service.
6710 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
6711 ++ * @in_service_queue: bfq_queue in service.
6712 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
6713 ++ * @last_position: on-disk position of the last served request.
6714 ++ * @last_budget_start: beginning of the last budget.
6715 ++ * @last_idling_start: beginning of the last idle slice.
6716 ++ * @peak_rate: peak transfer rate observed for a budget.
6717 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
6718 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before
6719 ++ * rescheduling.
6720 ++ * @group_list: list of all the bfq_groups active on the device.
6721 ++ * @active_list: list of all the bfq_queues active on the device.
6722 ++ * @idle_list: list of all the bfq_queues idle on the device.
6723 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
6724 ++ * requests are served in fifo order.
6725 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
6726 ++ * @bfq_back_max: maximum allowed backward seek.
6727 ++ * @bfq_slice_idle: maximum idling time.
6728 ++ * @bfq_user_max_budget: user-configured max budget value
6729 ++ * (0 for auto-tuning).
6730 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
6731 ++ * async queues.
6732 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
6733 ++ * prevent seeky queues from imposing long latencies on
6734 ++ * well-behaved ones (this also implies that seeky queues
6735 ++ * cannot receive guarantees in the service domain; after a
6736 ++ * timeout they are charged for the whole allocated budget, to
6737 ++ * try to preserve a reasonably fair behavior among them, but
6738 ++ * without service-domain guarantees).
6739 ++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
6740 ++ * no longer granted any weight-raising.
6741 ++ * @bfq_failed_cooperations: number of consecutive failed cooperation
6742 ++ * chances after which weight-raising is restored
6743 ++ * to a queue subject to more than bfq_coop_thresh
6744 ++ * queue merges.
6745 ++ * @bfq_requests_within_timer: number of consecutive requests that must be
6746 ++ * issued within the idle time slice to set
6747 ++ * again idling to a queue which was marked as
6748 ++ * non-I/O-bound (see the definition of the
6749 ++ * IO_bound flag for further details).
6750 ++ * @last_ins_in_burst: last time at which a queue entered the current
6751 ++ * burst of queues being activated shortly after
6752 ++ * each other; for more details about this and the
6753 ++ * following parameters related to a burst of
6754 ++ * activations, see the comments to the function
6755 ++ * @bfq_handle_burst.
6756 ++ * @bfq_burst_interval: reference time interval used to decide whether a
6757 ++ * queue has been activated shortly after
6758 ++ * @last_ins_in_burst.
6759 ++ * @burst_size: number of queues in the current burst of queue activations.
6760 ++ * @bfq_large_burst_thresh: maximum burst size above which the current
6761 ++ * queue-activation burst is deemed as 'large'.
6762 ++ * @large_burst: true if a large queue-activation burst is in progress.
6763 ++ * @burst_list: head of the burst list (as for the above fields, more details
6764 ++ * in the comments to the function bfq_handle_burst).
6765 ++ * @low_latency: if set to true, low-latency heuristics are enabled.
6766 ++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
6767 ++ * queue is multiplied.
6768 ++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).
6769 ++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.
6770 ++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
6771 ++ * may be reactivated for a queue (in jiffies).
6772 ++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
6773 ++ * after which weight-raising may be
6774 ++ * reactivated for an already busy queue
6775 ++ * (in jiffies).
6776 ++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,
6777 ++ * in sectors per second.
6778 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
6779 ++ * duration of the weight raising automatically.
6780 ++ * @device_speed: device-speed class for the low-latency heuristic.
6781 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.
6782 ++ *
6783 ++ * All the fields are protected by the @queue lock.
6784 ++ */
6785 ++struct bfq_data {
6786 ++ struct request_queue *queue;
6787 ++
6788 ++ struct bfq_group *root_group;
6789 ++ struct rb_root rq_pos_tree;
6790 ++
6791 ++#ifdef CONFIG_CGROUP_BFQIO
6792 ++ int active_numerous_groups;
6793 ++#endif
6794 ++
6795 ++ struct rb_root queue_weights_tree;
6796 ++ struct rb_root group_weights_tree;
6797 ++
6798 ++ int busy_queues;
6799 ++ int busy_in_flight_queues;
6800 ++ int const_seeky_busy_in_flight_queues;
6801 ++ int wr_busy_queues;
6802 ++ int queued;
6803 ++ int rq_in_driver;
6804 ++ int sync_flight;
6805 ++
6806 ++ int max_rq_in_driver;
6807 ++ int hw_tag_samples;
6808 ++ int hw_tag;
6809 ++
6810 ++ int budgets_assigned;
6811 ++
6812 ++ struct timer_list idle_slice_timer;
6813 ++ struct work_struct unplug_work;
6814 ++
6815 ++ struct bfq_queue *in_service_queue;
6816 ++ struct bfq_io_cq *in_service_bic;
6817 ++
6818 ++ sector_t last_position;
6819 ++
6820 ++ ktime_t last_budget_start;
6821 ++ ktime_t last_idling_start;
6822 ++ int peak_rate_samples;
6823 ++ u64 peak_rate;
6824 ++ unsigned long bfq_max_budget;
6825 ++
6826 ++ struct hlist_head group_list;
6827 ++ struct list_head active_list;
6828 ++ struct list_head idle_list;
6829 ++
6830 ++ unsigned int bfq_fifo_expire[2];
6831 ++ unsigned int bfq_back_penalty;
6832 ++ unsigned int bfq_back_max;
6833 ++ unsigned int bfq_slice_idle;
6834 ++ u64 bfq_class_idle_last_service;
6835 ++
6836 ++ unsigned int bfq_user_max_budget;
6837 ++ unsigned int bfq_max_budget_async_rq;
6838 ++ unsigned int bfq_timeout[2];
6839 ++
6840 ++ unsigned int bfq_coop_thresh;
6841 ++ unsigned int bfq_failed_cooperations;
6842 ++ unsigned int bfq_requests_within_timer;
6843 ++
6844 ++ unsigned long last_ins_in_burst;
6845 ++ unsigned long bfq_burst_interval;
6846 ++ int burst_size;
6847 ++ unsigned long bfq_large_burst_thresh;
6848 ++ bool large_burst;
6849 ++ struct hlist_head burst_list;
6850 ++
6851 ++ bool low_latency;
6852 ++
6853 ++ /* parameters of the low_latency heuristics */
6854 ++ unsigned int bfq_wr_coeff;
6855 ++ unsigned int bfq_wr_max_time;
6856 ++ unsigned int bfq_wr_rt_max_time;
6857 ++ unsigned int bfq_wr_min_idle_time;
6858 ++ unsigned long bfq_wr_min_inter_arr_async;
6859 ++ unsigned int bfq_wr_max_softrt_rate;
6860 ++ u64 RT_prod;
6861 ++ enum bfq_device_speed device_speed;
6862 ++
6863 ++ struct bfq_queue oom_bfqq;
6864 ++};
6865 ++
6866 ++enum bfqq_state_flags {
6867 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */
6868 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
6869 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
6870 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
6871 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
6872 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
6873 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
6874 ++ BFQ_BFQQ_FLAG_IO_bound, /*
6875 ++ * bfqq has timed-out at least once
6876 ++ * having consumed at most 2/10 of
6877 ++ * its budget
6878 ++ */
6879 ++ BFQ_BFQQ_FLAG_in_large_burst, /*
6880 ++ * bfqq activated in a large burst,
6881 ++ * see comments to bfq_handle_burst.
6882 ++ */
6883 ++ BFQ_BFQQ_FLAG_constantly_seeky, /*
6884 ++ * bfqq has proved to be slow and
6885 ++ * seeky until budget timeout
6886 ++ */
6887 ++ BFQ_BFQQ_FLAG_softrt_update, /*
6888 ++ * may need softrt-next-start
6889 ++ * update
6890 ++ */
6891 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6892 ++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
6893 ++};
6894 ++
6895 ++#define BFQ_BFQQ_FNS(name) \
6896 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
6897 ++{ \
6898 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
6899 ++} \
6900 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
6901 ++{ \
6902 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
6903 ++} \
6904 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
6905 ++{ \
6906 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
6907 ++}
6908 ++
6909 ++BFQ_BFQQ_FNS(busy);
6910 ++BFQ_BFQQ_FNS(wait_request);
6911 ++BFQ_BFQQ_FNS(must_alloc);
6912 ++BFQ_BFQQ_FNS(fifo_expire);
6913 ++BFQ_BFQQ_FNS(idle_window);
6914 ++BFQ_BFQQ_FNS(sync);
6915 ++BFQ_BFQQ_FNS(budget_new);
6916 ++BFQ_BFQQ_FNS(IO_bound);
6917 ++BFQ_BFQQ_FNS(in_large_burst);
6918 ++BFQ_BFQQ_FNS(constantly_seeky);
6919 ++BFQ_BFQQ_FNS(coop);
6920 ++BFQ_BFQQ_FNS(split_coop);
6921 ++BFQ_BFQQ_FNS(softrt_update);
6922 ++#undef BFQ_BFQQ_FNS
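For one concrete flag, the macro above stamps out exactly the following three helpers (shown expanded here purely for illustration; nothing below is added to the file):

	static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
	{
		(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
	}
	static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
	{
		(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
	}
	static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
	{
		return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
	}

These are the accessors already used in bfq-sched.c above, e.g. bfq_mark_bfqq_busy() in bfq_add_bfqq_busy() and bfq_clear_bfqq_busy() in bfq_del_bfqq_busy().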
6923 ++
6924 ++/* Logging facilities. */
6925 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
6926 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
6927 ++
6928 ++#define bfq_log(bfqd, fmt, args...) \
6929 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
6930 ++
6931 ++/* Expiration reasons. */
6932 ++enum bfqq_expiration {
6933 ++ BFQ_BFQQ_TOO_IDLE = 0, /*
6934 ++ * queue has been idling for
6935 ++ * too long
6936 ++ */
6937 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
6938 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
6939 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
6940 ++};
6941 ++
6942 ++#ifdef CONFIG_CGROUP_BFQIO
6943 ++/**
6944 ++ * struct bfq_group - per (device, cgroup) data structure.
6945 ++ * @entity: schedulable entity to insert into the parent group sched_data.
6946 ++ * @sched_data: own sched_data, to contain child entities (they may be
6947 ++ * both bfq_queues and bfq_groups).
6948 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
6949 ++ * list of the containing cgroup's bfqio_cgroup.
6950 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
6951 ++ * of the groups active on the same device; used for cleanup.
6952 ++ * @bfqd: the bfq_data for the device this group acts upon.
6953 ++ * @async_bfqq: array of async queues for all the tasks belonging to
6954 ++ * the group, one queue per ioprio value per ioprio_class,
6955 ++ * except for the idle class that has only one queue.
6956 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
6957 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
6958 ++ * to avoid too many special cases during group creation/
6959 ++ * migration.
6960 ++ * @active_entities: number of active entities belonging to the group;
6961 ++ * unused for the root group. Used to know whether there
6962 ++ * are groups with more than one active @bfq_entity
6963 ++ * (see the comments to the function
6964 ++ * bfq_bfqq_must_not_expire()).
6965 ++ *
6966 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
6967 ++ * there is a set of bfq_groups, each one collecting the lower-level
6968 ++ * entities belonging to the group that are acting on the same device.
6969 ++ *
6970 ++ * Locking works as follows:
6971 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
6972 ++ * via RCU from its readers.
6973 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
6974 ++ * from the readers.
6975 ++ * o All the other fields are protected by the @bfqd queue lock.
6976 ++ */
6977 ++struct bfq_group {
6978 ++ struct bfq_entity entity;
6979 ++ struct bfq_sched_data sched_data;
6980 ++
6981 ++ struct hlist_node group_node;
6982 ++ struct hlist_node bfqd_node;
6983 ++
6984 ++ void *bfqd;
6985 ++
6986 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6987 ++ struct bfq_queue *async_idle_bfqq;
6988 ++
6989 ++ struct bfq_entity *my_entity;
6990 ++
6991 ++ int active_entities;
6992 ++};
6993 ++
6994 ++/**
6995 ++ * struct bfqio_cgroup - bfq cgroup data structure.
6996 ++ * @css: subsystem state for bfq in the containing cgroup.
6997 ++ * @online: flag marked when the subsystem is inserted.
6998 ++ * @weight: cgroup weight.
6999 ++ * @ioprio: cgroup ioprio.
7000 ++ * @ioprio_class: cgroup ioprio_class.
7001 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
7002 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
7003 ++ *
7004 ++ * @group_data is accessed using RCU, with @lock protecting the updates;
7005 ++ * @ioprio and @ioprio_class are protected by @lock.
7006 ++ */
7007 ++struct bfqio_cgroup {
7008 ++ struct cgroup_subsys_state css;
7009 ++ bool online;
7010 ++
7011 ++ unsigned short weight, ioprio, ioprio_class;
7012 ++
7013 ++ spinlock_t lock;
7014 ++ struct hlist_head group_data;
7015 ++};
7016 ++#else
7017 ++struct bfq_group {
7018 ++ struct bfq_sched_data sched_data;
7019 ++
7020 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
7021 ++ struct bfq_queue *async_idle_bfqq;
7022 ++};
7023 ++#endif
7024 ++
7025 ++static inline struct bfq_service_tree *
7026 ++bfq_entity_service_tree(struct bfq_entity *entity)
7027 ++{
7028 ++ struct bfq_sched_data *sched_data = entity->sched_data;
7029 ++ unsigned int idx = entity->ioprio_class - 1;
7030 ++
7031 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
7032 ++ BUG_ON(sched_data == NULL);
7033 ++
7034 ++ return sched_data->service_tree + idx;
7035 ++}
7036 ++
7037 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
7038 ++ bool is_sync)
7039 ++{
7040 ++ return bic->bfqq[is_sync];
7041 ++}
7042 ++
7043 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
7044 ++ struct bfq_queue *bfqq, bool is_sync)
7045 ++{
7046 ++ bic->bfqq[is_sync] = bfqq;
7047 ++}
7048 ++
7049 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
7050 ++{
7051 ++ return bic->icq.q->elevator->elevator_data;
7052 ++}
7053 ++
7054 ++/**
7055 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
7056 ++ * @ptr: a pointer to a bfqd.
7057 ++ * @flags: storage for the flags to be saved.
7058 ++ *
7059 ++ * This function allows bfqg->bfqd to be protected by the
7060 ++ * queue lock of the bfqd it references; the pointer is dereferenced
7061 ++ * under RCU, so the storage for bfqd is assured to be safe as long
7062 ++ * as the RCU read side critical section does not end. After the
7063 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
7064 ++ * sure that no other writer accessed it. If we raced with a writer,
7065 ++ * the function returns NULL, with the queue unlocked, otherwise it
7066 ++ * returns the dereferenced pointer, with the queue locked.
7067 ++ */
7068 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
7069 ++ unsigned long *flags)
7070 ++{
7071 ++ struct bfq_data *bfqd;
7072 ++
7073 ++ rcu_read_lock();
7074 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
7075 ++
7076 ++ if (bfqd != NULL) {
7077 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
7078 ++ if (*ptr == bfqd)
7079 ++ goto out;
7080 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
7081 ++ }
7082 ++
7083 ++ bfqd = NULL;
7084 ++out:
7085 ++ rcu_read_unlock();
7086 ++ return bfqd;
7087 ++}
7088 ++
7089 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
7090 ++ unsigned long *flags)
7091 ++{
7092 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
7093 ++}
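A usage sketch for the pair above, assuming a caller that starts from an RCU-protected bfq_group pointer bfqg (editorial illustration, not part of the patch):

	unsigned long flags;
	struct bfq_data *bfqd;

	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
	if (bfqd != NULL) {
		/* queue lock held here and bfqd guaranteed valid */
		/* ... operate on bfqd ... */
		bfq_put_bfqd_unlock(bfqd, &flags);
	}
	/* bfqd == NULL: we raced with a writer and nothing is locked */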
7094 ++
7095 ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic);
7096 ++static void bfq_put_queue(struct bfq_queue *bfqq);
7097 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
7098 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
7099 ++ struct bfq_group *bfqg, int is_sync,
7100 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
7101 ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
7102 ++ struct bfq_group *bfqg);
7103 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
7104 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
7105 ++
7106 ++#endif /* _BFQ_H */
7107 +--
7108 +1.9.1
7109 +
7110
7111 diff --git a/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch
7112 new file mode 100644
7113 index 0000000..305a5b0
7114 --- /dev/null
7115 +++ b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch
7116 @@ -0,0 +1,1220 @@
7117 +From 44efc3f611c09e049fe840e640c2bd2ccfde2148 Mon Sep 17 00:00:00 2001
7118 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
7119 +Date: Fri, 5 Jun 2015 17:45:40 +0200
7120 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r8 for
7121 + 4.3.0
7122 +
7123 +A set of processes may happen to perform interleaved reads, i.e., requests
7124 +whose union would give rise to a sequential read pattern. There are two
7125 +typical cases: in the first case, processes read fixed-size chunks of
7126 +data at a fixed distance from each other, while in the second case processes
7127 +may read variable-size chunks at variable distances. The latter case occurs
7128 +for example with QEMU, which splits the I/O generated by the guest into
7129 +multiple chunks, and lets these chunks be served by a pool of cooperating
7130 +processes, iteratively assigning the next chunk of I/O to the first
7131 +available process. CFQ uses actual queue merging for the first type of
7132 +processes, whereas it uses preemption to get a sequential read pattern out
7133 +of the read requests performed by the second type of processes. In the end
7134 +it uses two different mechanisms to achieve the same goal: boosting the
7135 +throughput with interleaved I/O.
7136 +
7137 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
7138 +sequential read pattern with both types of processes. The main idea is
7139 +checking newly arrived requests against the next request of the active queue
7140 +both in case of actual request insert and in case of request merge. By doing
7141 +so, both the types of processes can be handled by just merging their queues.
7142 +EQM is then simpler and more compact than the pair of mechanisms used in
7143 +CFQ.
7144 +
7145 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
7146 +properly restoring the weight-raising state of a queue when it gets back to
7147 +a non-merged state.
7148 +
7149 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
7150 +Signed-off-by: Arianna Avanzini <avanzini@××××××.com>
7151 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
7152 +---
7153 + block/bfq-iosched.c | 750 +++++++++++++++++++++++++++++++++++++---------------
7154 + block/bfq-sched.c | 28 --
7155 + block/bfq.h | 54 +++-
7156 + 3 files changed, 580 insertions(+), 252 deletions(-)
7157 +
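In outline, the unified check described in the message above reduces to a proximity test between the incoming I/O and either the in-service queue or a queue found in the position tree; a simplified editorial sketch follows (the real logic is in bfq_setup_cooperator() and its helpers in the hunks below):

	/* on request insert or bio merge for bfqq (io_struct = rq or bio) */
	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position))
		/* close to the in-service queue: try merging with it */
		new_bfqq = bfq_setup_merge(bfqq, bfqd->in_service_queue);
	else
		/* otherwise look for a close cooperator among scheduled queues */
		new_bfqq = bfq_close_cooperator(bfqd, bfqq,
				bfq_io_struct_pos(io_struct, request));

If new_bfqq is non-NULL, the requests of bfqq are then redirected to it (see bfq_merge_bfqqs() below).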
7158 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7159 +index 773b2ee..71b51c1 100644
7160 +--- a/block/bfq-iosched.c
7161 ++++ b/block/bfq-iosched.c
7162 +@@ -573,6 +573,57 @@ static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
7163 + return dur;
7164 + }
7165 +
7166 ++static inline unsigned
7167 ++bfq_bfqq_cooperations(struct bfq_queue *bfqq)
7168 ++{
7169 ++ return bfqq->bic ? bfqq->bic->cooperations : 0;
7170 ++}
7171 ++
7172 ++static inline void
7173 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
7174 ++{
7175 ++ if (bic->saved_idle_window)
7176 ++ bfq_mark_bfqq_idle_window(bfqq);
7177 ++ else
7178 ++ bfq_clear_bfqq_idle_window(bfqq);
7179 ++ if (bic->saved_IO_bound)
7180 ++ bfq_mark_bfqq_IO_bound(bfqq);
7181 ++ else
7182 ++ bfq_clear_bfqq_IO_bound(bfqq);
7183 ++ /* Assuming that the flag in_large_burst is already correctly set */
7184 ++ if (bic->wr_time_left && bfqq->bfqd->low_latency &&
7185 ++ !bfq_bfqq_in_large_burst(bfqq) &&
7186 ++ bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
7187 ++ /*
7188 ++ * Start a weight raising period with the duration given by
7189 ++		 * the wr_time_left snapshot.
7190 ++ */
7191 ++ if (bfq_bfqq_busy(bfqq))
7192 ++ bfqq->bfqd->wr_busy_queues++;
7193 ++ bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
7194 ++ bfqq->wr_cur_max_time = bic->wr_time_left;
7195 ++ bfqq->last_wr_start_finish = jiffies;
7196 ++ bfqq->entity.ioprio_changed = 1;
7197 ++ }
7198 ++ /*
7199 ++ * Clear wr_time_left to prevent bfq_bfqq_save_state() from
7200 ++ * getting confused about the queue's need of a weight-raising
7201 ++ * period.
7202 ++ */
7203 ++ bic->wr_time_left = 0;
7204 ++}
7205 ++
7206 ++/* Must be called with the queue_lock held. */
7207 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
7208 ++{
7209 ++ int process_refs, io_refs;
7210 ++
7211 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
7212 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
7213 ++ BUG_ON(process_refs < 0);
7214 ++ return process_refs;
7215 ++}
7216 ++
7217 + /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
7218 + static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
7219 + struct bfq_queue *bfqq)
7220 +@@ -817,7 +868,7 @@ static void bfq_add_request(struct request *rq)
7221 + bfq_rq_pos_tree_add(bfqd, bfqq);
7222 +
7223 + if (!bfq_bfqq_busy(bfqq)) {
7224 +- bool soft_rt,
7225 ++ bool soft_rt, coop_or_in_burst,
7226 + idle_for_long_time = time_is_before_jiffies(
7227 + bfqq->budget_timeout +
7228 + bfqd->bfq_wr_min_idle_time);
7229 +@@ -841,11 +892,12 @@ static void bfq_add_request(struct request *rq)
7230 + bfqd->last_ins_in_burst = jiffies;
7231 + }
7232 +
7233 ++ coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||
7234 ++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;
7235 + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
7236 +- !bfq_bfqq_in_large_burst(bfqq) &&
7237 ++ !coop_or_in_burst &&
7238 + time_is_before_jiffies(bfqq->soft_rt_next_start);
7239 +- interactive = !bfq_bfqq_in_large_burst(bfqq) &&
7240 +- idle_for_long_time;
7241 ++ interactive = !coop_or_in_burst && idle_for_long_time;
7242 + entity->budget = max_t(unsigned long, bfqq->max_budget,
7243 + bfq_serv_to_charge(next_rq, bfqq));
7244 +
7245 +@@ -864,11 +916,20 @@ static void bfq_add_request(struct request *rq)
7246 + if (!bfqd->low_latency)
7247 + goto add_bfqq_busy;
7248 +
7249 ++ if (bfq_bfqq_just_split(bfqq))
7250 ++ goto set_ioprio_changed;
7251 ++
7252 + /*
7253 +- * If the queue is not being boosted and has been idle
7254 +- * for enough time, start a weight-raising period
7255 ++ * If the queue:
7256 ++ * - is not being boosted,
7257 ++ * - has been idle for enough time,
7258 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
7259 ++ * shared "for its nature" or it is not shared and its
7260 ++ * requests have not been redirected to a shared queue)
7261 ++ * start a weight-raising period.
7262 + */
7263 +- if (old_wr_coeff == 1 && (interactive || soft_rt)) {
7264 ++ if (old_wr_coeff == 1 && (interactive || soft_rt) &&
7265 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
7266 + bfqq->wr_coeff = bfqd->bfq_wr_coeff;
7267 + if (interactive)
7268 + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
7269 +@@ -882,7 +943,7 @@ static void bfq_add_request(struct request *rq)
7270 + } else if (old_wr_coeff > 1) {
7271 + if (interactive)
7272 + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
7273 +- else if (bfq_bfqq_in_large_burst(bfqq) ||
7274 ++ else if (coop_or_in_burst ||
7275 + (bfqq->wr_cur_max_time ==
7276 + bfqd->bfq_wr_rt_max_time &&
7277 + !soft_rt)) {
7278 +@@ -901,18 +962,18 @@ static void bfq_add_request(struct request *rq)
7279 + /*
7280 + *
7281 + * The remaining weight-raising time is lower
7282 +- * than bfqd->bfq_wr_rt_max_time, which
7283 +- * means that the application is enjoying
7284 +- * weight raising either because deemed soft-
7285 +- * rt in the near past, or because deemed
7286 +- * interactive a long ago. In both cases,
7287 +- * resetting now the current remaining weight-
7288 +- * raising time for the application to the
7289 +- * weight-raising duration for soft rt
7290 +- * applications would not cause any latency
7291 +- * increase for the application (as the new
7292 +- * duration would be higher than the remaining
7293 +- * time).
7294 ++ * than bfqd->bfq_wr_rt_max_time, which means
7295 ++ * that the application is enjoying weight
7296 ++ * raising either because deemed soft-rt in
7297 ++ * the near past, or because deemed interactive
7298 ++ * a long ago.
7299 ++ * In both cases, resetting now the current
7300 ++ * remaining weight-raising time for the
7301 ++ * application to the weight-raising duration
7302 ++ * for soft rt applications would not cause any
7303 ++ * latency increase for the application (as the
7304 ++ * new duration would be higher than the
7305 ++ * remaining time).
7306 + *
7307 + * In addition, the application is now meeting
7308 + * the requirements for being deemed soft rt.
7309 +@@ -947,6 +1008,7 @@ static void bfq_add_request(struct request *rq)
7310 + bfqd->bfq_wr_rt_max_time;
7311 + }
7312 + }
7313 ++set_ioprio_changed:
7314 + if (old_wr_coeff != bfqq->wr_coeff)
7315 + entity->ioprio_changed = 1;
7316 + add_bfqq_busy:
7317 +@@ -1167,90 +1229,35 @@ static void bfq_end_wr(struct bfq_data *bfqd)
7318 + spin_unlock_irq(bfqd->queue->queue_lock);
7319 + }
7320 +
7321 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
7322 +- struct bio *bio)
7323 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
7324 + {
7325 +- struct bfq_data *bfqd = q->elevator->elevator_data;
7326 +- struct bfq_io_cq *bic;
7327 +- struct bfq_queue *bfqq;
7328 +-
7329 +- /*
7330 +- * Disallow merge of a sync bio into an async request.
7331 +- */
7332 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
7333 +- return 0;
7334 +-
7335 +- /*
7336 +- * Lookup the bfqq that this bio will be queued with. Allow
7337 +- * merge only if rq is queued there.
7338 +- * Queue lock is held here.
7339 +- */
7340 +- bic = bfq_bic_lookup(bfqd, current->io_context);
7341 +- if (bic == NULL)
7342 +- return 0;
7343 +-
7344 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7345 +- return bfqq == RQ_BFQQ(rq);
7346 +-}
7347 +-
7348 +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
7349 +- struct bfq_queue *bfqq)
7350 +-{
7351 +- if (bfqq != NULL) {
7352 +- bfq_mark_bfqq_must_alloc(bfqq);
7353 +- bfq_mark_bfqq_budget_new(bfqq);
7354 +- bfq_clear_bfqq_fifo_expire(bfqq);
7355 +-
7356 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
7357 +-
7358 +- bfq_log_bfqq(bfqd, bfqq,
7359 +- "set_in_service_queue, cur-budget = %lu",
7360 +- bfqq->entity.budget);
7361 +- }
7362 +-
7363 +- bfqd->in_service_queue = bfqq;
7364 +-}
7365 +-
7366 +-/*
7367 +- * Get and set a new queue for service.
7368 +- */
7369 +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
7370 +- struct bfq_queue *bfqq)
7371 +-{
7372 +- if (!bfqq)
7373 +- bfqq = bfq_get_next_queue(bfqd);
7374 ++ if (request)
7375 ++ return blk_rq_pos(io_struct);
7376 + else
7377 +- bfq_get_next_queue_forced(bfqd, bfqq);
7378 +-
7379 +- __bfq_set_in_service_queue(bfqd, bfqq);
7380 +- return bfqq;
7381 ++ return ((struct bio *)io_struct)->bi_iter.bi_sector;
7382 + }
7383 +
7384 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
7385 +- struct request *rq)
7386 ++static inline sector_t bfq_dist_from(sector_t pos1,
7387 ++ sector_t pos2)
7388 + {
7389 +- if (blk_rq_pos(rq) >= bfqd->last_position)
7390 +- return blk_rq_pos(rq) - bfqd->last_position;
7391 ++ if (pos1 >= pos2)
7392 ++ return pos1 - pos2;
7393 + else
7394 +- return bfqd->last_position - blk_rq_pos(rq);
7395 ++ return pos2 - pos1;
7396 + }
7397 +
7398 +-/*
7399 +- * Return true if bfqq has no request pending and rq is close enough to
7400 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
7401 +- * bfqq->next_rq
7402 +- */
7403 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
7404 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
7405 ++ sector_t sector)
7406 + {
7407 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
7408 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
7409 ++ BFQQ_SEEK_THR;
7410 + }
7411 +
7412 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7413 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
7414 + {
7415 + struct rb_root *root = &bfqd->rq_pos_tree;
7416 + struct rb_node *parent, *node;
7417 + struct bfq_queue *__bfqq;
7418 +- sector_t sector = bfqd->last_position;
7419 +
7420 + if (RB_EMPTY_ROOT(root))
7421 + return NULL;
7422 +@@ -1269,7 +1276,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7423 + * next_request position).
7424 + */
7425 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7426 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
7427 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
7428 + return __bfqq;
7429 +
7430 + if (blk_rq_pos(__bfqq->next_rq) < sector)
7431 +@@ -1280,7 +1287,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7432 + return NULL;
7433 +
7434 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
7435 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
7436 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
7437 + return __bfqq;
7438 +
7439 + return NULL;
7440 +@@ -1289,14 +1296,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7441 + /*
7442 + * bfqd - obvious
7443 + * cur_bfqq - passed in so that we don't decide that the current queue
7444 +- * is closely cooperating with itself.
7445 +- *
7446 +- * We are assuming that cur_bfqq has dispatched at least one request,
7447 +- * and that bfqd->last_position reflects a position on the disk associated
7448 +- * with the I/O issued by cur_bfqq.
7449 ++ * is closely cooperating with itself
7450 ++ * sector - used as a reference point to search for a close queue
7451 + */
7452 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
7453 +- struct bfq_queue *cur_bfqq)
7454 ++ struct bfq_queue *cur_bfqq,
7455 ++ sector_t sector)
7456 + {
7457 + struct bfq_queue *bfqq;
7458 +
7459 +@@ -1316,7 +1321,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
7460 + * working closely on the same area of the disk. In that case,
7461 + * we can group them together and don't waste time idling.
7462 + */
7463 +- bfqq = bfqq_close(bfqd);
7464 ++ bfqq = bfqq_close(bfqd, sector);
7465 + if (bfqq == NULL || bfqq == cur_bfqq)
7466 + return NULL;
7467 +
7468 +@@ -1343,6 +1348,315 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
7469 + return bfqq;
7470 + }
7471 +
7472 ++static struct bfq_queue *
7473 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
7474 ++{
7475 ++ int process_refs, new_process_refs;
7476 ++ struct bfq_queue *__bfqq;
7477 ++
7478 ++ /*
7479 ++ * If there are no process references on the new_bfqq, then it is
7480 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
7481 ++ * may have dropped their last reference (not just their last process
7482 ++ * reference).
7483 ++ */
7484 ++ if (!bfqq_process_refs(new_bfqq))
7485 ++ return NULL;
7486 ++
7487 ++ /* Avoid a circular list and skip interim queue merges. */
7488 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
7489 ++ if (__bfqq == bfqq)
7490 ++ return NULL;
7491 ++ new_bfqq = __bfqq;
7492 ++ }
7493 ++
7494 ++ process_refs = bfqq_process_refs(bfqq);
7495 ++ new_process_refs = bfqq_process_refs(new_bfqq);
7496 ++ /*
7497 ++ * If the process for the bfqq has gone away, there is no
7498 ++ * sense in merging the queues.
7499 ++ */
7500 ++ if (process_refs == 0 || new_process_refs == 0)
7501 ++ return NULL;
7502 ++
7503 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
7504 ++ new_bfqq->pid);
7505 ++
7506 ++ /*
7507 ++ * Merging is just a redirection: the requests of the process
7508 ++ * owning one of the two queues are redirected to the other queue.
7509 ++ * The latter queue, in its turn, is set as shared if this is the
7510 ++ * first time that the requests of some process are redirected to
7511 ++ * it.
7512 ++ *
7513 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
7514 ++ * are in the context of the process owning bfqq, hence we have
7515 ++ * the io_cq of this process. So we can immediately configure this
7516 ++ * io_cq to redirect the requests of the process to new_bfqq.
7517 ++ *
7518 ++ * NOTE, even if new_bfqq coincides with the in-service queue, the
7519 ++ * io_cq of new_bfqq is not available, because, if the in-service
7520 ++ * queue is shared, bfqd->in_service_bic may not point to the
7521 ++ * io_cq of the in-service queue.
7522 ++ * Redirecting the requests of the process owning bfqq to the
7523 ++ * currently in-service queue is in any case the best option, as
7524 ++ * we feed the in-service queue with new requests close to the
7525 ++ * last request served and, by doing so, hopefully increase the
7526 ++ * throughput.
7527 ++ */
7528 ++ bfqq->new_bfqq = new_bfqq;
7529 ++ atomic_add(process_refs, &new_bfqq->ref);
7530 ++ return new_bfqq;
7531 ++}
7532 ++
7533 ++/*
7534 ++ * Attempt to schedule a merge of bfqq with the currently in-service queue
7535 ++ * or with a close queue among the scheduled queues.
7536 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
7537 ++ * structure otherwise.
7538 ++ *
7539 ++ * The OOM queue is not allowed to participate in cooperation: in fact, since
7540 ++ * the requests temporarily redirected to the OOM queue could be redirected
7541 ++ * again to dedicated queues at any time, the state needed to correctly
7542 ++ * handle merging with the OOM queue would be quite complex and expensive
7543 ++ * to maintain. Besides, in such a critical condition as an out of memory,
7544 ++ * the benefits of queue merging may be of little relevance, or even negligible.
7545 ++ */
7546 ++static struct bfq_queue *
7547 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
7548 ++ void *io_struct, bool request)
7549 ++{
7550 ++ struct bfq_queue *in_service_bfqq, *new_bfqq;
7551 ++
7552 ++ if (bfqq->new_bfqq)
7553 ++ return bfqq->new_bfqq;
7554 ++
7555 ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
7556 ++ return NULL;
7557 ++
7558 ++ in_service_bfqq = bfqd->in_service_queue;
7559 ++
7560 ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
7561 ++ !bfqd->in_service_bic ||
7562 ++ unlikely(in_service_bfqq == &bfqd->oom_bfqq))
7563 ++ goto check_scheduled;
7564 ++
7565 ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
7566 ++ goto check_scheduled;
7567 ++
7568 ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
7569 ++ goto check_scheduled;
7570 ++
7571 ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
7572 ++ goto check_scheduled;
7573 ++
7574 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
7575 ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
7576 ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
7577 ++ if (new_bfqq != NULL)
7578 ++ return new_bfqq; /* Merge with in-service queue */
7579 ++ }
7580 ++
7581 ++ /*
7582 ++ * Check whether there is a cooperator among currently scheduled
7583 ++ * queues. The only thing we need is that the bio/request is not
7584 ++ * NULL, as we need it to establish whether a cooperator exists.
7585 ++ */
7586 ++check_scheduled:
7587 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
7588 ++ bfq_io_struct_pos(io_struct, request));
7589 ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))
7590 ++ return bfq_setup_merge(bfqq, new_bfqq);
7591 ++
7592 ++ return NULL;
7593 ++}
7594 ++
7595 ++static inline void
7596 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
7597 ++{
7598 ++ /*
7599 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
7600 ++ * have already been redirected to a shared queue; both idle window
7601 ++ * and weight raising state have already been saved. Do nothing.
7602 ++ */
7603 ++ if (bfqq->bic == NULL)
7604 ++ return;
7605 ++ if (bfqq->bic->wr_time_left)
7606 ++ /*
7607 ++ * This is the queue of a just-started process, and would
7608 ++ * deserve weight raising: we set wr_time_left to the full
7609 ++ * weight-raising duration to trigger weight-raising when
7610 ++ * and if the queue is split and the first request of the
7611 ++ * queue is enqueued.
7612 ++ */
7613 ++ bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);
7614 ++ else if (bfqq->wr_coeff > 1) {
7615 ++ unsigned long wr_duration =
7616 ++ jiffies - bfqq->last_wr_start_finish;
7617 ++ /*
7618 ++ * It may happen that a queue's weight raising period lasts
7619 ++ * longer than its wr_cur_max_time, as weight raising is
7620 ++ * handled only when a request is enqueued or dispatched (it
7621 ++ * does not use any timer). If the weight raising period is
7622 ++ * about to end, don't save it.
7623 ++ */
7624 ++ if (bfqq->wr_cur_max_time <= wr_duration)
7625 ++ bfqq->bic->wr_time_left = 0;
7626 ++ else
7627 ++ bfqq->bic->wr_time_left =
7628 ++ bfqq->wr_cur_max_time - wr_duration;
7629 ++ /*
7630 ++ * The bfq_queue is becoming shared or the requests of the
7631 ++ * process owning the queue are being redirected to a shared
7632 ++ * queue. Stop the weight raising period of the queue, as in
7633 ++ * both cases it should not be owned by an interactive or
7634 ++ * soft real-time application.
7635 ++ */
7636 ++ bfq_bfqq_end_wr(bfqq);
7637 ++ } else
7638 ++ bfqq->bic->wr_time_left = 0;
7639 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
7640 ++ bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
7641 ++ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
7642 ++ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
7643 ++ bfqq->bic->cooperations++;
7644 ++ bfqq->bic->failed_cooperations = 0;
7645 ++}
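
The saved value computed in bfq_bfqq_save_state() above boils down to simple arithmetic: the remaining weight-raising time is wr_cur_max_time minus the time elapsed since last_wr_start_finish, clamped at zero. A minimal standalone sketch of that computation follows; it is not part of the patch, and jiffies and the bfqq fields are replaced by plain placeholder parameters.

#include <assert.h>

/*
 * Sketch only: remaining weight-raising time, clamped at zero.
 * 'now' and 'wr_start' stand in for jiffies timestamps, 'wr_max'
 * for bfqq->wr_cur_max_time.
 */
static unsigned long remaining_wr_time(unsigned long now,
                                       unsigned long wr_start,
                                       unsigned long wr_max)
{
        unsigned long elapsed = now - wr_start;

        /* Raising period already over (or about to end): nothing left. */
        if (wr_max <= elapsed)
                return 0;
        return wr_max - elapsed;
}

int main(void)
{
        /* 3000 ticks into a 7500-tick period: 4500 ticks left. */
        assert(remaining_wr_time(13000, 10000, 7500) == 4500);
        /* Period exceeded: nothing to save. */
        assert(remaining_wr_time(20000, 10000, 7500) == 0);
        return 0;
}
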
7646 ++
7647 ++static inline void
7648 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
7649 ++{
7650 ++ /*
7651 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
7652 ++ * is about to begin using a shared bfq_queue.
7653 ++ */
7654 ++ if (bfqq->bic)
7655 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
7656 ++}
7657 ++
7658 ++static void
7659 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
7660 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
7661 ++{
7662 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
7663 ++ (long unsigned)new_bfqq->pid);
7664 ++ /* Save weight raising and idle window of the merged queues */
7665 ++ bfq_bfqq_save_state(bfqq);
7666 ++ bfq_bfqq_save_state(new_bfqq);
7667 ++ if (bfq_bfqq_IO_bound(bfqq))
7668 ++ bfq_mark_bfqq_IO_bound(new_bfqq);
7669 ++ bfq_clear_bfqq_IO_bound(bfqq);
7670 ++ /*
7671 ++ * Grab a reference to the bic, to prevent it from being destroyed
7672 ++ * before being possibly touched by a bfq_split_bfqq().
7673 ++ */
7674 ++ bfq_get_bic_reference(bfqq);
7675 ++ bfq_get_bic_reference(new_bfqq);
7676 ++ /*
7677 ++ * Merge queues (that is, let bic redirect its requests to new_bfqq)
7678 ++ */
7679 ++ bic_set_bfqq(bic, new_bfqq, 1);
7680 ++ bfq_mark_bfqq_coop(new_bfqq);
7681 ++ /*
7682 ++ * new_bfqq now belongs to at least two bics (it is a shared queue):
7683 ++ * set new_bfqq->bic to NULL. bfqq either:
7684 ++ * - does not belong to any bic any more, and hence bfqq->bic must
7685 ++ * be set to NULL, or
7686 ++ * - is a queue whose owning bics have already been redirected to a
7687 ++ * different queue, hence the queue is destined to not belong to
7688 ++ * any bic soon and bfqq->bic is already NULL (therefore the next
7689 ++ * assignment causes no harm).
7690 ++ */
7691 ++ new_bfqq->bic = NULL;
7692 ++ bfqq->bic = NULL;
7693 ++ bfq_put_queue(bfqq);
7694 ++}
7695 ++
7696 ++static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
7697 ++{
7698 ++ struct bfq_io_cq *bic = bfqq->bic;
7699 ++ struct bfq_data *bfqd = bfqq->bfqd;
7700 ++
7701 ++ if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {
7702 ++ bic->failed_cooperations++;
7703 ++ if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)
7704 ++ bic->cooperations = 0;
7705 ++ }
7706 ++}
7707 ++
7708 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
7709 ++ struct bio *bio)
7710 ++{
7711 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7712 ++ struct bfq_io_cq *bic;
7713 ++ struct bfq_queue *bfqq, *new_bfqq;
7714 ++
7715 ++ /*
7716 ++ * Disallow merge of a sync bio into an async request.
7717 ++ */
7718 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
7719 ++ return 0;
7720 ++
7721 ++ /*
7722 ++ * Look up the bfqq that this bio will be queued with. Allow
7723 ++ * merge only if rq is queued there.
7724 ++ * Queue lock is held here.
7725 ++ */
7726 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
7727 ++ if (bic == NULL)
7728 ++ return 0;
7729 ++
7730 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7731 ++ /*
7732 ++ * We take advantage of this function to perform an early merge
7733 ++ * of the queues of possibly cooperating processes.
7734 ++ */
7735 ++ if (bfqq != NULL) {
7736 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
7737 ++ if (new_bfqq != NULL) {
7738 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
7739 ++ /*
7740 ++ * If we get here, the bio will be queued in the
7741 ++ * shared queue, i.e., new_bfqq, so use new_bfqq
7742 ++ * to decide whether bio and rq can be merged.
7743 ++ */
7744 ++ bfqq = new_bfqq;
7745 ++ } else
7746 ++ bfq_bfqq_increase_failed_cooperations(bfqq);
7747 ++ }
7748 ++
7749 ++ return bfqq == RQ_BFQQ(rq);
7750 ++}
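
In short, bfq_allow_merge() above permits the merge only when the bio would end up in the same bfq_queue that the request already belongs to; with EQM, the bio's queue may first be redirected to a shared queue, and the comparison is then made against that shared queue. A tiny sketch of that decision, with illustrative types and names that are not from the patch:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for a bfq_queue pointer. */
struct queue { int id; };

/*
 * Sketch: if an early merge redirected the bio's queue to a shared
 * queue, compare against the shared queue; otherwise compare against
 * the bio's original queue.
 */
static bool allow_merge(struct queue *bio_q, struct queue *shared_q,
                        struct queue *rq_q)
{
        struct queue *effective = shared_q ? shared_q : bio_q;

        return effective == rq_q;
}

int main(void)
{
        struct queue a = { 1 }, b = { 2 }, shared = { 3 };

        printf("%d\n", allow_merge(&a, NULL, &b));         /* 0: different queues    */
        printf("%d\n", allow_merge(&a, &shared, &shared)); /* 1: both end up shared  */
        return 0;
}
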
7751 ++
7752 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
7753 ++ struct bfq_queue *bfqq)
7754 ++{
7755 ++ if (bfqq != NULL) {
7756 ++ bfq_mark_bfqq_must_alloc(bfqq);
7757 ++ bfq_mark_bfqq_budget_new(bfqq);
7758 ++ bfq_clear_bfqq_fifo_expire(bfqq);
7759 ++
7760 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
7761 ++
7762 ++ bfq_log_bfqq(bfqd, bfqq,
7763 ++ "set_in_service_queue, cur-budget = %lu",
7764 ++ bfqq->entity.budget);
7765 ++ }
7766 ++
7767 ++ bfqd->in_service_queue = bfqq;
7768 ++}
7769 ++
7770 ++/*
7771 ++ * Get and set a new queue for service.
7772 ++ */
7773 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
7774 ++{
7775 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
7776 ++
7777 ++ __bfq_set_in_service_queue(bfqd, bfqq);
7778 ++ return bfqq;
7779 ++}
7780 ++
7781 + /*
7782 + * If enough samples have been computed, return the current max budget
7783 + * stored in bfqd, which is dynamically updated according to the
7784 +@@ -1488,61 +1802,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
7785 + return rq;
7786 + }
7787 +
7788 +-/* Must be called with the queue_lock held. */
7789 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
7790 +-{
7791 +- int process_refs, io_refs;
7792 +-
7793 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
7794 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
7795 +- BUG_ON(process_refs < 0);
7796 +- return process_refs;
7797 +-}
7798 +-
7799 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
7800 +-{
7801 +- int process_refs, new_process_refs;
7802 +- struct bfq_queue *__bfqq;
7803 +-
7804 +- /*
7805 +- * If there are no process references on the new_bfqq, then it is
7806 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
7807 +- * may have dropped their last reference (not just their last process
7808 +- * reference).
7809 +- */
7810 +- if (!bfqq_process_refs(new_bfqq))
7811 +- return;
7812 +-
7813 +- /* Avoid a circular list and skip interim queue merges. */
7814 +- while ((__bfqq = new_bfqq->new_bfqq)) {
7815 +- if (__bfqq == bfqq)
7816 +- return;
7817 +- new_bfqq = __bfqq;
7818 +- }
7819 +-
7820 +- process_refs = bfqq_process_refs(bfqq);
7821 +- new_process_refs = bfqq_process_refs(new_bfqq);
7822 +- /*
7823 +- * If the process for the bfqq has gone away, there is no
7824 +- * sense in merging the queues.
7825 +- */
7826 +- if (process_refs == 0 || new_process_refs == 0)
7827 +- return;
7828 +-
7829 +- /*
7830 +- * Merge in the direction of the lesser amount of work.
7831 +- */
7832 +- if (new_process_refs >= process_refs) {
7833 +- bfqq->new_bfqq = new_bfqq;
7834 +- atomic_add(process_refs, &new_bfqq->ref);
7835 +- } else {
7836 +- new_bfqq->new_bfqq = bfqq;
7837 +- atomic_add(new_process_refs, &bfqq->ref);
7838 +- }
7839 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
7840 +- new_bfqq->pid);
7841 +-}
7842 +-
7843 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
7844 + {
7845 + struct bfq_entity *entity = &bfqq->entity;
7846 +@@ -2269,7 +2528,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
7847 + */
7848 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
7849 + {
7850 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
7851 ++ struct bfq_queue *bfqq;
7852 + struct request *next_rq;
7853 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
7854 +
7855 +@@ -2279,17 +2538,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
7856 +
7857 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
7858 +
7859 +- /*
7860 +- * If another queue has a request waiting within our mean seek
7861 +- * distance, let it run. The expire code will check for close
7862 +- * cooperators and put the close queue at the front of the
7863 +- * service tree. If possible, merge the expiring queue with the
7864 +- * new bfqq.
7865 +- */
7866 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
7867 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
7868 +- bfq_setup_merge(bfqq, new_bfqq);
7869 +-
7870 + if (bfq_may_expire_for_budg_timeout(bfqq) &&
7871 + !timer_pending(&bfqd->idle_slice_timer) &&
7872 + !bfq_bfqq_must_idle(bfqq))
7873 +@@ -2328,10 +2576,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
7874 + bfq_clear_bfqq_wait_request(bfqq);
7875 + del_timer(&bfqd->idle_slice_timer);
7876 + }
7877 +- if (new_bfqq == NULL)
7878 +- goto keep_queue;
7879 +- else
7880 +- goto expire;
7881 ++ goto keep_queue;
7882 + }
7883 + }
7884 +
7885 +@@ -2340,40 +2585,30 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
7886 + * for a new request, or has requests waiting for a completion and
7887 + * may idle after their completion, then keep it anyway.
7888 + */
7889 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
7890 +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
7891 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
7892 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
7893 + bfqq = NULL;
7894 + goto keep_queue;
7895 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
7896 +- /*
7897 +- * Expiring the queue because there is a close cooperator,
7898 +- * cancel timer.
7899 +- */
7900 +- bfq_clear_bfqq_wait_request(bfqq);
7901 +- del_timer(&bfqd->idle_slice_timer);
7902 + }
7903 +
7904 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
7905 + expire:
7906 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
7907 + new_queue:
7908 +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
7909 ++ bfqq = bfq_set_in_service_queue(bfqd);
7910 + bfq_log(bfqd, "select_queue: new queue %d returned",
7911 + bfqq != NULL ? bfqq->pid : 0);
7912 + keep_queue:
7913 + return bfqq;
7914 + }
7915 +
7916 +-static void bfq_update_wr_data(struct bfq_data *bfqd,
7917 +- struct bfq_queue *bfqq)
7918 ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7919 + {
7920 +- if (bfqq->wr_coeff > 1) { /* queue is being boosted */
7921 +- struct bfq_entity *entity = &bfqq->entity;
7922 +-
7923 ++ struct bfq_entity *entity = &bfqq->entity;
7924 ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
7925 + bfq_log_bfqq(bfqd, bfqq,
7926 + "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
7927 +- jiffies_to_msecs(jiffies -
7928 +- bfqq->last_wr_start_finish),
7929 ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
7930 + jiffies_to_msecs(bfqq->wr_cur_max_time),
7931 + bfqq->wr_coeff,
7932 + bfqq->entity.weight, bfqq->entity.orig_weight);
7933 +@@ -2382,12 +2617,16 @@ static void bfq_update_wr_data(struct bfq_data *bfqd,
7934 + entity->orig_weight * bfqq->wr_coeff);
7935 + if (entity->ioprio_changed)
7936 + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
7937 ++
7938 + /*
7939 + * If the queue was activated in a burst, or
7940 + * too much time has elapsed from the beginning
7941 +- * of this weight-raising, then end weight raising.
7942 ++ * of this weight-raising period, or the queue has
7943 ++ * exceeded the acceptable number of cooperations,
7944 ++ * then end weight raising.
7945 + */
7946 + if (bfq_bfqq_in_large_burst(bfqq) ||
7947 ++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
7948 + time_is_before_jiffies(bfqq->last_wr_start_finish +
7949 + bfqq->wr_cur_max_time)) {
7950 + bfqq->last_wr_start_finish = jiffies;
7951 +@@ -2396,11 +2635,13 @@ static void bfq_update_wr_data(struct bfq_data *bfqd,
7952 + bfqq->last_wr_start_finish,
7953 + jiffies_to_msecs(bfqq->wr_cur_max_time));
7954 + bfq_bfqq_end_wr(bfqq);
7955 +- __bfq_entity_update_weight_prio(
7956 +- bfq_entity_service_tree(entity),
7957 +- entity);
7958 + }
7959 + }
7960 ++ /* Update weight both if it must be raised and if it must be lowered */
7961 ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
7962 ++ __bfq_entity_update_weight_prio(
7963 ++ bfq_entity_service_tree(entity),
7964 ++ entity);
7965 + }
7966 +
7967 + /*
7968 +@@ -2647,6 +2888,25 @@ static inline void bfq_init_icq(struct io_cq *icq)
7969 + struct bfq_io_cq *bic = icq_to_bic(icq);
7970 +
7971 + bic->ttime.last_end_request = jiffies;
7972 ++ /*
7973 ++ * A newly created bic indicates that the process has just
7974 ++ * started doing I/O, and is probably mapping its executable and
7975 ++ * libraries into memory: it definitely needs weight raising.
7976 ++ * There is however the possibility that the process performs,
7977 ++ * for a while, I/O close to some other process. EQM intercepts
7978 ++ * this behavior and may merge the queue corresponding to the
7979 ++ * process with some other queue, BEFORE the weight of the queue
7980 ++ * is raised. Merged queues are not weight-raised (they are assumed
7981 ++ * to belong to processes that benefit only from high throughput).
7982 ++ * If the merge is basically the consequence of an accident, then
7983 ++ * the queue will be split soon and will get back its old weight.
7984 ++ * It is then important to record somewhere that this queue does
7985 ++ * need weight raising, even if it did not get its weight raised
7986 ++ * before being merged. To this end, we overload the field
7987 ++ * wr_time_left and assign 1 to it, to mark the queue as needing
7988 ++ * weight raising.
7989 ++ */
7990 ++ bic->wr_time_left = 1;
7991 + }
7992 +
7993 + static void bfq_exit_icq(struct io_cq *icq)
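
The value 1 assigned in bfq_init_icq() above is only a sentinel: any non-zero wr_time_left means "this queue deserves weight raising", and bfq_bfqq_save_state() later promotes it to a full weight-raising duration if the queue gets merged before raising actually starts. A minimal sketch of that sentinel pattern; the constant below is a placeholder for bfq_wr_duration(), not a value from the patch.

#include <stdio.h>

/* Placeholder standing in for bfq_wr_duration(bfqd). */
#define FULL_WR_DURATION 7500UL

/*
 * Sketch: a non-zero wr_time_left marks a queue as deserving weight
 * raising; on save, the mark is turned into a full raising duration.
 */
static unsigned long saved_wr_time(unsigned long wr_time_left)
{
        if (wr_time_left)       /* 1 from bic creation, or leftover time */
                return FULL_WR_DURATION;
        return 0;               /* raising already started and consumed  */
}

int main(void)
{
        printf("%lu\n", saved_wr_time(1));  /* 7500: newly created bic   */
        printf("%lu\n", saved_wr_time(0));  /* 0: no raising to restore  */
        return 0;
}
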
7994 +@@ -2660,6 +2920,13 @@ static void bfq_exit_icq(struct io_cq *icq)
7995 + }
7996 +
7997 + if (bic->bfqq[BLK_RW_SYNC]) {
7998 ++ /*
7999 ++ * If the bic is using a shared queue, put the reference
8000 ++ * taken on the io_context when the bic started using a
8001 ++ * shared bfq_queue.
8002 ++ */
8003 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
8004 ++ put_io_context(icq->ioc);
8005 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
8006 + bic->bfqq[BLK_RW_SYNC] = NULL;
8007 + }
8008 +@@ -2952,6 +3219,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
8009 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
8010 + return;
8011 +
8012 ++ /* Idle window just restored, statistics are meaningless. */
8013 ++ if (bfq_bfqq_just_split(bfqq))
8014 ++ return;
8015 ++
8016 + enable_idle = bfq_bfqq_idle_window(bfqq);
8017 +
8018 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
8019 +@@ -2999,6 +3270,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8020 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
8021 + !BFQQ_SEEKY(bfqq))
8022 + bfq_update_idle_window(bfqd, bfqq, bic);
8023 ++ bfq_clear_bfqq_just_split(bfqq);
8024 +
8025 + bfq_log_bfqq(bfqd, bfqq,
8026 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
8027 +@@ -3059,12 +3331,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8028 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
8029 + {
8030 + struct bfq_data *bfqd = q->elevator->elevator_data;
8031 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
8032 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
8033 +
8034 + assert_spin_locked(bfqd->queue->queue_lock);
8035 +
8036 ++ /*
8037 ++ * An unplug may trigger a requeue of a request from the device
8038 ++ * driver: make sure we are in process context while trying to
8039 ++ * merge two bfq_queues.
8040 ++ */
8041 ++ if (!in_interrupt()) {
8042 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
8043 ++ if (new_bfqq != NULL) {
8044 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
8045 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
8046 ++ /*
8047 ++ * Release the request's reference to the old bfqq
8048 ++ * and make sure one is taken on the shared queue.
8049 ++ */
8050 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
8051 ++ bfqq->allocated[rq_data_dir(rq)]--;
8052 ++ atomic_inc(&new_bfqq->ref);
8053 ++ bfq_put_queue(bfqq);
8054 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
8055 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
8056 ++ bfqq, new_bfqq);
8057 ++ rq->elv.priv[1] = new_bfqq;
8058 ++ bfqq = new_bfqq;
8059 ++ } else
8060 ++ bfq_bfqq_increase_failed_cooperations(bfqq);
8061 ++ }
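
The bookkeeping above simply moves the request's accounting from the old queue to the shared one: the allocation counter for the request's direction is transferred, a reference is taken on the new queue and the one held on the old queue is dropped. A self-contained sketch of that transfer, with a plain struct standing in for bfq_queue (names are illustrative only):

#include <assert.h>

/* Minimal stand-in for the per-queue bookkeeping touched above. */
struct q {
        int allocated[2];       /* [READ], [WRITE] */
        int ref;
};

/*
 * Sketch: move one request of direction 'dir' from 'oldq' to 'newq',
 * keeping both the allocation counters and the queue references
 * balanced.
 */
static void move_request(struct q *oldq, struct q *newq, int dir)
{
        newq->allocated[dir]++;
        oldq->allocated[dir]--;
        newq->ref++;            /* reference now held through the request */
        oldq->ref--;            /* reference previously held is dropped   */
}

int main(void)
{
        struct q a = { { 1, 0 }, 2 }, b = { { 0, 0 }, 1 };

        move_request(&a, &b, 0);
        assert(a.allocated[0] == 0 && b.allocated[0] == 1);
        assert(a.ref == 1 && b.ref == 2);
        return 0;
}
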
8062 ++
8063 + bfq_add_request(rq);
8064 +
8065 ++ /*
8066 ++ * Here a newly-created bfq_queue has already started a weight-raising
8067 ++ * period: clear wr_time_left to prevent bfq_bfqq_save_state()
8068 ++ * from assigning it a full weight-raising period. See the detailed
8069 ++ * comments about this field in bfq_init_icq().
8070 ++ */
8071 ++ if (bfqq->bic != NULL)
8072 ++ bfqq->bic->wr_time_left = 0;
8073 + rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
8074 + list_add_tail(&rq->queuelist, &bfqq->fifo);
8075 +
8076 +@@ -3226,18 +3533,6 @@ static void bfq_put_request(struct request *rq)
8077 + }
8078 + }
8079 +
8080 +-static struct bfq_queue *
8081 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
8082 +- struct bfq_queue *bfqq)
8083 +-{
8084 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
8085 +- (long unsigned)bfqq->new_bfqq->pid);
8086 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
8087 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
8088 +- bfq_put_queue(bfqq);
8089 +- return bic_to_bfqq(bic, 1);
8090 +-}
8091 +-
8092 + /*
8093 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
8094 + * was the last process referring to said bfqq.
8095 +@@ -3246,6 +3541,9 @@ static struct bfq_queue *
8096 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
8097 + {
8098 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
8099 ++
8100 ++ put_io_context(bic->icq.ioc);
8101 ++
8102 + if (bfqq_process_refs(bfqq) == 1) {
8103 + bfqq->pid = current->pid;
8104 + bfq_clear_bfqq_coop(bfqq);
8105 +@@ -3274,6 +3572,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
8106 + struct bfq_queue *bfqq;
8107 + struct bfq_group *bfqg;
8108 + unsigned long flags;
8109 ++ bool split = false;
8110 +
8111 + might_sleep_if(gfp_mask & __GFP_WAIT);
8112 +
8113 +@@ -3291,25 +3590,26 @@ new_queue:
8114 + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
8115 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
8116 + bic_set_bfqq(bic, bfqq, is_sync);
8117 ++ if (split && is_sync) {
8118 ++ if ((bic->was_in_burst_list && bfqd->large_burst) ||
8119 ++ bic->saved_in_large_burst)
8120 ++ bfq_mark_bfqq_in_large_burst(bfqq);
8121 ++ else {
8122 ++ bfq_clear_bfqq_in_large_burst(bfqq);
8123 ++ if (bic->was_in_burst_list)
8124 ++ hlist_add_head(&bfqq->burst_list_node,
8125 ++ &bfqd->burst_list);
8126 ++ }
8127 ++ }
8128 + } else {
8129 +- /*
8130 +- * If the queue was seeky for too long, break it apart.
8131 +- */
8132 ++ /* If the queue was seeky for too long, break it apart. */
8133 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
8134 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
8135 + bfqq = bfq_split_bfqq(bic, bfqq);
8136 ++ split = true;
8137 + if (!bfqq)
8138 + goto new_queue;
8139 + }
8140 +-
8141 +- /*
8142 +- * Check to see if this queue is scheduled to merge with
8143 +- * another closely cooperating queue. The merging of queues
8144 +- * happens here as it must be done in process context.
8145 +- * The reference on new_bfqq was taken in merge_bfqqs.
8146 +- */
8147 +- if (bfqq->new_bfqq != NULL)
8148 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
8149 + }
8150 +
8151 + bfqq->allocated[rw]++;
8152 +@@ -3320,6 +3620,26 @@ new_queue:
8153 + rq->elv.priv[0] = bic;
8154 + rq->elv.priv[1] = bfqq;
8155 +
8156 ++ /*
8157 ++ * If a bfq_queue has only one process reference, it is owned
8158 ++ * by only one bfq_io_cq: we can set the bic field of the
8159 ++ * bfq_queue to the address of that structure. Also, if the
8160 ++ * queue has just been split, mark a flag so that the
8161 ++ * information is available to the other scheduler hooks.
8162 ++ */
8163 ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
8164 ++ bfqq->bic = bic;
8165 ++ if (split) {
8166 ++ bfq_mark_bfqq_just_split(bfqq);
8167 ++ /*
8168 ++ * If the queue has just been split from a shared
8169 ++ * queue, restore the idle window and the possible
8170 ++ * weight raising period.
8171 ++ */
8172 ++ bfq_bfqq_resume_state(bfqq, bic);
8173 ++ }
8174 ++ }
8175 ++
8176 + spin_unlock_irqrestore(q->queue_lock, flags);
8177 +
8178 + return 0;
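
When a queue that had been merged is split again and ends up with a single process reference, the code above reattaches the bic and restores the per-process state saved at merge time: idle window, I/O-bound flag, burst membership, and the remaining weight-raising time, which (as the bfq.h comments below describe) is used as the duration of a new raising period. A compact sketch of what such a restore might look like; the structs and the coefficient value are placeholders, not the patch's own types.

#include <assert.h>
#include <stdbool.h>

/* Illustrative saved per-process state, loosely mirroring bfq_io_cq. */
struct snapshot {
        bool idle_window;
        bool io_bound;
        unsigned long wr_time_left;
};

/* Illustrative queue state touched by a merge/split cycle. */
struct queue {
        bool idle_window;
        bool io_bound;
        unsigned int wr_coeff;
        unsigned long wr_cur_max_time;
};

/*
 * Sketch: on split, flags are restored and, if some weight-raising
 * time was left at merge time, a new raising period of that duration
 * is started (the wr_coeff value is a placeholder).
 */
static void resume_state(struct queue *q, struct snapshot *s)
{
        q->idle_window = s->idle_window;
        q->io_bound = s->io_bound;
        if (s->wr_time_left) {
                q->wr_coeff = 20;               /* placeholder coefficient */
                q->wr_cur_max_time = s->wr_time_left;
                s->wr_time_left = 0;            /* snapshot is consumed    */
        }
}

int main(void)
{
        struct snapshot s = { true, true, 4500 };
        struct queue q = { false, false, 1, 0 };

        resume_state(&q, &s);
        assert(q.idle_window && q.wr_cur_max_time == 4500 && s.wr_time_left == 0);
        return 0;
}
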
8179 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
8180 +index c343099..d0890c6 100644
8181 +--- a/block/bfq-sched.c
8182 ++++ b/block/bfq-sched.c
8183 +@@ -1085,34 +1085,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
8184 + return bfqq;
8185 + }
8186 +
8187 +-/*
8188 +- * Forced extraction of the given queue.
8189 +- */
8190 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
8191 +- struct bfq_queue *bfqq)
8192 +-{
8193 +- struct bfq_entity *entity;
8194 +- struct bfq_sched_data *sd;
8195 +-
8196 +- BUG_ON(bfqd->in_service_queue != NULL);
8197 +-
8198 +- entity = &bfqq->entity;
8199 +- /*
8200 +- * Bubble up extraction/update from the leaf to the root.
8201 +- */
8202 +- for_each_entity(entity) {
8203 +- sd = entity->sched_data;
8204 +- bfq_update_budget(entity);
8205 +- bfq_update_vtime(bfq_entity_service_tree(entity));
8206 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
8207 +- sd->in_service_entity = entity;
8208 +- sd->next_in_service = NULL;
8209 +- entity->service = 0;
8210 +- }
8211 +-
8212 +- return;
8213 +-}
8214 +-
8215 + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
8216 + {
8217 + if (bfqd->in_service_bic != NULL) {
8218 +diff --git a/block/bfq.h b/block/bfq.h
8219 +index e350b5f..93d3f6e 100644
8220 +--- a/block/bfq.h
8221 ++++ b/block/bfq.h
8222 +@@ -218,18 +218,21 @@ struct bfq_group;
8223 + * idle @bfq_queue with no outstanding requests, then
8224 + * the task associated with the queue it is deemed as
8225 + * soft real-time (see the comments to the function
8226 +- * bfq_bfqq_softrt_next_start()).
8227 ++ * bfq_bfqq_softrt_next_start())
8228 + * @last_idle_bklogged: time of the last transition of the @bfq_queue from
8229 + * idle to backlogged
8230 + * @service_from_backlogged: cumulative service received from the @bfq_queue
8231 + * since the last transition from idle to
8232 + * backlogged
8233 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
8234 ++ * queue is shared
8235 + *
8236 +- * A bfq_queue is a leaf request queue; it can be associated with an io_context
8237 +- * or more, if it is async or shared between cooperating processes. @cgroup
8238 +- * holds a reference to the cgroup, to be sure that it does not disappear while
8239 +- * a bfqq still references it (mostly to avoid races between request issuing and
8240 +- * task migration followed by cgroup destruction).
8241 ++ * A bfq_queue is a leaf request queue; it can be associated with one
8242 ++ * or more io_contexts, if it is async or shared between cooperating
8243 ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it
8244 ++ * does not disappear while a bfqq still references it (mostly to avoid
8245 ++ * races between request issuing and task migration followed by cgroup
8246 ++ * destruction).
8247 + * All the fields are protected by the queue lock of the containing bfqd.
8248 + */
8249 + struct bfq_queue {
8250 +@@ -269,6 +272,7 @@ struct bfq_queue {
8251 + unsigned int requests_within_timer;
8252 +
8253 + pid_t pid;
8254 ++ struct bfq_io_cq *bic;
8255 +
8256 + /* weight-raising fields */
8257 + unsigned long wr_cur_max_time;
8258 +@@ -298,12 +302,42 @@ struct bfq_ttime {
8259 + * @icq: associated io_cq structure
8260 + * @bfqq: array of two process queues, the sync and the async
8261 + * @ttime: associated @bfq_ttime struct
8262 ++ * @wr_time_left: snapshot of the time left before weight raising ends
8263 ++ * for the sync queue associated with this process; this
8264 ++ * snapshot is taken to remember this value while the weight
8265 ++ * raising is suspended because the queue is merged with a
8266 ++ * shared queue, and is used to set @wr_cur_max_time
8267 ++ * when the queue is split from the shared queue and its
8268 ++ * weight is raised again
8269 ++ * @saved_idle_window: same purpose as the previous field for the idle
8270 ++ * window
8271 ++ * @saved_IO_bound: same purpose as the previous two fields for the I/O
8272 ++ * bound classification of a queue
8273 ++ * @saved_in_large_burst: same purpose as the previous fields for the
8274 ++ * value of the field keeping the queue's belonging
8275 ++ * to a large burst
8276 ++ * @was_in_burst_list: true if the queue belonged to a burst list
8277 ++ * before its merge with another cooperating queue
8278 ++ * @cooperations: counter of consecutive successful queue merges undergone
8279 ++ * by any of the process' @bfq_queues
8280 ++ * @failed_cooperations: counter of consecutive failed queue merges of any
8281 ++ * of the process' @bfq_queues
8282 + */
8283 + struct bfq_io_cq {
8284 + struct io_cq icq; /* must be the first member */
8285 + struct bfq_queue *bfqq[2];
8286 + struct bfq_ttime ttime;
8287 + int ioprio;
8288 ++
8289 ++ unsigned int wr_time_left;
8290 ++ bool saved_idle_window;
8291 ++ bool saved_IO_bound;
8292 ++
8293 ++ bool saved_in_large_burst;
8294 ++ bool was_in_burst_list;
8295 ++
8296 ++ unsigned int cooperations;
8297 ++ unsigned int failed_cooperations;
8298 + };
8299 +
8300 + enum bfq_device_speed {
8301 +@@ -536,7 +570,7 @@ enum bfqq_state_flags {
8302 + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
8303 + BFQ_BFQQ_FLAG_sync, /* synchronous queue */
8304 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
8305 +- BFQ_BFQQ_FLAG_IO_bound, /*
8306 ++ BFQ_BFQQ_FLAG_IO_bound, /*
8307 + * bfqq has timed-out at least once
8308 + * having consumed at most 2/10 of
8309 + * its budget
8310 +@@ -549,12 +583,13 @@ enum bfqq_state_flags {
8311 + * bfqq has proved to be slow and
8312 + * seeky until budget timeout
8313 + */
8314 +- BFQ_BFQQ_FLAG_softrt_update, /*
8315 ++ BFQ_BFQQ_FLAG_softrt_update, /*
8316 + * may need softrt-next-start
8317 + * update
8318 + */
8319 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
8320 +- BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
8321 ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
8322 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
8323 + };
8324 +
8325 + #define BFQ_BFQQ_FNS(name) \
8326 +@@ -583,6 +618,7 @@ BFQ_BFQQ_FNS(in_large_burst);
8327 + BFQ_BFQQ_FNS(constantly_seeky);
8328 + BFQ_BFQQ_FNS(coop);
8329 + BFQ_BFQQ_FNS(split_coop);
8330 ++BFQ_BFQQ_FNS(just_split);
8331 + BFQ_BFQQ_FNS(softrt_update);
8332 + #undef BFQ_BFQQ_FNS
8333 +
8334 +--
8335 +1.9.1
8336 +