[gentoo-commits] linux-patches r2724 - genpatches-2.6/trunk/3.14 - gentoo-commits

From:	"Mike Pagano (mpagano)" <mpagano@g.o>
To:	gentoo-commits@l.g.o
Subject:	[gentoo-commits] linux-patches r2724 - genpatches-2.6/trunk/3.14
Date:	Tue, 01 Apr 2014 12:32:11
Message-Id:	`20140401123206.6C2B82005E@flycatcher.gentoo.org`

1

Author: mpagano

2

Date: 2014-04-01 12:32:06 +0000 (Tue, 01 Apr 2014)

3

New Revision: 2724

4

5

Added:

6

   genpatches-2.6/trunk/3.14/5001_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r2-3.14.patch

7

   genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch

8

   genpatches-2.6/trunk/3.14/5003_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r2-for-3.14.0.patch

9

Modified:

10

   genpatches-2.6/trunk/3.14/0000_README

11

Log:

12

Add BFQ patchset v7r2

13

14

Modified: genpatches-2.6/trunk/3.14/0000_README

15

===================================================================

16

--- genpatches-2.6/trunk/3.14/0000_README	2014-03-31 19:23:02 UTC (rev 2723)

17

+++ genpatches-2.6/trunk/3.14/0000_README	2014-04-01 12:32:06 UTC (rev 2724)

18

@@ -77,3 +77,17 @@

19

 Patch:  4567_distro-Gentoo-Kconfig.patch

20

 From:   Tom Wijsman <TomWij@g.o>

21

 Desc:   Add Gentoo Linux support config settings and defaults.

22

+

23

+

24

+Patch:  5001_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r2-3.14.patch

25

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

26

+Desc:   BFQ v7r2 patch 1 for 3.14: Build, cgroups and kconfig bits

27

+

28

+Patch:  5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch

29

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

30

+Desc:   BFQ v7r2 patch 2 for 3.14: BFQ Scheduler

31

+

32

+Patch:  5003_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r2-for-3.14.0.patch

33

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

34

+Desc:   BFQ v7r2 patch 3 for 3.14: Early Queue Merge (EQM)

35

+

36

37

Added: genpatches-2.6/trunk/3.14/5001_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r2-3.14.patch

38

===================================================================

39

--- genpatches-2.6/trunk/3.14/5001_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r2-3.14.patch	                        (rev 0)

40

+++ genpatches-2.6/trunk/3.14/5001_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r2-3.14.patch	2014-04-01 12:32:06 UTC (rev 2724)

41

@@ -0,0 +1,104 @@

42

+From c3280db98437c9520f04ecacfdf1a868d7a4b7b3 Mon Sep 17 00:00:00 2001

43

+From: Paolo Valente <paolo.valente@×××××××.it>

44

+Date: Tue, 3 Sep 2013 16:50:42 +0200

45

+Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r2-3.14

46

+

47

+Update Kconfig.iosched and do the related Makefile changes to include

48

+kernel configuration options for BFQ. Also add the bfqio controller

49

+to the cgroups subsystem.

50

+

51

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

52

+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>

53

+---

54

+ block/Kconfig.iosched         | 32 ++++++++++++++++++++++++++++++++

55

+ block/Makefile                |  1 +

56

+ include/linux/cgroup_subsys.h |  4 ++++

57

+ 3 files changed, 37 insertions(+)

58

+

59

+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched

60

+index 421bef9..0ee5f0f 100644

61

+--- a/block/Kconfig.iosched

62

++++ b/block/Kconfig.iosched

63

+@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED

64

+ 	---help---

65

+ 	  Enable group IO scheduling in CFQ.

66

+

67

++config IOSCHED_BFQ

68

++	tristate "BFQ I/O scheduler"

69

++	default n

70

++	---help---

71

++	  The BFQ I/O scheduler tries to distribute bandwidth among

72

++	  all processes according to their weights.

73

++	  It aims at distributing the bandwidth as desired, independently of

74

++	  the disk parameters and with any workload. It also tries to

75

++	  guarantee low latency to interactive and soft real-time

76

++	  applications. If compiled built-in (saying Y here), BFQ can

77

++	  be configured to support hierarchical scheduling.

78

++

79

++config CGROUP_BFQIO

80

++	bool "BFQ hierarchical scheduling support"

81

++	depends on CGROUPS && IOSCHED_BFQ=y

82

++	default n

83

++	---help---

84

++	  Enable hierarchical scheduling in BFQ, using the cgroups

85

++	  filesystem interface.  The name of the subsystem will be

86

++	  bfqio.

87

++

88

+ choice

89

+ 	prompt "Default I/O scheduler"

90

+ 	default DEFAULT_CFQ

91

+@@ -52,6 +73,16 @@ choice

92

+ 	config DEFAULT_CFQ

93

+ 		bool "CFQ" if IOSCHED_CFQ=y

94

+

95

++	config DEFAULT_BFQ

96

++		bool "BFQ" if IOSCHED_BFQ=y

97

++		help

98

++		  Selects BFQ as the default I/O scheduler which will be

99

++		  used by default for all block devices.

100

++		  The BFQ I/O scheduler aims at distributing the bandwidth

101

++		  as desired, independently of the disk parameters and with

102

++		  any workload. It also tries to guarantee low latency to

103

++		  interactive and soft real-time applications.

104

++

105

+ 	config DEFAULT_NOOP

106

+ 		bool "No-op"

107

+

108

+@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED

109

+ 	string

110

+ 	default "deadline" if DEFAULT_DEADLINE

111

+ 	default "cfq" if DEFAULT_CFQ

112

++	default "bfq" if DEFAULT_BFQ

113

+ 	default "noop" if DEFAULT_NOOP

114

+

115

+ endmenu

116

+diff --git a/block/Makefile b/block/Makefile

117

+index 20645e8..cbd83fb 100644

118

+--- a/block/Makefile

119

++++ b/block/Makefile

120

+@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o

121

+ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o

122

+ obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o

123

+ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o

124

++obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o

125

+

126

+ obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o

127

+ obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o

128

+diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h

129

+index 7b99d71..4e8c0ff 100644

130

+--- a/include/linux/cgroup_subsys.h

131

++++ b/include/linux/cgroup_subsys.h

132

+@@ -39,6 +39,10 @@ SUBSYS(net_cls)

133

+ SUBSYS(blkio)

134

+ #endif

135

+

136

++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)

137

++SUBSYS(bfqio)

138

++#endif

139

++

140

+ #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)

141

+ SUBSYS(perf)

142

+ #endif

143

+--

144

+1.9.0

145

+

146

147

Added: genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch

148

===================================================================

149

--- genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch	                        (rev 0)

150

+++ genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch	2014-04-01 12:32:06 UTC (rev 2724)

151

@@ -0,0 +1,6065 @@

152

+From 5055277df59d9280da6b60cf90bed8e5e57dc44d Mon Sep 17 00:00:00 2001

153

+From: Paolo Valente <paolo.valente@×××××××.it>

154

+Date: Thu, 9 May 2013 19:10:02 +0200

155

+Subject: [PATCH 2/3] block: introduce the BFQ-v7r2 I/O sched for 3.14

156

+

157

+Add the BFQ-v7r2 I/O scheduler to 3.14.

158

+The general structure is borrowed from CFQ, as much of the code for

159

+handling I/O contexts. Over time, several useful features have been

160

+ported from CFQ as well (details in the changelog in README.BFQ). A

161

+(bfq_)queue is associated to each task doing I/O on a device, and each

162

+time a scheduling decision has to be made a queue is selected and served

163

+until it expires.

164

+

165

+    - Slices are given in the service domain: tasks are assigned

166

+      budgets, measured in number of sectors. Once got the disk, a task

167

+      must however consume its assigned budget within a configurable

168

+      maximum time (by default, the maximum possible value of the

169

+      budgets is automatically computed to comply with this timeout).

170

+      This allows the desired latency vs "throughput boosting" tradeoff

171

+      to be set.

172

+

173

+    - Budgets are scheduled according to a variant of WF2Q+, implemented

174

+      using an augmented rb-tree to take eligibility into account while

175

+      preserving an O(log N) overall complexity.

176

+

177

+    - A low-latency tunable is provided; if enabled, both interactive

178

+      and soft real-time applications are guaranteed a very low latency.

179

+

180

+    - Latency guarantees are preserved also in the presence of NCQ.

181

+

182

+    - Also with flash-based devices, a high throughput is achieved

183

+      while still preserving latency guarantees.

184

+

185

+    - BFQ features Early Queue Merge (EQM), a sort of fusion of the

186

+      cooperating-queue-merging and the preemption mechanisms present

187

+      in CFQ. EQM is in fact a unified mechanism that tries to get a

188

+      sequential read pattern, and hence a high throughput, with any

189

+      set of processes performing interleaved I/O over a contiguous

190

+      sequence of sectors.

191

+

192

+    - BFQ supports full hierarchical scheduling, exporting a cgroups

193

+      interface.  Since each node has a full scheduler, each group can

194

+      be assigned its own weight.

195

+

196

+    - If the cgroups interface is not used, only I/O priorities can be

197

+      assigned to processes, with ioprio values mapped to weights

198

+      with the relation weight = IOPRIO_BE_NR - ioprio.

199

+

200

+    - ioprio classes are served in strict priority order, i.e., lower

201

+      priority queues are not served as long as there are higher

202

+      priority queues.  Among queues in the same class the bandwidth is

203

+      distributed in proportion to the weight of each queue. A very

204

+      thin extra bandwidth is however guaranteed to the Idle class, to

205

+      prevent it from starving.

206

+

207

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

208

+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>

209

+---

210

+ block/bfq-cgroup.c  |  926 +++++++++++++++

211

+ block/bfq-ioc.c     |   36 +

212

+ block/bfq-iosched.c | 3300 +++++++++++++++++++++++++++++++++++++++++++++++++++

213

+ block/bfq-sched.c   | 1078 +++++++++++++++++

214

+ block/bfq.h         |  622 ++++++++++

215

+ 5 files changed, 5962 insertions(+)

216

+ create mode 100644 block/bfq-cgroup.c

217

+ create mode 100644 block/bfq-ioc.c

218

+ create mode 100644 block/bfq-iosched.c

219

+ create mode 100644 block/bfq-sched.c

220

+ create mode 100644 block/bfq.h

221

+

222

+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

223

+new file mode 100644

224

+index 0000000..bcecdb4

225

+--- /dev/null

226

++++ b/block/bfq-cgroup.c

227

+@@ -0,0 +1,926 @@

228

++/*

229

++ * BFQ: CGROUPS support.

230

++ *

231

++ * Based on ideas and code from CFQ:

232

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

233

++ *

234

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

235

++ *		      Paolo Valente <paolo.valente@×××××××.it>

236

++ *

237

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

238

++ *

239

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.

240

++ */

241

++

242

++#ifdef CONFIG_CGROUP_BFQIO

243

++

244

++static DEFINE_MUTEX(bfqio_mutex);

245

++

246

++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)

247

++{

248

++	return bgrp ? !bgrp->online : false;

249

++}

250

++

251

++static struct bfqio_cgroup bfqio_root_cgroup = {

252

++	.weight = BFQ_DEFAULT_GRP_WEIGHT,

253

++	.ioprio = BFQ_DEFAULT_GRP_IOPRIO,

254

++	.ioprio_class = BFQ_DEFAULT_GRP_CLASS,

255

++};

256

++

257

++static inline void bfq_init_entity(struct bfq_entity *entity,

258

++				   struct bfq_group *bfqg)

259

++{

260

++	entity->weight = entity->new_weight;

261

++	entity->orig_weight = entity->new_weight;

262

++	entity->ioprio = entity->new_ioprio;

263

++	entity->ioprio_class = entity->new_ioprio_class;

264

++	entity->parent = bfqg->my_entity;

265

++	entity->sched_data = &bfqg->sched_data;

266

++}

267

++

268

++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)

269

++{

270

++	return css ? container_of(css, struct bfqio_cgroup, css) : NULL;

271

++}

272

++

273

++/*

274

++ * Search the bfq_group for bfqd into the hash table (by now only a list)

275

++ * of bgrp.  Must be called under rcu_read_lock().

276

++ */

277

++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,

278

++					    struct bfq_data *bfqd)

279

++{

280

++	struct bfq_group *bfqg;

281

++	void *key;

282

++

283

++	hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {

284

++		key = rcu_dereference(bfqg->bfqd);

285

++		if (key == bfqd)

286

++			return bfqg;

287

++	}

288

++

289

++	return NULL;

290

++}

291

++

292

++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,

293

++					 struct bfq_group *bfqg)

294

++{

295

++	struct bfq_entity *entity = &bfqg->entity;

296

++

297

++	/*

298

++	 * If the weight of the entity has never been set via the sysfs

299

++	 * interface, then bgrp->weight == 0. In this case we initialize

300

++	 * the weight from the current ioprio value. Otherwise, the group

301

++	 * weight, if set, has priority over the ioprio value.

302

++	 */

303

++	if (bgrp->weight == 0) {

304

++		entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);

305

++		entity->new_ioprio = bgrp->ioprio;

306

++	} else {

307

++		entity->new_weight = bgrp->weight;

308

++		entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);

309

++	}

310

++	entity->orig_weight = entity->weight = entity->new_weight;

311

++	entity->ioprio = entity->new_ioprio;

312

++	entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;

313

++	entity->my_sched_data = &bfqg->sched_data;

314

++}

315

++

316

++static inline void bfq_group_set_parent(struct bfq_group *bfqg,

317

++					struct bfq_group *parent)

318

++{

319

++	struct bfq_entity *entity;

320

++

321

++	BUG_ON(parent == NULL);

322

++	BUG_ON(bfqg == NULL);

323

++

324

++	entity = &bfqg->entity;

325

++	entity->parent = parent->my_entity;

326

++	entity->sched_data = &parent->sched_data;

327

++}

328

++

329

++/**

330

++ * bfq_group_chain_alloc - allocate a chain of groups.

331

++ * @bfqd: queue descriptor.

332

++ * @css: the leaf cgroup_subsys_state this chain starts from.

333

++ *

334

++ * Allocate a chain of groups starting from the one belonging to

335

++ * @cgroup up to the root cgroup.  Stop if a cgroup on the chain

336

++ * to the root has already an allocated group on @bfqd.

337

++ */

338

++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,

339

++					       struct cgroup_subsys_state *css)

340

++{

341

++	struct bfqio_cgroup *bgrp;

342

++	struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;

343

++

344

++	for (; css != NULL; css = css->parent) {

345

++		bgrp = css_to_bfqio(css);

346

++

347

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

348

++		if (bfqg != NULL) {

349

++			/*

350

++			 * All the cgroups in the path from there to the

351

++			 * root must have a bfq_group for bfqd, so we don't

352

++			 * need any more allocations.

353

++			 */

354

++			break;

355

++		}

356

++

357

++		bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);

358

++		if (bfqg == NULL)

359

++			goto cleanup;

360

++

361

++		bfq_group_init_entity(bgrp, bfqg);

362

++		bfqg->my_entity = &bfqg->entity;

363

++

364

++		if (leaf == NULL) {

365

++			leaf = bfqg;

366

++			prev = leaf;

367

++		} else {

368

++			bfq_group_set_parent(prev, bfqg);

369

++			/*

370

++			 * Build a list of allocated nodes using the bfqd

371

++			 * filed, that is still unused and will be initialized

372

++			 * only after the node will be connected.

373

++			 */

374

++			prev->bfqd = bfqg;

375

++			prev = bfqg;

376

++		}

377

++	}

378

++

379

++	return leaf;

380

++

381

++cleanup:

382

++	while (leaf != NULL) {

383

++		prev = leaf;

384

++		leaf = leaf->bfqd;

385

++		kfree(prev);

386

++	}

387

++

388

++	return NULL;

389

++}

390

++

391

++/**

392

++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.

393

++ * @bfqd: the queue descriptor.

394

++ * @css: the leaf cgroup_subsys_state to start from.

395

++ * @leaf: the leaf group (to be associated to @cgroup).

396

++ *

397

++ * Try to link a chain of groups to a cgroup hierarchy, connecting the

398

++ * nodes bottom-up, so we can be sure that when we find a cgroup in the

399

++ * hierarchy that already as a group associated to @bfqd all the nodes

400

++ * in the path to the root cgroup have one too.

401

++ *

402

++ * On locking: the queue lock protects the hierarchy (there is a hierarchy

403

++ * per device) while the bfqio_cgroup lock protects the list of groups

404

++ * belonging to the same cgroup.

405

++ */

406

++static void bfq_group_chain_link(struct bfq_data *bfqd,

407

++				 struct cgroup_subsys_state *css,

408

++				 struct bfq_group *leaf)

409

++{

410

++	struct bfqio_cgroup *bgrp;

411

++	struct bfq_group *bfqg, *next, *prev = NULL;

412

++	unsigned long flags;

413

++

414

++	assert_spin_locked(bfqd->queue->queue_lock);

415

++

416

++	for (; css != NULL && leaf != NULL; css = css->parent) {

417

++		bgrp = css_to_bfqio(css);

418

++		next = leaf->bfqd;

419

++

420

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

421

++		BUG_ON(bfqg != NULL);

422

++

423

++		spin_lock_irqsave(&bgrp->lock, flags);

424

++

425

++		rcu_assign_pointer(leaf->bfqd, bfqd);

426

++		hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);

427

++		hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);

428

++

429

++		spin_unlock_irqrestore(&bgrp->lock, flags);

430

++

431

++		prev = leaf;

432

++		leaf = next;

433

++	}

434

++

435

++	BUG_ON(css == NULL && leaf != NULL);

436

++	if (css != NULL && prev != NULL) {

437

++		bgrp = css_to_bfqio(css);

438

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

439

++		bfq_group_set_parent(prev, bfqg);

440

++	}

441

++}

442

++

443

++/**

444

++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.

445

++ * @bfqd: queue descriptor.

446

++ * @cgroup: cgroup being searched for.

447

++ *

448

++ * Return a group associated to @bfqd in @cgroup, allocating one if

449

++ * necessary.  When a group is returned all the cgroups in the path

450

++ * to the root have a group associated to @bfqd.

451

++ *

452

++ * If the allocation fails, return the root group: this breaks guarantees

453

++ * but is a safe fallback.  If this loss becomes a problem it can be

454

++ * mitigated using the equivalent weight (given by the product of the

455

++ * weights of the groups in the path from @group to the root) in the

456

++ * root scheduler.

457

++ *

458

++ * We allocate all the missing nodes in the path from the leaf cgroup

459

++ * to the root and we connect the nodes only after all the allocations

460

++ * have been successful.

461

++ */

462

++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

463

++					      struct cgroup_subsys_state *css)

464

++{

465

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

466

++	struct bfq_group *bfqg;

467

++

468

++	bfqg = bfqio_lookup_group(bgrp, bfqd);

469

++	if (bfqg != NULL)

470

++		return bfqg;

471

++

472

++	bfqg = bfq_group_chain_alloc(bfqd, css);

473

++	if (bfqg != NULL)

474

++		bfq_group_chain_link(bfqd, css, bfqg);

475

++	else

476

++		bfqg = bfqd->root_group;

477

++

478

++	return bfqg;

479

++}

480

++

481

++/**

482

++ * bfq_bfqq_move - migrate @bfqq to @bfqg.

483

++ * @bfqd: queue descriptor.

484

++ * @bfqq: the queue to move.

485

++ * @entity: @bfqq's entity.

486

++ * @bfqg: the group to move to.

487

++ *

488

++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating

489

++ * it on the new one.  Avoid putting the entity on the old group idle tree.

490

++ *

491

++ * Must be called under the queue lock; the cgroup owning @bfqg must

492

++ * not disappear (by now this just means that we are called under

493

++ * rcu_read_lock()).

494

++ */

495

++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

496

++			  struct bfq_entity *entity, struct bfq_group *bfqg)

497

++{

498

++	int busy, resume;

499

++

500

++	busy = bfq_bfqq_busy(bfqq);

501

++	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);

502

++

503

++	BUG_ON(resume && !entity->on_st);

504

++	BUG_ON(busy && !resume && entity->on_st &&

505

++	       bfqq != bfqd->in_service_queue);

506

++

507

++	if (busy) {

508

++		BUG_ON(atomic_read(&bfqq->ref) < 2);

509

++

510

++		if (!resume)

511

++			bfq_del_bfqq_busy(bfqd, bfqq, 0);

512

++		else

513

++			bfq_deactivate_bfqq(bfqd, bfqq, 0);

514

++	} else if (entity->on_st)

515

++		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);

516

++

517

++	/*

518

++	 * Here we use a reference to bfqg.  We don't need a refcounter

519

++	 * as the cgroup reference will not be dropped, so that its

520

++	 * destroy() callback will not be invoked.

521

++	 */

522

++	entity->parent = bfqg->my_entity;

523

++	entity->sched_data = &bfqg->sched_data;

524

++

525

++	if (busy && resume)

526

++		bfq_activate_bfqq(bfqd, bfqq);

527

++

528

++	if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)

529

++		bfq_schedule_dispatch(bfqd);

530

++}

531

++

532

++/**

533

++ * __bfq_bic_change_cgroup - move @bic to @cgroup.

534

++ * @bfqd: the queue descriptor.

535

++ * @bic: the bic to move.

536

++ * @cgroup: the cgroup to move to.

537

++ *

538

++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller

539

++ * has to make sure that the reference to cgroup is valid across the call.

540

++ *

541

++ * NOTE: an alternative approach might have been to store the current

542

++ * cgroup in bfqq and getting a reference to it, reducing the lookup

543

++ * time here, at the price of slightly more complex code.

544

++ */

545

++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,

546

++						struct bfq_io_cq *bic,

547

++						struct cgroup_subsys_state *css)

548

++{

549

++	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);

550

++	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);

551

++	struct bfq_entity *entity;

552

++	struct bfq_group *bfqg;

553

++	struct bfqio_cgroup *bgrp;

554

++

555

++	bgrp = css_to_bfqio(css);

556

++

557

++	bfqg = bfq_find_alloc_group(bfqd, css);

558

++	if (async_bfqq != NULL) {

559

++		entity = &async_bfqq->entity;

560

++

561

++		if (entity->sched_data != &bfqg->sched_data) {

562

++			bic_set_bfqq(bic, NULL, 0);

563

++			bfq_log_bfqq(bfqd, async_bfqq,

564

++				     "bic_change_group: %p %d",

565

++				     async_bfqq, atomic_read(&async_bfqq->ref));

566

++			bfq_put_queue(async_bfqq);

567

++		}

568

++	}

569

++

570

++	if (sync_bfqq != NULL) {

571

++		entity = &sync_bfqq->entity;

572

++		if (entity->sched_data != &bfqg->sched_data)

573

++			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);

574

++	}

575

++

576

++	return bfqg;

577

++}

578

++

579

++/**

580

++ * bfq_bic_change_cgroup - move @bic to @cgroup.

581

++ * @bic: the bic being migrated.

582

++ * @cgroup: the destination cgroup.

583

++ *

584

++ * When the task owning @bic is moved to @cgroup, @bic is immediately

585

++ * moved into its new parent group.

586

++ */

587

++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,

588

++				  struct cgroup_subsys_state *css)

589

++{

590

++	struct bfq_data *bfqd;

591

++	unsigned long uninitialized_var(flags);

592

++

593

++	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

594

++				   &flags);

595

++	if (bfqd != NULL) {

596

++		__bfq_bic_change_cgroup(bfqd, bic, css);

597

++		bfq_put_bfqd_unlock(bfqd, &flags);

598

++	}

599

++}

600

++

601

++/**

602

++ * bfq_bic_update_cgroup - update the cgroup of @bic.

603

++ * @bic: the @bic to update.

604

++ *

605

++ * Make sure that @bic is enqueued in the cgroup of the current task.

606

++ * We need this in addition to moving bics during the cgroup attach

607

++ * phase because the task owning @bic could be at its first disk

608

++ * access or we may end up in the root cgroup as the result of a

609

++ * memory allocation failure and here we try to move to the right

610

++ * group.

611

++ *

612

++ * Must be called under the queue lock.  It is safe to use the returned

613

++ * value even after the rcu_read_unlock() as the migration/destruction

614

++ * paths act under the queue lock too.  IOW it is impossible to race with

615

++ * group migration/destruction and end up with an invalid group as:

616

++ *   a) here cgroup has not yet been destroyed, nor its destroy callback

617

++ *      has started execution, as current holds a reference to it,

618

++ *   b) if it is destroyed after rcu_read_unlock() [after current is

619

++ *      migrated to a different cgroup] its attach() callback will have

620

++ *      taken care of remove all the references to the old cgroup data.

621

++ */

622

++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)

623

++{

624

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

625

++	struct bfq_group *bfqg;

626

++	struct cgroup_subsys_state *css;

627

++

628

++	BUG_ON(bfqd == NULL);

629

++

630

++	rcu_read_lock();

631

++	css = task_css(current, bfqio_subsys_id);

632

++	bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);

633

++	rcu_read_unlock();

634

++

635

++	return bfqg;

636

++}

637

++

638

++/**

639

++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.

640

++ * @st: the service tree being flushed.

641

++ */

642

++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)

643

++{

644

++	struct bfq_entity *entity = st->first_idle;

645

++

646

++	for (; entity != NULL; entity = st->first_idle)

647

++		__bfq_deactivate_entity(entity, 0);

648

++}

649

++

650

++/**

651

++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.

652

++ * @bfqd: the device data structure with the root group.

653

++ * @entity: the entity to move.

654

++ */

655

++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,

656

++					    struct bfq_entity *entity)

657

++{

658

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

659

++

660

++	BUG_ON(bfqq == NULL);

661

++	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);

662

++	return;

663

++}

664

++

665

++/**

666

++ * bfq_reparent_active_entities - move to the root group all active entities.

667

++ * @bfqd: the device data structure with the root group.

668

++ * @bfqg: the group to move from.

669

++ * @st: the service tree with the entities.

670

++ *

671

++ * Needs queue_lock to be taken and reference to be valid over the call.

672

++ */

673

++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,

674

++						struct bfq_group *bfqg,

675

++						struct bfq_service_tree *st)

676

++{

677

++	struct rb_root *active = &st->active;

678

++	struct bfq_entity *entity = NULL;

679

++

680

++	if (!RB_EMPTY_ROOT(&st->active))

681

++		entity = bfq_entity_of(rb_first(active));

682

++

683

++	for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))

684

++		bfq_reparent_leaf_entity(bfqd, entity);

685

++

686

++	if (bfqg->sched_data.in_service_entity != NULL)

687

++		bfq_reparent_leaf_entity(bfqd,

688

++			bfqg->sched_data.in_service_entity);

689

++

690

++	return;

691

++}

692

++

693

++/**

694

++ * bfq_destroy_group - destroy @bfqg.

695

++ * @bgrp: the bfqio_cgroup containing @bfqg.

696

++ * @bfqg: the group being destroyed.

697

++ *

698

++ * Destroy @bfqg, making sure that it is not referenced from its parent.

699

++ */

700

++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)

701

++{

702

++	struct bfq_data *bfqd;

703

++	struct bfq_service_tree *st;

704

++	struct bfq_entity *entity = bfqg->my_entity;

705

++	unsigned long uninitialized_var(flags);

706

++	int i;

707

++

708

++	hlist_del(&bfqg->group_node);

709

++

710

++	/*

711

++	 * Empty all service_trees belonging to this group before deactivating

712

++	 * the group itself.

713

++	 */

714

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {

715

++		st = bfqg->sched_data.service_tree + i;

716

++

717

++		/*

718

++		 * The idle tree may still contain bfq_queues belonging

719

++		 * to exited task because they never migrated to a different

720

++		 * cgroup from the one being destroyed now.  No one else

721

++		 * can access them so it's safe to act without any lock.

722

++		 */

723

++		bfq_flush_idle_tree(st);

724

++

725

++		/*

726

++		 * It may happen that some queues are still active

727

++		 * (busy) upon group destruction (if the corresponding

728

++		 * processes have been forced to terminate). We move

729

++		 * all the leaf entities corresponding to these queues

730

++		 * to the root_group.

731

++		 * Also, it may happen that the group has an entity

732

++		 * under service, which is disconnected from the active

733

++		 * tree: it must be moved, too.

734

++		 * There is no need to put the sync queues, as the

735

++		 * scheduler has taken no reference.

736

++		 */

737

++		bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

738

++		if (bfqd != NULL) {

739

++			bfq_reparent_active_entities(bfqd, bfqg, st);

740

++			bfq_put_bfqd_unlock(bfqd, &flags);

741

++		}

742

++		BUG_ON(!RB_EMPTY_ROOT(&st->active));

743

++		BUG_ON(!RB_EMPTY_ROOT(&st->idle));

744

++	}

745

++	BUG_ON(bfqg->sched_data.next_in_service != NULL);

746

++	BUG_ON(bfqg->sched_data.in_service_entity != NULL);

747

++

748

++	/*

749

++	 * We may race with device destruction, take extra care when

750

++	 * dereferencing bfqg->bfqd.

751

++	 */

752

++	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

753

++	if (bfqd != NULL) {

754

++		hlist_del(&bfqg->bfqd_node);

755

++		__bfq_deactivate_entity(entity, 0);

756

++		bfq_put_async_queues(bfqd, bfqg);

757

++		bfq_put_bfqd_unlock(bfqd, &flags);

758

++	}

759

++	BUG_ON(entity->tree != NULL);

760

++

761

++	/*

762

++	 * No need to defer the kfree() to the end of the RCU grace

763

++	 * period: we are called from the destroy() callback of our

764

++	 * cgroup, so we can be sure that no one is a) still using

765

++	 * this cgroup or b) doing lookups in it.

766

++	 */

767

++	kfree(bfqg);

768

++}

769

++

770

++static void bfq_end_raising_async(struct bfq_data *bfqd)

771

++{

772

++	struct hlist_node *tmp;

773

++	struct bfq_group *bfqg;

774

++

775

++	hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)

776

++		bfq_end_raising_async_queues(bfqd, bfqg);

777

++	bfq_end_raising_async_queues(bfqd, bfqd->root_group);

778

++}

779

++

780

++/**

781

++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.

782

++ * @bfqd: the device descriptor being exited.

783

++ *

784

++ * When the device exits we just make sure that no lookup can return

785

++ * the now unused group structures.  They will be deallocated on cgroup

786

++ * destruction.

787

++ */

788

++static void bfq_disconnect_groups(struct bfq_data *bfqd)

789

++{

790

++	struct hlist_node *tmp;

791

++	struct bfq_group *bfqg;

792

++

793

++	bfq_log(bfqd, "disconnect_groups beginning");

794

++	hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {

795

++		hlist_del(&bfqg->bfqd_node);

796

++

797

++		__bfq_deactivate_entity(bfqg->my_entity, 0);

798

++

799

++		/*

800

++		 * Don't remove from the group hash, just set an

801

++		 * invalid key.  No lookups can race with the

802

++		 * assignment as bfqd is being destroyed; this

803

++		 * implies also that new elements cannot be added

804

++		 * to the list.

805

++		 */

806

++		rcu_assign_pointer(bfqg->bfqd, NULL);

807

++

808

++		bfq_log(bfqd, "disconnect_groups: put async for group %p",

809

++			bfqg);

810

++		bfq_put_async_queues(bfqd, bfqg);

811

++	}

812

++}

813

++

814

++static inline void bfq_free_root_group(struct bfq_data *bfqd)

815

++{

816

++	struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;

817

++	struct bfq_group *bfqg = bfqd->root_group;

818

++

819

++	bfq_put_async_queues(bfqd, bfqg);

820

++

821

++	spin_lock_irq(&bgrp->lock);

822

++	hlist_del_rcu(&bfqg->group_node);

823

++	spin_unlock_irq(&bgrp->lock);

824

++

825

++	/*

826

++	 * No need to synchronize_rcu() here: since the device is gone

827

++	 * there cannot be any read-side access to its root_group.

828

++	 */

829

++	kfree(bfqg);

830

++}

831

++

832

++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)

833

++{

834

++	struct bfq_group *bfqg;

835

++	struct bfqio_cgroup *bgrp;

836

++	int i;

837

++

838

++	bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);

839

++	if (bfqg == NULL)

840

++		return NULL;

841

++

842

++	bfqg->entity.parent = NULL;

843

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

844

++		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

845

++

846

++	bgrp = &bfqio_root_cgroup;

847

++	spin_lock_irq(&bgrp->lock);

848

++	rcu_assign_pointer(bfqg->bfqd, bfqd);

849

++	hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);

850

++	spin_unlock_irq(&bgrp->lock);

851

++

852

++	return bfqg;

853

++}

854

++

855

++#define SHOW_FUNCTION(__VAR)						\

856

++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \

857

++				       struct cftype *cftype)		\

858

++{									\

859

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\

860

++	u64 ret = -ENODEV;						\

861

++									\

862

++	mutex_lock(&bfqio_mutex);					\

863

++	if (bfqio_is_removed(bgrp))					\

864

++		goto out_unlock;					\

865

++									\

866

++	spin_lock_irq(&bgrp->lock);					\

867

++	ret = bgrp->__VAR;						\

868

++	spin_unlock_irq(&bgrp->lock);					\

869

++									\

870

++out_unlock:								\

871

++	mutex_unlock(&bfqio_mutex);					\

872

++	return ret;							\

873

++}

874

++

875

++SHOW_FUNCTION(weight);

876

++SHOW_FUNCTION(ioprio);

877

++SHOW_FUNCTION(ioprio_class);

878

++#undef SHOW_FUNCTION

879

++

880

++#define STORE_FUNCTION(__VAR, __MIN, __MAX)				\

881

++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\

882

++					struct cftype *cftype,		\

883

++					u64 val)			\

884

++{									\

885

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\

886

++	struct bfq_group *bfqg;						\

887

++	int ret = -EINVAL;						\

888

++									\

889

++	if (val < (__MIN) || val > (__MAX))				\

890

++		return ret;						\

891

++									\

892

++	ret = -ENODEV;							\

893

++	mutex_lock(&bfqio_mutex);					\

894

++	if (bfqio_is_removed(bgrp))					\

895

++		goto out_unlock;					\

896

++	ret = 0;							\

897

++									\

898

++	spin_lock_irq(&bgrp->lock);					\

899

++	bgrp->__VAR = (unsigned short)val;				\

900

++	hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) {	\

901

++		/*							\

902

++		 * Setting the ioprio_changed flag of the entity        \

903

++		 * to 1 with new_##__VAR == ##__VAR would re-set        \

904

++		 * the value of the weight to its ioprio mapping.       \

905

++		 * Set the flag only if necessary.			\

906

++		 */							\

907

++		if ((unsigned short)val != bfqg->entity.new_##__VAR) {  \

908

++			bfqg->entity.new_##__VAR = (unsigned short)val; \

909

++			/*						\

910

++			 * Make sure that the above new value has been	\

911

++			 * stored in bfqg->entity.new_##__VAR before	\

912

++			 * setting the ioprio_changed flag. In fact,	\

913

++			 * this flag may be read asynchronously (in	\

914

++			 * critical sections protected by a different	\

915

++			 * lock than that held here), and finding this	\

916

++			 * flag set may cause the execution of the code	\

917

++			 * for updating parameters whose value may	\

918

++			 * depend also on bfqg->entity.new_##__VAR (in	\

919

++			 * __bfq_entity_update_weight_prio).		\

920

++			 * This barrier makes sure that the new value	\

921

++			 * of bfqg->entity.new_##__VAR is correctly	\

922

++			 * seen in that code.				\

923

++			 */						\

924

++			smp_wmb();                                      \

925

++			bfqg->entity.ioprio_changed = 1;                \

926

++		}							\

927

++	}								\

928

++	spin_unlock_irq(&bgrp->lock);					\

929

++									\

930

++out_unlock:								\

931

++	mutex_unlock(&bfqio_mutex);					\

932

++	return ret;							\

933

++}

934

++

935

++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);

936

++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);

937

++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);

938

++#undef STORE_FUNCTION

939

++

940

++static struct cftype bfqio_files[] = {

941

++	{

942

++		.name = "weight",

943

++		.read_u64 = bfqio_cgroup_weight_read,

944

++		.write_u64 = bfqio_cgroup_weight_write,

945

++	},

946

++	{

947

++		.name = "ioprio",

948

++		.read_u64 = bfqio_cgroup_ioprio_read,

949

++		.write_u64 = bfqio_cgroup_ioprio_write,

950

++	},

951

++	{

952

++		.name = "ioprio_class",

953

++		.read_u64 = bfqio_cgroup_ioprio_class_read,

954

++		.write_u64 = bfqio_cgroup_ioprio_class_write,

955

++	},

956

++	{ },	/* terminate */

957

++};

958

++

959

++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state

960

++						*parent_css)

961

++{

962

++	struct bfqio_cgroup *bgrp;

963

++

964

++	if (parent_css != NULL) {

965

++		bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);

966

++		if (bgrp == NULL)

967

++			return ERR_PTR(-ENOMEM);

968

++	} else

969

++		bgrp = &bfqio_root_cgroup;

970

++

971

++	spin_lock_init(&bgrp->lock);

972

++	INIT_HLIST_HEAD(&bgrp->group_data);

973

++	bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;

974

++	bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;

975

++

976

++	return &bgrp->css;

977

++}

978

++

979

++/*

980

++ * We cannot support shared io contexts, as we have no means to support

981

++ * two tasks with the same ioc in two different groups without major rework

982

++ * of the main bic/bfqq data structures.  By now we allow a task to change

983

++ * its cgroup only if it's the only owner of its ioc; the drawback of this

984

++ * behavior is that a group containing a task that forked using CLONE_IO

985

++ * will not be destroyed until the tasks sharing the ioc die.

986

++ */

987

++static int bfqio_can_attach(struct cgroup_subsys_state *css,

988

++			    struct cgroup_taskset *tset)

989

++{

990

++	struct task_struct *task;

991

++	struct io_context *ioc;

992

++	int ret = 0;

993

++

994

++	cgroup_taskset_for_each(task, css, tset) {

995

++		/*

996

++		 * task_lock() is needed to avoid races with

997

++		 * exit_io_context()

998

++		 */

999

++		task_lock(task);

1000

++		ioc = task->io_context;

1001

++		if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)

1002

++			/*

1003

++			 * ioc == NULL means that the task is either too young

1004

++			 * or exiting: if it has still no ioc the ioc can't be

1005

++			 * shared, if the task is exiting the attach will fail

1006

++			 * anyway, no matter what we return here.

1007

++			 */

1008

++			ret = -EINVAL;

1009

++		task_unlock(task);

1010

++		if (ret)

1011

++			break;

1012

++	}

1013

++

1014

++	return ret;

1015

++}

1016

++

1017

++static void bfqio_attach(struct cgroup_subsys_state *css,

1018

++			 struct cgroup_taskset *tset)

1019

++{

1020

++	struct task_struct *task;

1021

++	struct io_context *ioc;

1022

++	struct io_cq *icq;

1023

++

1024

++	/*

1025

++	 * IMPORTANT NOTE: The move of more than one process at a time to a

1026

++	 * new group has not yet been tested.

1027

++	 */

1028

++	cgroup_taskset_for_each(task, css, tset) {

1029

++		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);

1030

++		if (ioc) {

1031

++			/*

1032

++			 * Handle cgroup change here.

1033

++			 */

1034

++			rcu_read_lock();

1035

++			hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)

1036

++				if (!strncmp(

1037

++					icq->q->elevator->type->elevator_name,

1038

++					"bfq", ELV_NAME_MAX))

1039

++					bfq_bic_change_cgroup(icq_to_bic(icq),

1040

++							      css);

1041

++			rcu_read_unlock();

1042

++			put_io_context(ioc);

1043

++		}

1044

++	}

1045

++}

1046

++

1047

++static void bfqio_destroy(struct cgroup_subsys_state *css)

1048

++{

1049

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1050

++	struct hlist_node *tmp;

1051

++	struct bfq_group *bfqg;

1052

++

1053

++	/*

1054

++	 * Since we are destroying the cgroup, there are no more tasks

1055

++	 * referencing it, and all the RCU grace periods that may have

1056

++	 * referenced it are ended (as the destruction of the parent

1057

++	 * cgroup is RCU-safe); bgrp->group_data will not be accessed by

1058

++	 * anything else and we don't need any synchronization.

1059

++	 */

1060

++	hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)

1061

++		bfq_destroy_group(bgrp, bfqg);

1062

++

1063

++	BUG_ON(!hlist_empty(&bgrp->group_data));

1064

++

1065

++	kfree(bgrp);

1066

++}

1067

++

1068

++static int bfqio_css_online(struct cgroup_subsys_state *css)

1069

++{

1070

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1071

++

1072

++	mutex_lock(&bfqio_mutex);

1073

++	bgrp->online = true;

1074

++	mutex_unlock(&bfqio_mutex);

1075

++

1076

++	return 0;

1077

++}

1078

++

1079

++static void bfqio_css_offline(struct cgroup_subsys_state *css)

1080

++{

1081

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1082

++

1083

++	mutex_lock(&bfqio_mutex);

1084

++	bgrp->online = false;

1085

++	mutex_unlock(&bfqio_mutex);

1086

++}

1087

++

1088

++struct cgroup_subsys bfqio_subsys = {

1089

++	.name = "bfqio",

1090

++	.css_alloc = bfqio_create,

1091

++	.css_online = bfqio_css_online,

1092

++	.css_offline = bfqio_css_offline,

1093

++	.can_attach = bfqio_can_attach,

1094

++	.attach = bfqio_attach,

1095

++	.css_free = bfqio_destroy,

1096

++	.subsys_id = bfqio_subsys_id,

1097

++	.base_cftypes = bfqio_files,

1098

++};

1099

++#else

1100

++static inline void bfq_init_entity(struct bfq_entity *entity,

1101

++				   struct bfq_group *bfqg)

1102

++{

1103

++	entity->weight = entity->new_weight;

1104

++	entity->orig_weight = entity->new_weight;

1105

++	entity->ioprio = entity->new_ioprio;

1106

++	entity->ioprio_class = entity->new_ioprio_class;

1107

++	entity->sched_data = &bfqg->sched_data;

1108

++}

1109

++

1110

++static inline struct bfq_group *

1111

++bfq_bic_update_cgroup(struct bfq_io_cq *bic)

1112

++{

1113

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

1114

++	return bfqd->root_group;

1115

++}

1116

++

1117

++static inline void bfq_bfqq_move(struct bfq_data *bfqd,

1118

++				 struct bfq_queue *bfqq,

1119

++				 struct bfq_entity *entity,

1120

++				 struct bfq_group *bfqg)

1121

++{

1122

++}

1123

++

1124

++static void bfq_end_raising_async(struct bfq_data *bfqd)

1125

++{

1126

++	bfq_end_raising_async_queues(bfqd, bfqd->root_group);

1127

++}

1128

++

1129

++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)

1130

++{

1131

++	bfq_put_async_queues(bfqd, bfqd->root_group);

1132

++}

1133

++

1134

++static inline void bfq_free_root_group(struct bfq_data *bfqd)

1135

++{

1136

++	kfree(bfqd->root_group);

1137

++}

1138

++

1139

++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)

1140

++{

1141

++	struct bfq_group *bfqg;

1142

++	int i;

1143

++

1144

++	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);

1145

++	if (bfqg == NULL)

1146

++		return NULL;

1147

++

1148

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

1149

++		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

1150

++

1151

++	return bfqg;

1152

++}

1153

++#endif

1154

+diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c

1155

+new file mode 100644

1156

+index 0000000..7f6b000

1157

+--- /dev/null

1158

++++ b/block/bfq-ioc.c

1159

+@@ -0,0 +1,36 @@

1160

++/*

1161

++ * BFQ: I/O context handling.

1162

++ *

1163

++ * Based on ideas and code from CFQ:

1164

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1165

++ *

1166

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1167

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1168

++ *

1169

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1170

++ */

1171

++

1172

++/**

1173

++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.

1174

++ * @icq: the iocontext queue.

1175

++ */

1176

++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)

1177

++{

1178

++	/* bic->icq is the first member, %NULL will convert to %NULL */

1179

++	return container_of(icq, struct bfq_io_cq, icq);

1180

++}

1181

++

1182

++/**

1183

++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.

1184

++ * @bfqd: the lookup key.

1185

++ * @ioc: the io_context of the process doing I/O.

1186

++ *

1187

++ * Queue lock must be held.

1188

++ */

1189

++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,

1190

++					       struct io_context *ioc)

1191

++{

1192

++	if (ioc)

1193

++		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));

1194

++	return NULL;

1195

++}

1196

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

1197

+new file mode 100644

1198

+index 0000000..f5f71e4

1199

+--- /dev/null

1200

++++ b/block/bfq-iosched.c

1201

+@@ -0,0 +1,3300 @@

1202

++/*

1203

++ * Budget Fair Queueing (BFQ) disk scheduler.

1204

++ *

1205

++ * Based on ideas and code from CFQ:

1206

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1207

++ *

1208

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1209

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1210

++ *

1211

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1212

++ *

1213

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.

1214

++ *

1215

++ * BFQ is a proportional share disk scheduling algorithm based on the

1216

++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in

1217

++ * number of sectors, to tasks instead of time slices. The disk is not granted

1218

++ * to the in-service task for a given time slice, but until it has exhausted

1219

++ * its assigned budget. This change from the time to the service domain allows

1220

++ * BFQ to distribute the disk bandwidth among tasks as desired, without any

1221

++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an

1222

++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to

1223

++ * their budgets (more precisely BFQ schedules queues associated to tasks).

1224

++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to

1225

++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low

1226

++ * latencies to interactive and soft real-time applications.

1227

++ *

1228

++ * BFQ is described in [1], where also a reference to the initial, more

1229

++ * theoretical paper on BFQ can be found. The interested reader can find in

1230

++ * the latter paper full details on the main algorithm as well as formulas of

1231

++ * the guarantees, plus formal proofs of all the properties. With respect to

1232

++ * the version of BFQ presented in these papers, this implementation adds a

1233

++ * few more heuristics, such as the one that guarantees a low latency to soft

1234

++ * real-time applications, and a hierarchical extension based on H-WF2Q+.

1235

++ *

1236

++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with

1237

++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)

1238

++ * complexity derives from the one introduced with EEVDF in [3].

1239

++ *

1240

++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness

1241

++ *     with the BFQ Disk I/O Scheduler'',

1242

++ *     Proceedings of the 5th Annual International Systems and Storage

1243

++ *     Conference (SYSTOR '12), June 2012.

1244

++ *

1245

++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf

1246

++ *

1247

++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing

1248

++ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,

1249

++ *     Oct 1997.

1250

++ *

1251

++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz

1252

++ *

1253

++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline

1254

++ *     First: A Flexible and Accurate Mechanism for Proportional Share

1255

++ *     Resource Allocation,'' technical report.

1256

++ *

1257

++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf

1258

++ */

1259

++#include <linux/module.h>

1260

++#include <linux/slab.h>

1261

++#include <linux/blkdev.h>

1262

++#include <linux/cgroup.h>

1263

++#include <linux/elevator.h>

1264

++#include <linux/jiffies.h>

1265

++#include <linux/rbtree.h>

1266

++#include <linux/ioprio.h>

1267

++#include "bfq.h"

1268

++#include "blk.h"

1269

++

1270

++/* Max number of dispatches in one round of service. */

1271

++static const int bfq_quantum = 4;

1272

++

1273

++/* Expiration time of sync (0) and async (1) requests, in jiffies. */

1274

++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };

1275

++

1276

++/* Maximum backwards seek, in KiB. */

1277

++static const int bfq_back_max = 16 * 1024;

1278

++

1279

++/* Penalty of a backwards seek, in number of sectors. */

1280

++static const int bfq_back_penalty = 2;

1281

++

1282

++/* Idling period duration, in jiffies. */

1283

++static int bfq_slice_idle = HZ / 125;

1284

++

1285

++/* Default maximum budget values, in sectors and number of requests. */

1286

++static const int bfq_default_max_budget = 16 * 1024;

1287

++static const int bfq_max_budget_async_rq = 4;

1288

++

1289

++/*

1290

++ * Async to sync throughput distribution is controlled as follows:

1291

++ * when an async request is served, the entity is charged the number

1292

++ * of sectors of the request, multiplied by the factor below

1293

++ */

1294

++static const int bfq_async_charge_factor = 10;

1295

++

1296

++/* Default timeout values, in jiffies, approximating CFQ defaults. */

1297

++static const int bfq_timeout_sync = HZ / 8;

1298

++static int bfq_timeout_async = HZ / 25;

1299

++

1300

++struct kmem_cache *bfq_pool;

1301

++

1302

++/* Below this threshold (in ms), we consider thinktime immediate. */

1303

++#define BFQ_MIN_TT		2

1304

++

1305

++/* hw_tag detection: parallel requests threshold and min samples needed. */

1306

++#define BFQ_HW_QUEUE_THRESHOLD	4

1307

++#define BFQ_HW_QUEUE_SAMPLES	32

1308

++

1309

++#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)

1310

++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)

1311

++

1312

++/* Min samples used for peak rate estimation (for autotuning). */

1313

++#define BFQ_PEAK_RATE_SAMPLES	32

1314

++

1315

++/* Shift used for peak rate fixed precision calculations. */

1316

++#define BFQ_RATE_SHIFT		16

1317

++

1318

++/*

1319

++ * The duration of the weight raising for interactive applications is

1320

++ * computed automatically (as default behaviour), using the following

1321

++ * formula: duration = (R / r) * T, where r is the peak rate of the

1322

++ * disk, and R and T are two reference parameters. In particular, R is

1323

++ * the peak rate of a reference disk, and T is about the maximum time

1324

++ * for starting popular large applications on that disk, under BFQ and

1325

++ * while reading two files in parallel. Finally, BFQ uses two

1326

++ * different pairs (R, T) depending on whether the disk is rotational

1327

++ * or non-rotational.

1328

++ */

1329

++#define T_rot			(msecs_to_jiffies(5500))

1330

++#define T_nonrot		(msecs_to_jiffies(2000))

1331

++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */

1332

++#define R_rot			17415

1333

++#define R_nonrot		34791

1334

++

1335

++#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\

1336

++				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

1337

++

1338

++#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])

1339

++#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

1340

++

1341

++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);

1342

++

1343

++#include "bfq-ioc.c"

1344

++#include "bfq-sched.c"

1345

++#include "bfq-cgroup.c"

1346

++

1347

++#define bfq_class_idle(bfqq)	((bfqq)->entity.ioprio_class ==\

1348

++				 IOPRIO_CLASS_IDLE)

1349

++#define bfq_class_rt(bfqq)	((bfqq)->entity.ioprio_class ==\

1350

++				 IOPRIO_CLASS_RT)

1351

++

1352

++#define bfq_sample_valid(samples)	((samples) > 80)

1353

++

1354

++/*

1355

++ * We regard a request as SYNC, if either it's a read or has the SYNC bit

1356

++ * set (in which case it could also be a direct WRITE).

1357

++ */

1358

++static inline int bfq_bio_sync(struct bio *bio)

1359

++{

1360

++	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))

1361

++		return 1;

1362

++

1363

++	return 0;

1364

++}

1365

++

1366

++/*

1367

++ * Scheduler run of queue, if there are requests pending and no one in the

1368

++ * driver that will restart queueing.

1369

++ */

1370

++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)

1371

++{

1372

++	if (bfqd->queued != 0) {

1373

++		bfq_log(bfqd, "schedule dispatch");

1374

++		kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);

1375

++	}

1376

++}

1377

++

1378

++/*

1379

++ * Lifted from AS - choose which of rq1 and rq2 is best served now.

1380

++ * We choose the request that is closest to the head right now.  Distance

1381

++ * behind the head is penalized and only allowed to a certain extent.

1382

++ */

1383

++static struct request *bfq_choose_req(struct bfq_data *bfqd,

1384

++				      struct request *rq1,

1385

++				      struct request *rq2,

1386

++				      sector_t last)

1387

++{

1388

++	sector_t s1, s2, d1 = 0, d2 = 0;

1389

++	unsigned long back_max;

1390

++#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */

1391

++#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */

1392

++	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

1393

++

1394

++	if (rq1 == NULL || rq1 == rq2)

1395

++		return rq2;

1396

++	if (rq2 == NULL)

1397

++		return rq1;

1398

++

1399

++	if (rq_is_sync(rq1) && !rq_is_sync(rq2))

1400

++		return rq1;

1401

++	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))

1402

++		return rq2;

1403

++	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))

1404

++		return rq1;

1405

++	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))

1406

++		return rq2;

1407

++

1408

++	s1 = blk_rq_pos(rq1);

1409

++	s2 = blk_rq_pos(rq2);

1410

++

1411

++	/*

1412

++	 * By definition, 1KiB is 2 sectors.

1413

++	 */

1414

++	back_max = bfqd->bfq_back_max * 2;

1415

++

1416

++	/*

1417

++	 * Strict one way elevator _except_ in the case where we allow

1418

++	 * short backward seeks which are biased as twice the cost of a

1419

++	 * similar forward seek.

1420

++	 */

1421

++	if (s1 >= last)

1422

++		d1 = s1 - last;

1423

++	else if (s1 + back_max >= last)

1424

++		d1 = (last - s1) * bfqd->bfq_back_penalty;

1425

++	else

1426

++		wrap |= BFQ_RQ1_WRAP;

1427

++

1428

++	if (s2 >= last)

1429

++		d2 = s2 - last;

1430

++	else if (s2 + back_max >= last)

1431

++		d2 = (last - s2) * bfqd->bfq_back_penalty;

1432

++	else

1433

++		wrap |= BFQ_RQ2_WRAP;

1434

++

1435

++	/* Found required data */

1436

++

1437

++	/*

1438

++	 * By doing switch() on the bit mask "wrap" we avoid having to

1439

++	 * check two variables for all permutations: --> faster!

1440

++	 */

1441

++	switch (wrap) {

1442

++	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */

1443

++		if (d1 < d2)

1444

++			return rq1;

1445

++		else if (d2 < d1)

1446

++			return rq2;

1447

++		else {

1448

++			if (s1 >= s2)

1449

++				return rq1;

1450

++			else

1451

++				return rq2;

1452

++		}

1453

++

1454

++	case BFQ_RQ2_WRAP:

1455

++		return rq1;

1456

++	case BFQ_RQ1_WRAP:

1457

++		return rq2;

1458

++	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */

1459

++	default:

1460

++		/*

1461

++		 * Since both rqs are wrapped,

1462

++		 * start with the one that's further behind head

1463

++		 * (--> only *one* back seek required),

1464

++		 * since back seek takes more time than forward.

1465

++		 */

1466

++		if (s1 <= s2)

1467

++			return rq1;

1468

++		else

1469

++			return rq2;

1470

++	}

1471

++}

1472

++

1473

++static struct bfq_queue *

1474

++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,

1475

++		     sector_t sector, struct rb_node **ret_parent,

1476

++		     struct rb_node ***rb_link)

1477

++{

1478

++	struct rb_node **p, *parent;

1479

++	struct bfq_queue *bfqq = NULL;

1480

++

1481

++	parent = NULL;

1482

++	p = &root->rb_node;

1483

++	while (*p) {

1484

++		struct rb_node **n;

1485

++

1486

++		parent = *p;

1487

++		bfqq = rb_entry(parent, struct bfq_queue, pos_node);

1488

++

1489

++		/*

1490

++		 * Sort strictly based on sector. Smallest to the left,

1491

++		 * largest to the right.

1492

++		 */

1493

++		if (sector > blk_rq_pos(bfqq->next_rq))

1494

++			n = &(*p)->rb_right;

1495

++		else if (sector < blk_rq_pos(bfqq->next_rq))

1496

++			n = &(*p)->rb_left;

1497

++		else

1498

++			break;

1499

++		p = n;

1500

++		bfqq = NULL;

1501

++	}

1502

++

1503

++	*ret_parent = parent;

1504

++	if (rb_link)

1505

++		*rb_link = p;

1506

++

1507

++	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",

1508

++		(long long unsigned)sector,

1509

++		bfqq != NULL ? bfqq->pid : 0);

1510

++

1511

++	return bfqq;

1512

++}

1513

++

1514

++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)

1515

++{

1516

++	struct rb_node **p, *parent;

1517

++	struct bfq_queue *__bfqq;

1518

++

1519

++	if (bfqq->pos_root != NULL) {

1520

++		rb_erase(&bfqq->pos_node, bfqq->pos_root);

1521

++		bfqq->pos_root = NULL;

1522

++	}

1523

++

1524

++	if (bfq_class_idle(bfqq))

1525

++		return;

1526

++	if (!bfqq->next_rq)

1527

++		return;

1528

++

1529

++	bfqq->pos_root = &bfqd->rq_pos_tree;

1530

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,

1531

++			blk_rq_pos(bfqq->next_rq), &parent, &p);

1532

++	if (__bfqq == NULL) {

1533

++		rb_link_node(&bfqq->pos_node, parent, p);

1534

++		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);

1535

++	} else

1536

++		bfqq->pos_root = NULL;

1537

++}

1538

++

1539

++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,

1540

++					struct bfq_queue *bfqq,

1541

++					struct request *last)

1542

++{

1543

++	struct rb_node *rbnext = rb_next(&last->rb_node);

1544

++	struct rb_node *rbprev = rb_prev(&last->rb_node);

1545

++	struct request *next = NULL, *prev = NULL;

1546

++

1547

++	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

1548

++

1549

++	if (rbprev != NULL)

1550

++		prev = rb_entry_rq(rbprev);

1551

++

1552

++	if (rbnext != NULL)

1553

++		next = rb_entry_rq(rbnext);

1554

++	else {

1555

++		rbnext = rb_first(&bfqq->sort_list);

1556

++		if (rbnext && rbnext != &last->rb_node)

1557

++			next = rb_entry_rq(rbnext);

1558

++	}

1559

++

1560

++	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));

1561

++}

1562

++

1563

++static void bfq_del_rq_rb(struct request *rq)

1564

++{

1565

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

1566

++	struct bfq_data *bfqd = bfqq->bfqd;

1567

++	const int sync = rq_is_sync(rq);

1568

++

1569

++	BUG_ON(bfqq->queued[sync] == 0);

1570

++	bfqq->queued[sync]--;

1571

++	bfqd->queued--;

1572

++

1573

++	elv_rb_del(&bfqq->sort_list, rq);

1574

++

1575

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

1576

++		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)

1577

++			bfq_del_bfqq_busy(bfqd, bfqq, 1);

1578

++		/*

1579

++		 * Remove queue from request-position tree as it is empty.

1580

++		 */

1581

++		if (bfqq->pos_root != NULL) {

1582

++			rb_erase(&bfqq->pos_node, bfqq->pos_root);

1583

++			bfqq->pos_root = NULL;

1584

++		}

1585

++	}

1586

++}

1587

++

1588

++/* see the definition of bfq_async_charge_factor for details */

1589

++static inline unsigned long bfq_serv_to_charge(struct request *rq,

1590

++					       struct bfq_queue *bfqq)

1591

++{

1592

++	return blk_rq_sectors(rq) *

1593

++		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *

1594

++		bfq_async_charge_factor));

1595

++}

1596

++

1597

++/**

1598

++ * bfq_updated_next_req - update the queue after a new next_rq selection.

1599

++ * @bfqd: the device data the queue belongs to.

1600

++ * @bfqq: the queue to update.

1601

++ *

1602

++ * If the first request of a queue changes we make sure that the queue

1603

++ * has enough budget to serve at least its first request (if the

1604

++ * request has grown).  We do this because if the queue has not enough

1605

++ * budget for its first request, it has to go through two dispatch

1606

++ * rounds to actually get it dispatched.

1607

++ */

1608

++static void bfq_updated_next_req(struct bfq_data *bfqd,

1609

++				 struct bfq_queue *bfqq)

1610

++{

1611

++	struct bfq_entity *entity = &bfqq->entity;

1612

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

1613

++	struct request *next_rq = bfqq->next_rq;

1614

++	unsigned long new_budget;

1615

++

1616

++	if (next_rq == NULL)

1617

++		return;

1618

++

1619

++	if (bfqq == bfqd->in_service_queue)

1620

++		/*

1621

++		 * In order not to break guarantees, budgets cannot be

1622

++		 * changed after an entity has been selected.

1623

++		 */

1624

++		return;

1625

++

1626

++	BUG_ON(entity->tree != &st->active);

1627

++	BUG_ON(entity == entity->sched_data->in_service_entity);

1628

++

1629

++	new_budget = max_t(unsigned long, bfqq->max_budget,

1630

++			   bfq_serv_to_charge(next_rq, bfqq));

1631

++	entity->budget = new_budget;

1632

++	bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);

1633

++	bfq_activate_bfqq(bfqd, bfqq);

1634

++}

1635

++

1636

++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)

1637

++{

1638

++	u64 dur;

1639

++

1640

++	if (bfqd->bfq_raising_max_time > 0)

1641

++		return bfqd->bfq_raising_max_time;

1642

++

1643

++	dur = bfqd->RT_prod;

1644

++	do_div(dur, bfqd->peak_rate);

1645

++

1646

++	return dur;

1647

++}

1648

++

1649

++static void bfq_add_rq_rb(struct request *rq)

1650

++{

1651

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

1652

++	struct bfq_entity *entity = &bfqq->entity;

1653

++	struct bfq_data *bfqd = bfqq->bfqd;

1654

++	struct request *next_rq, *prev;

1655

++	unsigned long old_raising_coeff = bfqq->raising_coeff;

1656

++	int idle_for_long_time = 0;

1657

++

1658

++	bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));

1659

++	bfqq->queued[rq_is_sync(rq)]++;

1660

++	bfqd->queued++;

1661

++

1662

++	elv_rb_add(&bfqq->sort_list, rq);

1663

++

1664

++	/*

1665

++	 * Check if this request is a better next-serve candidate.

1666

++	 */

1667

++	prev = bfqq->next_rq;

1668

++	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);

1669

++	BUG_ON(next_rq == NULL);

1670

++	bfqq->next_rq = next_rq;

1671

++

1672

++	/*

1673

++	 * Adjust priority tree position, if next_rq changes.

1674

++	 */

1675

++	if (prev != bfqq->next_rq)

1676

++		bfq_rq_pos_tree_add(bfqd, bfqq);

1677

++

1678

++	if (!bfq_bfqq_busy(bfqq)) {

1679

++		int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&

1680

++			time_is_before_jiffies(bfqq->soft_rt_next_start);

1681

++		idle_for_long_time = time_is_before_jiffies(

1682

++			bfqq->budget_timeout +

1683

++			bfqd->bfq_raising_min_idle_time);

1684

++		entity->budget = max_t(unsigned long, bfqq->max_budget,

1685

++				       bfq_serv_to_charge(next_rq, bfqq));

1686

++

1687

++		if (!bfqd->low_latency)

1688

++			goto add_bfqq_busy;

1689

++

1690

++		/*

1691

++		 * If the queue is not being boosted and has been idle

1692

++		 * for enough time, start a weight-raising period

1693

++		 */

1694

++		if (old_raising_coeff == 1 &&

1695

++		    (idle_for_long_time || soft_rt)) {

1696

++			bfqq->raising_coeff = bfqd->bfq_raising_coeff;

1697

++			if (idle_for_long_time)

1698

++				bfqq->raising_cur_max_time =

1699

++					bfq_wrais_duration(bfqd);

1700

++			else

1701

++				bfqq->raising_cur_max_time =

1702

++					bfqd->bfq_raising_rt_max_time;

1703

++			bfq_log_bfqq(bfqd, bfqq,

1704

++				     "wrais starting at %lu, "

1705

++				     "rais_max_time %u",

1706

++				     jiffies,

1707

++				     jiffies_to_msecs(bfqq->

1708

++					raising_cur_max_time));

1709

++		} else if (old_raising_coeff > 1) {

1710

++			if (idle_for_long_time)

1711

++				bfqq->raising_cur_max_time =

1712

++					bfq_wrais_duration(bfqd);

1713

++			else if (bfqq->raising_cur_max_time ==

1714

++				 bfqd->bfq_raising_rt_max_time &&

1715

++				 !soft_rt) {

1716

++				bfqq->raising_coeff = 1;

1717

++				bfq_log_bfqq(bfqd, bfqq,

1718

++					     "wrais ending at %lu, "

1719

++					     "rais_max_time %u",

1720

++					     jiffies,

1721

++					     jiffies_to_msecs(bfqq->

1722

++						raising_cur_max_time));

1723

++			} else if (time_before(

1724

++					bfqq->last_rais_start_finish +

1725

++					bfqq->raising_cur_max_time,

1726

++					jiffies +

1727

++					bfqd->bfq_raising_rt_max_time) &&

1728

++				   soft_rt) {

1729

++				/*

1730

++				 *

1731

++				 * The remaining weight-raising time is lower

1732

++				 * than bfqd->bfq_raising_rt_max_time, which

1733

++				 * means that the application is enjoying

1734

++				 * weight raising either because deemed soft-

1735

++				 * rt in the near past, or because deemed

1736

++				 * interactive long ago. In both cases,

1737

++				 * resetting now the current remaining weight-

1738

++				 * raising time for the application to the

1739

++				 * weight-raising duration for soft rt

1740

++				 * applications would not cause any latency

1741

++				 * increase for the application (as the new

1742

++				 * duration would be higher than the remaining

1743

++				 * time).

1744

++				 *

1745

++				 * In addition, the application is now meeting

1746

++				 * the requirements for being deemed soft rt.

1747

++				 * In the end we can correctly and safely

1748

++				 * (re)charge the weight-raising duration for

1749

++				 * the application with the weight-raising

1750

++				 * duration for soft rt applications.

1751

++				 *

1752

++				 * In particular, doing this recharge now, i.e.,

1753

++				 * before the weight-raising period for the

1754

++				 * application finishes, reduces the probability

1755

++				 * of the following negative scenario:

1756

++				 * 1) the weight of a soft rt application is

1757

++				 *    raised at startup (as for any newly

1758

++				 *    created application),

1759

++				 * 2) since the application is not interactive,

1760

++				 *    at a certain time weight-raising is

1761

++				 *    stopped for the application,

1762

++				 * 3) at that time the application happens to

1763

++				 *    still have pending requests, and hence

1764

++				 *    is destined to not have a chance to be

1765

++				 *    deemed soft rt before these requests are

1766

++				 *    completed (see the comments to the

1767

++				 *    function bfq_bfqq_softrt_next_start()

1768

++				 *    for details on soft rt detection),

1769

++				 * 4) these pending requests experience a high

1770

++				 *    latency because the application is not

1771

++				 *    weight-raised while they are pending.

1772

++				 */

1773

++				bfqq->last_rais_start_finish = jiffies;

1774

++				bfqq->raising_cur_max_time =

1775

++					bfqd->bfq_raising_rt_max_time;

1776

++			}

1777

++		}

1778

++		if (old_raising_coeff != bfqq->raising_coeff)

1779

++			entity->ioprio_changed = 1;

1780

++add_bfqq_busy:

1781

++		bfqq->last_idle_bklogged = jiffies;

1782

++		bfqq->service_from_backlogged = 0;

1783

++		bfq_clear_bfqq_softrt_update(bfqq);

1784

++		bfq_add_bfqq_busy(bfqd, bfqq);

1785

++	} else {

1786

++		if (bfqd->low_latency && old_raising_coeff == 1 &&

1787

++			!rq_is_sync(rq) &&

1788

++			time_is_before_jiffies(

1789

++				bfqq->last_rais_start_finish +

1790

++				bfqd->bfq_raising_min_inter_arr_async)) {

1791

++			bfqq->raising_coeff = bfqd->bfq_raising_coeff;

1792

++			bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);

1793

++

1794

++			bfqd->raised_busy_queues++;

1795

++			entity->ioprio_changed = 1;

1796

++			bfq_log_bfqq(bfqd, bfqq,

1797

++				     "non-idle wrais starting at %lu, "

1798

++				     "rais_max_time %u",

1799

++				     jiffies,

1800

++				     jiffies_to_msecs(bfqq->

1801

++					raising_cur_max_time));

1802

++		}

1803

++		bfq_updated_next_req(bfqd, bfqq);

1804

++	}

1805

++

1806

++	if (bfqd->low_latency &&

1807

++		(old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||

1808

++		 idle_for_long_time))

1809

++		bfqq->last_rais_start_finish = jiffies;

1810

++}

1811

++

1812

++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)

1813

++{

1814

++	elv_rb_del(&bfqq->sort_list, rq);

1815

++	bfqq->queued[rq_is_sync(rq)]--;

1816

++	bfqq->bfqd->queued--;

1817

++	bfq_add_rq_rb(rq);

1818

++}

1819

++

1820

++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,

1821

++					  struct bio *bio)

1822

++{

1823

++	struct task_struct *tsk = current;

1824

++	struct bfq_io_cq *bic;

1825

++	struct bfq_queue *bfqq;

1826

++

1827

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

1828

++	if (bic == NULL)

1829

++		return NULL;

1830

++

1831

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

1832

++	if (bfqq != NULL)

1833

++		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

1834

++

1835

++	return NULL;

1836

++}

1837

++

1838

++static void bfq_activate_request(struct request_queue *q, struct request *rq)

1839

++{

1840

++	struct bfq_data *bfqd = q->elevator->elevator_data;

1841

++

1842

++	bfqd->rq_in_driver++;

1843

++	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);

1844

++	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",

1845

++		(long long unsigned)bfqd->last_position);

1846

++}

1847

++

1848

++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)

1849

++{

1850

++	struct bfq_data *bfqd = q->elevator->elevator_data;

1851

++

1852

++	WARN_ON(bfqd->rq_in_driver == 0);

1853

++	bfqd->rq_in_driver--;

1854

++}

1855

++

1856

++static void bfq_remove_request(struct request *rq)

1857

++{

1858

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

1859

++	struct bfq_data *bfqd = bfqq->bfqd;

1860

++

1861

++	if (bfqq->next_rq == rq) {

1862

++		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);

1863

++		bfq_updated_next_req(bfqd, bfqq);

1864

++	}

1865

++

1866

++	list_del_init(&rq->queuelist);

1867

++	bfq_del_rq_rb(rq);

1868

++

1869

++	if (rq->cmd_flags & REQ_META) {

1870

++		WARN_ON(bfqq->meta_pending == 0);

1871

++		bfqq->meta_pending--;

1872

++	}

1873

++}

1874

++

1875

++static int bfq_merge(struct request_queue *q, struct request **req,

1876

++		     struct bio *bio)

1877

++{

1878

++	struct bfq_data *bfqd = q->elevator->elevator_data;

1879

++	struct request *__rq;

1880

++

1881

++	__rq = bfq_find_rq_fmerge(bfqd, bio);

1882

++	if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {

1883

++		*req = __rq;

1884

++		return ELEVATOR_FRONT_MERGE;

1885

++	}

1886

++

1887

++	return ELEVATOR_NO_MERGE;

1888

++}

1889

++

1890

++static void bfq_merged_request(struct request_queue *q, struct request *req,

1891

++			       int type)

1892

++{

1893

++	if (type == ELEVATOR_FRONT_MERGE) {

1894

++		struct bfq_queue *bfqq = RQ_BFQQ(req);

1895

++

1896

++		bfq_reposition_rq_rb(bfqq, req);

1897

++	}

1898

++}

1899

++

1900

++static void bfq_merged_requests(struct request_queue *q, struct request *rq,

1901

++				struct request *next)

1902

++{

1903

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

1904

++

1905

++	/*

1906

++	 * Reposition in fifo if next is older than rq.

1907

++	 */

1908

++	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&

1909

++	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {

1910

++		list_move(&rq->queuelist, &next->queuelist);

1911

++		rq_set_fifo_time(rq, rq_fifo_time(next));

1912

++	}

1913

++

1914

++	if (bfqq->next_rq == next)

1915

++		bfqq->next_rq = rq;

1916

++

1917

++	bfq_remove_request(next);

1918

++}

1919

++

1920

++/* Must be called with bfqq != NULL */

1921

++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)

1922

++{

1923

++	BUG_ON(bfqq == NULL);

1924

++	if (bfq_bfqq_busy(bfqq))

1925

++		bfqq->bfqd->raised_busy_queues--;

1926

++	bfqq->raising_coeff = 1;

1927

++	bfqq->raising_cur_max_time = 0;

1928

++	/* Trigger a weight change on the next activation of the queue */

1929

++	bfqq->entity.ioprio_changed = 1;

1930

++}

1931

++

1932

++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,

1933

++					struct bfq_group *bfqg)

1934

++{

1935

++	int i, j;

1936

++

1937

++	for (i = 0; i < 2; i++)

1938

++		for (j = 0; j < IOPRIO_BE_NR; j++)

1939

++			if (bfqg->async_bfqq[i][j] != NULL)

1940

++				bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);

1941

++	if (bfqg->async_idle_bfqq != NULL)

1942

++		bfq_bfqq_end_raising(bfqg->async_idle_bfqq);

1943

++}

1944

++

1945

++static void bfq_end_raising(struct bfq_data *bfqd)

1946

++{

1947

++	struct bfq_queue *bfqq;

1948

++

1949

++	spin_lock_irq(bfqd->queue->queue_lock);

1950

++

1951

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)

1952

++		bfq_bfqq_end_raising(bfqq);

1953

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)

1954

++		bfq_bfqq_end_raising(bfqq);

1955

++	bfq_end_raising_async(bfqd);

1956

++

1957

++	spin_unlock_irq(bfqd->queue->queue_lock);

1958

++}

1959

++

1960

++static int bfq_allow_merge(struct request_queue *q, struct request *rq,

1961

++			   struct bio *bio)

1962

++{

1963

++	struct bfq_data *bfqd = q->elevator->elevator_data;

1964

++	struct bfq_io_cq *bic;

1965

++	struct bfq_queue *bfqq;

1966

++

1967

++	/*

1968

++	 * Disallow merge of a sync bio into an async request.

1969

++	 */

1970

++	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

1971

++		return 0;

1972

++

1973

++	/*

1974

++	 * Lookup the bfqq that this bio will be queued with. Allow

1975

++	 * merge only if rq is queued there.

1976

++	 * Queue lock is held here.

1977

++	 */

1978

++	bic = bfq_bic_lookup(bfqd, current->io_context);

1979

++	if (bic == NULL)

1980

++		return 0;

1981

++

1982

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

1983

++	return bfqq == RQ_BFQQ(rq);

1984

++}

1985

++

1986

++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

1987

++				       struct bfq_queue *bfqq)

1988

++{

1989

++	if (bfqq != NULL) {

1990

++		bfq_mark_bfqq_must_alloc(bfqq);

1991

++		bfq_mark_bfqq_budget_new(bfqq);

1992

++		bfq_clear_bfqq_fifo_expire(bfqq);

1993

++

1994

++		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

1995

++

1996

++		bfq_log_bfqq(bfqd, bfqq,

1997

++			     "set_in_service_queue, cur-budget = %lu",

1998

++			     bfqq->entity.budget);

1999

++	}

2000

++

2001

++	bfqd->in_service_queue = bfqq;

2002

++}

2003

++

2004

++/*

2005

++ * Get and set a new queue for service.

2006

++ */

2007

++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,

2008

++						  struct bfq_queue *bfqq)

2009

++{

2010

++	if (!bfqq)

2011

++		bfqq = bfq_get_next_queue(bfqd);

2012

++	else

2013

++		bfq_get_next_queue_forced(bfqd, bfqq);

2014

++

2015

++	__bfq_set_in_service_queue(bfqd, bfqq);

2016

++	return bfqq;

2017

++}

2018

++

2019

++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,

2020

++					  struct request *rq)

2021

++{

2022

++	if (blk_rq_pos(rq) >= bfqd->last_position)

2023

++		return blk_rq_pos(rq) - bfqd->last_position;

2024

++	else

2025

++		return bfqd->last_position - blk_rq_pos(rq);

2026

++}

2027

++

2028

++/*

2029

++ * Return true if rq is close enough to bfqd->last_position, i.e., if

2030

++ * the distance between the start of rq and bfqd->last_position does

2031

++ * not exceed BFQQ_SEEK_THR.

2032

++ */

2033

++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)

2034

++{

2035

++	return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;

2036

++}

2037

++

2038

++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

2039

++{

2040

++	struct rb_root *root = &bfqd->rq_pos_tree;

2041

++	struct rb_node *parent, *node;

2042

++	struct bfq_queue *__bfqq;

2043

++	sector_t sector = bfqd->last_position;

2044

++

2045

++	if (RB_EMPTY_ROOT(root))

2046

++		return NULL;

2047

++

2048

++	/*

2049

++	 * First, if we find a request starting at the end of the last

2050

++	 * request, choose it.

2051

++	 */

2052

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);

2053

++	if (__bfqq != NULL)

2054

++		return __bfqq;

2055

++

2056

++	/*

2057

++	 * If the exact sector wasn't found, the parent of the NULL leaf

2058

++	 * will contain the closest sector (rq_pos_tree sorted by next_request

2059

++	 * position).

2060

++	 */

2061

++	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);

2062

++	if (bfq_rq_close(bfqd, __bfqq->next_rq))

2063

++		return __bfqq;

2064

++

2065

++	if (blk_rq_pos(__bfqq->next_rq) < sector)

2066

++		node = rb_next(&__bfqq->pos_node);

2067

++	else

2068

++		node = rb_prev(&__bfqq->pos_node);

2069

++	if (node == NULL)

2070

++		return NULL;

2071

++

2072

++	__bfqq = rb_entry(node, struct bfq_queue, pos_node);

2073

++	if (bfq_rq_close(bfqd, __bfqq->next_rq))

2074

++		return __bfqq;

2075

++

2076

++	return NULL;

2077

++}

2078

++

2079

++/*

2080

++ * bfqd - obvious

2081

++ * cur_bfqq - passed in so that we don't decide that the current queue

2082

++ *            is closely cooperating with itself.

2083

++ *

2084

++ * We are assuming that cur_bfqq has dispatched at least one request,

2085

++ * and that bfqd->last_position reflects a position on the disk associated

2086

++ * with the I/O issued by cur_bfqq.

2087

++ */

2088

++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

2089

++					      struct bfq_queue *cur_bfqq)

2090

++{

2091

++	struct bfq_queue *bfqq;

2092

++

2093

++	if (bfq_class_idle(cur_bfqq))

2094

++		return NULL;

2095

++	if (!bfq_bfqq_sync(cur_bfqq))

2096

++		return NULL;

2097

++	if (BFQQ_SEEKY(cur_bfqq))

2098

++		return NULL;

2099

++

2100

++	/* If device has only one backlogged bfq_queue, don't search. */

2101

++	if (bfqd->busy_queues == 1)

2102

++		return NULL;

2103

++

2104

++	/*

2105

++	 * We should notice if some of the queues are cooperating, e.g.

2106

++	 * working closely on the same area of the disk. In that case,

2107

++	 * we can group them together and don't waste time idling.

2108

++	 */

2109

++	bfqq = bfqq_close(bfqd);

2110

++	if (bfqq == NULL || bfqq == cur_bfqq)

2111

++		return NULL;

2112

++

2113

++	/*

2114

++	 * Do not merge queues from different bfq_groups.

2115

++	 */

2116

++	if (bfqq->entity.parent != cur_bfqq->entity.parent)

2117

++		return NULL;

2118

++

2119

++	/*

2120

++	 * It only makes sense to merge sync queues.

2121

++	 */

2122

++	if (!bfq_bfqq_sync(bfqq))

2123

++		return NULL;

2124

++	if (BFQQ_SEEKY(bfqq))

2125

++		return NULL;

2126

++

2127

++	/*

2128

++	 * Do not merge queues of different priority classes.

2129

++	 */

2130

++	if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))

2131

++		return NULL;

2132

++

2133

++	return bfqq;

2134

++}

2135

++

2136

++/*

2137

++ * If enough samples have been computed, return the current max budget

2138

++ * stored in bfqd, which is dynamically updated according to the

2139

++ * estimated disk peak rate; otherwise return the default max budget

2140

++ */

2141

++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)

2142

++{

2143

++	if (bfqd->budgets_assigned < 194)

2144

++		return bfq_default_max_budget;

2145

++	else

2146

++		return bfqd->bfq_max_budget;

2147

++}

2148

++

2149

++/*

2150

++ * Return min budget, which is a fraction of the current or default

2151

++ * max budget (trying with 1/32)

2152

++ */

2153

++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)

2154

++{

2155

++	if (bfqd->budgets_assigned < 194)

2156

++		return bfq_default_max_budget / 32;

2157

++	else

2158

++		return bfqd->bfq_max_budget / 32;

2159

++}

2160

++

2161

++/*

2162

++ * Decides whether idling should be disabled for the given device and

2163

++ * the given in-service queue.

2164

++ */

2165

++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,

2166

++					   struct bfq_queue *in_service_bfqq)

2167

++{

2168

++	if (in_service_bfqq == NULL)

2169

++		return false;

2170

++	/*

2171

++	 * If the device is non-rotational, and hence has no seek penalty,

2172

++	 * disable idling; but do so only if:

2173

++	 * - device supports queuing (hw_tag set), otherwise we still have

2174

++	 *   a problem with sync vs async workloads;

2175

++	 * - the queue is not weight-raised, to preserve guarantees.

2176

++	 */

2177

++	return blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&

2178

++	       (in_service_bfqq->raising_coeff == 1);

2179

++}

2180

++

2181

++static void bfq_arm_slice_timer(struct bfq_data *bfqd)

2182

++{

2183

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2184

++	struct bfq_io_cq *bic;

2185

++	unsigned long sl;

2186

++

2187

++	WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

2188

++

2189

++	/* Tasks have exited, don't wait. */

2190

++	bic = bfqd->in_service_bic;

2191

++	if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)

2192

++		return;

2193

++

2194

++	bfq_mark_bfqq_wait_request(bfqq);

2195

++

2196

++	/*

2197

++	 * We don't want to idle for seeks, but we do want to allow

2198

++	 * fair distribution of slice time for a process doing back-to-back

2199

++	 * seeks. So allow a little bit of time for it to submit a new rq.

2200

++	 *

2201

++	 * To prevent processes with (partly) seeky workloads from

2202

++	 * being too ill-treated, grant them a small fraction of the

2203

++	 * assigned budget before reducing the waiting time to

2204

++	 * BFQ_MIN_TT. This happened to help reduce latency.

2205

++	 */

2206

++	sl = bfqd->bfq_slice_idle;

2207

++	if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&

2208

++	    bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&

2209

++	    bfqq->raising_coeff == 1)

2210

++		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));

2211

++	else if (bfqq->raising_coeff > 1)

2212

++		sl = sl * 3;

2213

++	bfqd->last_idling_start = ktime_get();

2214

++	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);

2215

++	bfq_log(bfqd, "arm idle: %u/%u ms",

2216

++		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));

2217

++}

2218

++

2219

++/*

2220

++ * Set the maximum time for the in-service queue to consume its

2221

++ * budget. This prevents seeky processes from lowering the disk

2222

++ * throughput (always guaranteed with a time slice scheme as in CFQ).

2223

++ */

2224

++static void bfq_set_budget_timeout(struct bfq_data *bfqd)

2225

++{

2226

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2227

++	unsigned int timeout_coeff;

2228

++	if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)

2229

++		timeout_coeff = 1;

2230

++	else

2231

++		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

2232

++

2233

++	bfqd->last_budget_start = ktime_get();

2234

++

2235

++	bfq_clear_bfqq_budget_new(bfqq);

2236

++	bfqq->budget_timeout = jiffies +

2237

++		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;

2238

++

2239

++	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",

2240

++		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *

2241

++		timeout_coeff));

2242

++}

2243

++

2244

++/*

2245

++ * Move request from internal lists to the request queue dispatch list.

2246

++ */

2247

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)

2248

++{

2249

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2250

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2251

++

2252

++	bfq_remove_request(rq);

2253

++	bfqq->dispatched++;

2254

++	elv_dispatch_sort(q, rq);

2255

++

2256

++	if (bfq_bfqq_sync(bfqq))

2257

++		bfqd->sync_flight++;

2258

++}

2259

++

2260

++/*

2261

++ * Return expired entry, or NULL to just start from scratch in rbtree.

2262

++ */

2263

++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

2264

++{

2265

++	struct request *rq = NULL;

2266

++

2267

++	if (bfq_bfqq_fifo_expire(bfqq))

2268

++		return NULL;

2269

++

2270

++	bfq_mark_bfqq_fifo_expire(bfqq);

2271

++

2272

++	if (list_empty(&bfqq->fifo))

2273

++		return NULL;

2274

++

2275

++	rq = rq_entry_fifo(bfqq->fifo.next);

2276

++

2277

++	if (time_before(jiffies, rq_fifo_time(rq)))

2278

++		return NULL;

2279

++

2280

++	return rq;

2281

++}

2282

++

2283

++/*

2284

++ * Must be called with the queue_lock held.

2285

++ */

2286

++static int bfqq_process_refs(struct bfq_queue *bfqq)

2287

++{

2288

++	int process_refs, io_refs;

2289

++

2290

++	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

2291

++	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

2292

++	BUG_ON(process_refs < 0);

2293

++	return process_refs;

2294

++}

2295

++

2296

++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

2297

++{

2298

++	int process_refs, new_process_refs;

2299

++	struct bfq_queue *__bfqq;

2300

++

2301

++	/*

2302

++	 * If there are no process references on the new_bfqq, then it is

2303

++	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

2304

++	 * may have dropped their last reference (not just their last process

2305

++	 * reference).

2306

++	 */

2307

++	if (!bfqq_process_refs(new_bfqq))

2308

++		return;

2309

++

2310

++	/* Avoid a circular list and skip interim queue merges. */

2311

++	while ((__bfqq = new_bfqq->new_bfqq)) {

2312

++		if (__bfqq == bfqq)

2313

++			return;

2314

++		new_bfqq = __bfqq;

2315

++	}

2316

++

2317

++	process_refs = bfqq_process_refs(bfqq);

2318

++	new_process_refs = bfqq_process_refs(new_bfqq);

2319

++	/*

2320

++	 * If the process for the bfqq has gone away, there is no

2321

++	 * sense in merging the queues.

2322

++	 */

2323

++	if (process_refs == 0 || new_process_refs == 0)

2324

++		return;

2325

++

2326

++	/*

2327

++	 * Merge in the direction of the lesser amount of work.

2328

++	 */

2329

++	if (new_process_refs >= process_refs) {

2330

++		bfqq->new_bfqq = new_bfqq;

2331

++		atomic_add(process_refs, &new_bfqq->ref);

2332

++	} else {

2333

++		new_bfqq->new_bfqq = bfqq;

2334

++		atomic_add(new_process_refs, &bfqq->ref);

2335

++	}

2336

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

2337

++		new_bfqq->pid);

2338

++}

2339

++

2340

++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)

2341

++{

2342

++	struct bfq_entity *entity = &bfqq->entity;

2343

++	return entity->budget - entity->service;

2344

++}

2345

++

2346

++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2347

++{

2348

++	BUG_ON(bfqq != bfqd->in_service_queue);

2349

++

2350

++	__bfq_bfqd_reset_in_service(bfqd);

2351

++

2352

++	/*

2353

++	 * If this bfqq is shared between multiple processes, check

2354

++	 * to make sure that those processes are still issuing I/Os

2355

++	 * within the mean seek distance. If not, it may be time to

2356

++	 * break the queues apart again.

2357

++	 */

2358

++	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))

2359

++		bfq_mark_bfqq_split_coop(bfqq);

2360

++

2361

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2362

++		/*

2363

++		 * overloading budget_timeout field to store when

2364

++		 * the queue remains with no backlog, used by

2365

++		 * the weight-raising mechanism

2366

++		 */

2367

++		bfqq->budget_timeout = jiffies;

2368

++		bfq_del_bfqq_busy(bfqd, bfqq, 1);

2369

++	} else {

2370

++		bfq_activate_bfqq(bfqd, bfqq);

2371

++		/*

2372

++		 * Resort priority tree of potential close cooperators.

2373

++		 */

2374

++		bfq_rq_pos_tree_add(bfqd, bfqq);

2375

++	}

2376

++}

2377

++

2378

++/**

2379

++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.

2380

++ * @bfqd: device data.

2381

++ * @bfqq: queue to update.

2382

++ * @reason: reason for expiration.

2383

++ *

2384

++ * Handle the feedback on @bfqq budget.  See the body for detailed

2385

++ * comments.

2386

++ */

2387

++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

2388

++				     struct bfq_queue *bfqq,

2389

++				     enum bfqq_expiration reason)

2390

++{

2391

++	struct request *next_rq;

2392

++	unsigned long budget, min_budget;

2393

++

2394

++	budget = bfqq->max_budget;

2395

++	min_budget = bfq_min_budget(bfqd);

2396

++

2397

++	BUG_ON(bfqq != bfqd->in_service_queue);

2398

++

2399

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",

2400

++		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));

2401

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",

2402

++		budget, bfq_min_budget(bfqd));

2403

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",

2404

++		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

2405

++

2406

++	if (bfq_bfqq_sync(bfqq)) {

2407

++		switch (reason) {

2408

++		/*

2409

++		 * Caveat: in all the following cases we trade latency

2410

++		 * for throughput.

2411

++		 */

2412

++		case BFQ_BFQQ_TOO_IDLE:

2413

++			/*

2414

++			 * This is the only case where we may reduce

2415

++			 * the budget: if there is no request of the

2416

++			 * process still waiting for completion, then

2417

++			 * we assume (tentatively) that the timer has

2418

++			 * expired because the batch of requests of

2419

++			 * the process could have been served with a

2420

++			 * smaller budget.  Hence, betting that

2421

++			 * process will behave in the same way when it

2422

++			 * becomes backlogged again, we reduce its

2423

++			 * next budget.  As long as we guess right,

2424

++			 * this budget cut reduces the latency

2425

++			 * experienced by the process.

2426

++			 *

2427

++			 * However, if there are still outstanding

2428

++			 * requests, then the process may have not yet

2429

++			 * issued its next request just because it is

2430

++			 * still waiting for the completion of some of

2431

++			 * the still outstanding ones.  So in this

2432

++			 * subcase we do not reduce its budget, on the

2433

++			 * contrary we increase it to possibly boost

2434

++			 * the throughput, as discussed in the

2435

++			 * comments to the BUDGET_TIMEOUT case.

2436

++			 */

2437

++			if (bfqq->dispatched > 0) /* still outstanding reqs */

2438

++				budget = min(budget * 2, bfqd->bfq_max_budget);

2439

++			else {

2440

++				if (budget > 5 * min_budget)

2441

++					budget -= 4 * min_budget;

2442

++				else

2443

++					budget = min_budget;

2444

++			}

2445

++			break;

2446

++		case BFQ_BFQQ_BUDGET_TIMEOUT:

2447

++			/*

2448

++			 * We double the budget here because: 1) it

2449

++			 * gives the chance to boost the throughput if

2450

++			 * this is not a seeky process (which may have

2451

++			 * bumped into this timeout because of, e.g.,

2452

++			 * ZBR), 2) together with charge_full_budget

2453

++			 * it helps give seeky processes higher

2454

++			 * timestamps, and hence be served less

2455

++			 * frequently.

2456

++			 */

2457

++			budget = min(budget * 2, bfqd->bfq_max_budget);

2458

++			break;

2459

++		case BFQ_BFQQ_BUDGET_EXHAUSTED:

2460

++			/*

2461

++			 * The process still has backlog, and did not

2462

++			 * let either the budget timeout or the disk

2463

++			 * idling timeout expire. Hence it is not

2464

++			 * seeky, has a short thinktime and may be

2465

++			 * happy with a higher budget too. So

2466

++			 * definitely increase the budget of this good

2467

++			 * candidate to boost the disk throughput.

2468

++			 */

2469

++			budget = min(budget * 4, bfqd->bfq_max_budget);

2470

++			break;

2471

++		case BFQ_BFQQ_NO_MORE_REQUESTS:

2472

++		       /*

2473

++			* Leave the budget unchanged.

2474

++			*/

2475

++		default:

2476

++			return;

2477

++		}

2478

++	} else /* async queue */

2479

++	    /* async queues always get the maximum possible budget

2480

++	     * (their ability to dispatch is limited by

2481

++	     * @bfqd->bfq_max_budget_async_rq).

2482

++	     */

2483

++		budget = bfqd->bfq_max_budget;

2484

++

2485

++	bfqq->max_budget = budget;

2486

++

2487

++	if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&

2488

++	    bfqq->max_budget > bfqd->bfq_max_budget)

2489

++		bfqq->max_budget = bfqd->bfq_max_budget;

2490

++

2491

++	/*

2492

++	 * Make sure that we have enough budget for the next request.

2493

++	 * Since the finish time of the bfqq must be kept in sync with

2494

++	 * the budget, be sure to call __bfq_bfqq_expire() after the

2495

++	 * update.

2496

++	 */

2497

++	next_rq = bfqq->next_rq;

2498

++	if (next_rq != NULL)

2499

++		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,

2500

++					    bfq_serv_to_charge(next_rq, bfqq));

2501

++	else

2502

++		bfqq->entity.budget = bfqq->max_budget;

2503

++

2504

++	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",

2505

++			next_rq != NULL ? blk_rq_sectors(next_rq) : 0,

2506

++			bfqq->entity.budget);

2507

++}

2508

++

2509

++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)

2510

++{

2511

++	unsigned long max_budget;

2512

++

2513

++	/*

2514

++	 * The max_budget calculated when autotuning is equal to the

2515

++	 * amount of sectors transferred in timeout_sync at the

2516

++	 * estimated peak rate.

2517

++	 */

2518

++	max_budget = (unsigned long)(peak_rate * 1000 *

2519

++				     timeout >> BFQ_RATE_SHIFT);

2520

++

2521

++	return max_budget;

2522

++}

2523

++

2524

++/*

2525

++ * In addition to updating the peak rate, checks whether the process

2526

++ * is "slow", and returns 1 if so. This slow flag is used, in addition

2527

++ * to the budget timeout, to reduce the amount of service provided to

2528

++ * seeky processes, and hence reduce their chances to lower the

2529

++ * throughput. See the code for more details.

2530

++ */

2531

++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

2532

++				int compensate, enum bfqq_expiration reason)

2533

++{

2534

++	u64 bw, usecs, expected, timeout;

2535

++	ktime_t delta;

2536

++	int update = 0;

2537

++

2538

++	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))

2539

++		return 0;

2540

++

2541

++	if (compensate)

2542

++		delta = bfqd->last_idling_start;

2543

++	else

2544

++		delta = ktime_get();

2545

++	delta = ktime_sub(delta, bfqd->last_budget_start);

2546

++	usecs = ktime_to_us(delta);

2547

++

2548

++	/* Don't trust short/unrealistic values. */

2549

++	if (usecs < 100 || usecs >= LONG_MAX)

2550

++		return 0;

2551

++

2552

++	/*

2553

++	 * Calculate the bandwidth for the last slice.  We use a 64 bit

2554

++	 * value to store the peak rate, in sectors per usec in fixed

2555

++	 * point math.  We do so to have enough precision in the estimate

2556

++	 * and to avoid overflows.

2557

++	 */

2558

++	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;

2559

++	do_div(bw, (unsigned long)usecs);

2560

++

2561

++	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

2562

++

2563

++	/*

2564

++	 * Use only long (> 20ms) intervals to filter out spikes for

2565

++	 * the peak rate estimation.

2566

++	 */

2567

++	if (usecs > 20000) {

2568

++		if (bw > bfqd->peak_rate ||

2569

++		   (!BFQQ_SEEKY(bfqq) &&

2570

++		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {

2571

++			bfq_log(bfqd, "measured bw =%llu", bw);

2572

++			/*

2573

++			 * To smooth oscillations use a low-pass filter with

2574

++			 * alpha=7/8, i.e.,

2575

++			 * new_rate = (7/8) * old_rate + (1/8) * bw

2576

++			 */

2577

++			do_div(bw, 8);

2578

++			if (bw == 0)

2579

++				return 0;

2580

++			bfqd->peak_rate *= 7;

2581

++			do_div(bfqd->peak_rate, 8);

2582

++			bfqd->peak_rate += bw;

2583

++			update = 1;

2584

++			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);

2585

++		}

2586

++

2587

++		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;

2588

++

2589

++		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)

2590

++			bfqd->peak_rate_samples++;

2591

++

2592

++		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&

2593

++		    update && bfqd->bfq_user_max_budget == 0) {

2594

++			bfqd->bfq_max_budget =

2595

++				bfq_calc_max_budget(bfqd->peak_rate, timeout);

2596

++			bfq_log(bfqd, "new max_budget=%lu",

2597

++				bfqd->bfq_max_budget);

2598

++		}

2599

++	}

2600

++

2601

++	/*

2602

++	 * If the process has been served for too short a time

2603

++	 * interval to let its possible sequential accesses prevail on

2604

++	 * the initial seek time needed to move the disk head on the

2605

++	 * first sector it requested, then give the process a chance

2606

++	 * and for the moment return false.

2607

++	 */

2608

++	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)

2609

++		return 0;

2610

++

2611

++	/*

2612

++	 * A process is considered ``slow'' (i.e., seeky, so that we

2613

++	 * cannot treat it fairly in the service domain, as it would

2614

++	 * slow down the other processes too much) if, when a slice

2615

++	 * ends for whatever reason, it has received service at a

2616

++	 * rate that would not be high enough to complete the budget

2617

++	 * before the budget timeout expiration.

2618

++	 */

2619

++	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;

2620

++

2621

++	/*

2622

++	 * Caveat: processes doing IO in the slower disk zones will

2623

++	 * tend to be slow(er) even if not seeky. And the estimated

2624

++	 * peak rate will actually be an average over the disk

2625

++	 * surface. Hence, to not be too harsh with unlucky processes,

2626

++	 * we keep a budget/3 margin of safety before declaring a

2627

++	 * process slow.

2628

++	 */

2629

++	return expected > (4 * bfqq->entity.budget) / 3;

2630

++}

2631

++

2632

++/*

2633

++ * To be deemed as soft real-time, an application must meet two requirements.

2634

++ * First, the application must not require an average bandwidth higher than

2635

++ * the approximate bandwidth required to playback or record a compressed high-

2636

++ * definition video.

2637

++ * The next function is invoked on the completion of the last request of a

2638

++ * batch, to compute the next-start time instant, soft_rt_next_start, such

2639

++ * that, if the next request of the application does not arrive before

2640

++ * soft_rt_next_start, then the above requirement on the bandwidth is met.

2641

++ *

2642

++ * The second requirement is that the request pattern of the application is

2643

++ * isochronous, i.e., that, after issuing a request or a batch of requests,

2644

++ * the application stops issuing new requests until all its pending requests

2645

++ * have been completed. After that, the application may issue a new batch,

2646

++ * and so on.

2647

++ * For this reason the next function is invoked to compute soft_rt_next_start

2648

++ * only for applications that meet this requirement, whereas soft_rt_next_start

2649

++ * is set to infinity for applications that do not.

2650

++ *

2651

++ * Unfortunately, even a greedy application may happen to behave in an

2652

++ * isochronous way if the CPU load is high. In fact, the application may stop

2653

++ * issuing requests while the CPUs are busy serving other processes, then

2654

++ * restart, then stop again for a while, and so on. In addition, if the disk

2655

++ * achieves a low enough throughput with the request pattern issued by the

2656

++ * application (e.g., because the request pattern is random and/or the device

2657

++ * is slow), then the application may meet the above bandwidth requirement too.

2658

++ * To prevent such a greedy application from being deemed soft real-time, a

2659

++ * further rule is used in the computation of soft_rt_next_start:

2660

++ * soft_rt_next_start must be higher than the current time plus the maximum

2661

++ * time for which the arrival of a request is waited for when a sync queue

2662

++ * becomes idle, namely bfqd->bfq_slice_idle.

2663

++ * This filters out greedy applications, as the latter issue instead their next

2664

++ * request as soon as possible after the last one has been completed (in

2665

++ * contrast, when a batch of requests is completed, a soft real-time application

2666

++ * spends some time processing data).

2667

++ *

2668

++ * Unfortunately, the last filter may easily generate false positives if only

2669

++ * bfqd->bfq_slice_idle is used as a reference time interval and one or both

2670

++ * the following cases occur:

2671

++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher

2672

++ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with

2673

++ *    HZ=100.

2674

++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing

2675

++ *    for a while, then suddenly 'jump' by several units to recover the lost

2676

++ *    increments. This seems to happen, e.g., inside virtual machines.

2677

++ * To address this issue, we do not use as a reference time interval just

2678

++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In

2679

++ * particular we add the minimum number of jiffies for which the filter seems

2680

++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.

2681

++ */

2682

++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,

2683

++						       struct bfq_queue *bfqq)

2684

++{

2685

++	return max(bfqq->last_idle_bklogged +

2686

++		   HZ * bfqq->service_from_backlogged /

2687

++		   bfqd->bfq_raising_max_softrt_rate,

2688

++		   jiffies + bfqq->bfqd->bfq_slice_idle + 4);

2689

++}

2690

++

2691

++/*

2692

++ * Return the largest-possible time instant such that, for as long as possible,

2693

++ * the current time will be lower than this time instant according to the macro

2694

++ * time_is_before_jiffies().

2695

++ */

2696

++static inline unsigned long bfq_infinity_from_now(unsigned long now)

2697

++{

2698

++	return now + ULONG_MAX / 2;

2699

++}

2700

++

2701

++/**

2702

++ * bfq_bfqq_expire - expire a queue.

2703

++ * @bfqd: device owning the queue.

2704

++ * @bfqq: the queue to expire.

2705

++ * @compensate: if true, compensate for the time spent idling.

2706

++ * @reason: the reason causing the expiration.

2707

++ *

2708

++ *

2709

++ * If the process associated to the queue is slow (i.e., seeky), or in

2710

++ * case of budget timeout, or, finally, if it is async, we

2711

++ * artificially charge it an entire budget (independently of the

2712

++ * actual service it received). As a consequence, the queue will get

2713

++ * higher timestamps than the correct ones upon reactivation, and

2714

++ * hence it will be rescheduled as if it had received more service

2715

++ * than what it actually received. In the end, this class of processes

2716

++ * will receive less service in proportion to how slowly they consume

2717

++ * their budgets (and hence how seriously they tend to lower the

2718

++ * throughput).

2719

++ *

2720

++ * In contrast, when a queue expires because it has been idling for

2721

++ * too long or because it exhausted its budget, we do not touch the

2722

++ * amount of service it has received. Hence when the queue will be

2723

++ * reactivated and its timestamps updated, the latter will be in sync

2724

++ * with the actual service received by the queue until expiration.

2725

++ *

2726

++ * Charging a full budget to the first type of queues and the exact

2727

++ * service to the others has the effect of using the WF2Q+ policy to

2728

++ * schedule the former on a timeslice basis, without violating the

2729

++ * service domain guarantees of the latter.

2730

++ */

2731

++static void bfq_bfqq_expire(struct bfq_data *bfqd,

2732

++			    struct bfq_queue *bfqq,

2733

++			    int compensate,

2734

++			    enum bfqq_expiration reason)

2735

++{

2736

++	int slow;

2737

++	BUG_ON(bfqq != bfqd->in_service_queue);

2738

++

2739

++	/* Update disk peak rate for autotuning and check whether the

2740

++	 * process is slow (see bfq_update_peak_rate).

2741

++	 */

2742

++	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);

2743

++

2744

++	/*

2745

++	 * As explained above, 'punish' slow (i.e., seeky), timed-out

2746

++	 * and async queues, to favor sequential sync workloads.

2747

++	 *

2748

++	 * Processes doing IO in the slower disk zones will tend to be

2749

++	 * slow(er) even if not seeky. Hence, since the estimated peak

2750

++	 * rate is actually an average over the disk surface, these

2751

++	 * processes may timeout just for bad luck. To avoid punishing

2752

++	 * them we do not charge a full budget to a process that

2753

++	 * succeeded in consuming at least 2/3 of its budget.

2754

++	 */

2755

++	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

2756

++		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))

2757

++		bfq_bfqq_charge_full_budget(bfqq);

2758

++

2759

++	bfqq->service_from_backlogged += bfqq->entity.service;

2760

++

2761

++	if (bfqd->low_latency && bfqq->raising_coeff == 1)

2762

++		bfqq->last_rais_start_finish = jiffies;

2763

++

2764

++	if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&

2765

++	    RB_EMPTY_ROOT(&bfqq->sort_list)) {

2766

++		/*

2767

++		 * If we get here, and there are no outstanding requests,

2768

++		 * then the request pattern is isochronous (see the comments

2769

++		 * to the function bfq_bfqq_softrt_next_start()). Hence we can

2770

++		 * compute soft_rt_next_start. If, instead, the queue still

2771

++		 * has outstanding requests, then we have to wait for the

2772

++		 * completion of all the outstanding requests to discover

2773

++		 * whether the request pattern is actually isochronous.

2774

++		 */

2775

++		if (bfqq->dispatched == 0)

2776

++			bfqq->soft_rt_next_start =

2777

++				bfq_bfqq_softrt_next_start(bfqd, bfqq);

2778

++		else {

2779

++			/*

2780

++			 * The application is still waiting for the

2781

++			 * completion of one or more requests:

2782

++			 * prevent it from possibly being incorrectly

2783

++			 * deemed as soft real-time by setting its

2784

++			 * soft_rt_next_start to infinity. In fact,

2785

++			 * without this assignment, the application

2786

++			 * would be incorrectly deemed as soft

2787

++			 * real-time if:

2788

++			 * 1) it issued a new request before the

2789

++			 *    completion of all its in-flight

2790

++			 *    requests, and

2791

++			 * 2) at that time, its soft_rt_next_start

2792

++			 *    happened to be in the past.

2793

++			 */

2794

++			bfqq->soft_rt_next_start =

2795

++				bfq_infinity_from_now(jiffies);

2796

++			/*

2797

++			 * Schedule an update of soft_rt_next_start to when

2798

++			 * the task may be discovered to be isochronous.

2799

++			 */

2800

++			bfq_mark_bfqq_softrt_update(bfqq);

2801

++		}

2802

++	}

2803

++

2804

++	bfq_log_bfqq(bfqd, bfqq,

2805

++		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,

2806

++		bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

2807

++

2808

++	/* Increase, decrease or leave budget unchanged according to reason */

2809

++	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);

2810

++	__bfq_bfqq_expire(bfqd, bfqq);

2811

++}

2812

++

2813

++/*

2814

++ * Budget timeout is not implemented through a dedicated timer, but

2815

++ * just checked on request arrivals and completions, as well as on

2816

++ * idle timer expirations.

2817

++ */

2818

++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)

2819

++{

2820

++	if (bfq_bfqq_budget_new(bfqq))

2821

++		return 0;

2822

++

2823

++	if (time_before(jiffies, bfqq->budget_timeout))

2824

++		return 0;

2825

++

2826

++	return 1;

2827

++}

2828

++

2829

++/*

2830

++ * If we expire a queue that is waiting for the arrival of a new

2831

++ * request, we may prevent the fictitious timestamp back-shifting that

2832

++ * allows the guarantees of the queue to be preserved (see [1] for

2833

++ * this tricky aspect). Hence we return true only if this condition

2834

++ * does not hold, or if the queue is slow enough to deserve only to be

2835

++ * kicked off for preserving a high throughput.

2836

++*/

2837

++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)

2838

++{

2839

++	bfq_log_bfqq(bfqq->bfqd, bfqq,

2840

++		"may_budget_timeout: wr %d left %d timeout %d",

2841

++		bfq_bfqq_wait_request(bfqq),

2842

++			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,

2843

++		bfq_bfqq_budget_timeout(bfqq));

2844

++

2845

++	return (!bfq_bfqq_wait_request(bfqq) ||

2846

++		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)

2847

++		&&

2848

++		bfq_bfqq_budget_timeout(bfqq);

2849

++}

2850

++

2851

++/*

2852

++ * For weight-raised queues issuing sync requests, idling is always performed,

2853

++ * as this is instrumental in guaranteeing a high fraction of the throughput

2854

++ * to these queues, and hence in guaranteeing a lower latency for their

2855

++ * requests. See [1] for details.

2856

++ *

2857

++ * For non-weight-raised queues, idling is instead disabled if the device is

2858

++ * NCQ-enabled and non-rotational, as this boosts the throughput on such

2859

++ * devices.

2860

++ */

2861

++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)

2862

++{

2863

++	struct bfq_data *bfqd = bfqq->bfqd;

2864

++

2865

++	return bfq_bfqq_sync(bfqq) && (

2866

++		bfqq->raising_coeff > 1 ||

2867

++		(bfq_bfqq_idle_window(bfqq) &&

2868

++		 !(bfqd->hw_tag &&

2869

++		   (blk_queue_nonrot(bfqd->queue) ||

2870

++		 /*

2871

++		  * If there are weight-raised busy queues, then do not idle

2872

++		  * the disk for a sync non-weight-raised queue, and hence

2873

++		  * expire the queue immediately if empty. Combined with the

2874

++		  * timestamping rules of BFQ (see [1] for details), this

2875

++		  * causes sync non-weight-raised queues to get a lower

2876

++		  * fraction of the disk throughput, and hence reduces the rate

2877

++		  * at which the processes associated to these queues ask for

2878

++		  * requests from the request pool.

2879

++		  *

2880

++		  * This is beneficial for weight-raised processes, when the

2881

++		  * system operates in request-pool saturation conditions

2882

++		  * (e.g., in the presence of write hogs). In fact, if

2883

++		  * non-weight-raised processes ask for requests at a lower

2884

++		  * rate, then weight-raised processes have a higher

2885

++		  * probability to get a request from the pool immediately

2886

++		  * (or at least soon) when they need one. Hence they have a

2887

++		  * higher probability to actually get a fraction of the disk

2888

++		  * throughput proportional to their high weight. This is

2889

++		  * especially true with NCQ-enabled drives, which enqueue

2890

++		  * several requests in advance and further reorder

2891

++		  * internally-queued requests.

2892

++		  *

2893

++		  * Mistreating non-weight-raised queues in the above-described

2894

++		  * way, when there are busy weight-raised queues, seems to

2895

++		  * mitigate starvation problems in the presence of heavy write

2896

++		  * workloads and NCQ, and hence to guarantee a higher

2897

++		  * application and system responsiveness in these hostile

2898

++		  * scenarios.

2899

++		  */

2900

++		    bfqd->raised_busy_queues > 0)

2901

++		  )

2902

++		)

2903

++	);

2904

++}

2905

++

2906

++/*

2907

++ * If the in-service queue is empty, but it is sync and either of the following

2908

++ * conditions holds, then: 1) the queue must remain in service and cannot be

2909

++ * expired, and 2) the disk must be idled to wait for the possible arrival

2910

++ * of a new request for the queue. The conditions are:

2911

++ * - the device is rotational and not performing NCQ, and the queue has its

2912

++ *   idle window set (in this case, waiting for a new request for the queue

2913

++ *   is likely to boost the disk throughput);

2914

++ * - the queue is weight-raised (waiting for the request is necessary to

2915

++ *   provide the queue with fairness and latency guarantees, see [1] for

2916

++ *   details).

2917

++ */

2918

++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)

2919

++{

2920

++	struct bfq_data *bfqd = bfqq->bfqd;

2921

++

2922

++	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&

2923

++	       bfq_bfqq_must_not_expire(bfqq) &&

2924

++	       !bfq_queue_nonrot_noidle(bfqd, bfqq);

2925

++}

2926

++

2927

++/*

2928

++ * Select a queue for service.  If we have a current queue in service,

2929

++ * check whether to continue servicing it, or retrieve and set a new one.

2930

++ */

2931

++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

2932

++{

2933

++	struct bfq_queue *bfqq, *new_bfqq = NULL;

2934

++	struct request *next_rq;

2935

++	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

2936

++

2937

++	bfqq = bfqd->in_service_queue;

2938

++	if (bfqq == NULL)

2939

++		goto new_queue;

2940

++

2941

++	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

2942

++

2943

++	/*

2944

++         * If another queue has a request waiting within our mean seek

2945

++         * distance, let it run. The expire code will check for close

2946

++         * cooperators and put the close queue at the front of the

2947

++         * service tree. If possible, merge the expiring queue with the

2948

++         * new bfqq.

2949

++         */

2950

++        new_bfqq = bfq_close_cooperator(bfqd, bfqq);

2951

++        if (new_bfqq != NULL && bfqq->new_bfqq == NULL)

2952

++                bfq_setup_merge(bfqq, new_bfqq);

2953

++

2954

++	if (bfq_may_expire_for_budg_timeout(bfqq) &&

2955

++	    !timer_pending(&bfqd->idle_slice_timer) &&

2956

++	    !bfq_bfqq_must_idle(bfqq))

2957

++		goto expire;

2958

++

2959

++	next_rq = bfqq->next_rq;

2960

++	/*

2961

++	 * If bfqq has requests queued and it has enough budget left to

2962

++	 * serve them, keep the queue, otherwise expire it.

2963

++	 */

2964

++	if (next_rq != NULL) {

2965

++		if (bfq_serv_to_charge(next_rq, bfqq) >

2966

++			bfq_bfqq_budget_left(bfqq)) {

2967

++			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;

2968

++			goto expire;

2969

++		} else {

2970

++			/*

2971

++			 * The idle timer may be pending because we may not

2972

++			 * disable disk idling even when a new request arrives

2973

++			 */

2974

++			if (timer_pending(&bfqd->idle_slice_timer)) {

2975

++				/*

2976

++				 * If we get here: 1) at least a new request

2977

++				 * has arrived but we have not disabled the

2978

++				 * timer because the request was too small,

2979

++				 * 2) then the block layer has unplugged the

2980

++				 * device, causing the dispatch to be invoked.

2981

++				 *

2982

++				 * Since the device is unplugged, now the

2983

++				 * requests are probably large enough to

2984

++				 * provide a reasonable throughput.

2985

++				 * So we disable idling.

2986

++				 */

2987

++				bfq_clear_bfqq_wait_request(bfqq);

2988

++				del_timer(&bfqd->idle_slice_timer);

2989

++			}

2990

++			if (new_bfqq == NULL)

2991

++				goto keep_queue;

2992

++			else

2993

++				goto expire;

2994

++		}

2995

++	}

2996

++

2997

++	/*

2998

++	 * No requests pending.  If the in-service queue has no cooperator and

2999

++	 * still has requests in flight (possibly waiting for a completion)

3000

++	 * or is idling for a new request, then keep it.

3001

++	 */

3002

++	if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||

3003

++	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {

3004

++		bfqq = NULL;

3005

++		goto keep_queue;

3006

++	} else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {

3007

++		/*

3008

++		 * Expiring the queue because there is a close cooperator,

3009

++		 * cancel timer.

3010

++		 */

3011

++		bfq_clear_bfqq_wait_request(bfqq);

3012

++		del_timer(&bfqd->idle_slice_timer);

3013

++	}

3014

++

3015

++	reason = BFQ_BFQQ_NO_MORE_REQUESTS;

3016

++expire:

3017

++	bfq_bfqq_expire(bfqd, bfqq, 0, reason);

3018

++new_queue:

3019

++	bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);

3020

++	bfq_log(bfqd, "select_queue: new queue %d returned",

3021

++		bfqq != NULL ? bfqq->pid : 0);

3022

++keep_queue:

3023

++	return bfqq;

3024

++}

3025

++

3026

++static void bfq_update_raising_data(struct bfq_data *bfqd,

3027

++				    struct bfq_queue *bfqq)

3028

++{

3029

++	if (bfqq->raising_coeff > 1) { /* queue is being boosted */

3030

++		struct bfq_entity *entity = &bfqq->entity;

3031

++

3032

++		bfq_log_bfqq(bfqd, bfqq,

3033

++			"raising period dur %u/%u msec, "

3034

++			"old raising coeff %u, w %d(%d)",

3035

++			jiffies_to_msecs(jiffies -

3036

++				bfqq->last_rais_start_finish),

3037

++			jiffies_to_msecs(bfqq->raising_cur_max_time),

3038

++			bfqq->raising_coeff,

3039

++			bfqq->entity.weight, bfqq->entity.orig_weight);

3040

++

3041

++		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=

3042

++			entity->orig_weight * bfqq->raising_coeff);

3043

++		if (entity->ioprio_changed)

3044

++			bfq_log_bfqq(bfqd, bfqq,

3045

++			"WARN: pending prio change");

3046

++		/*

3047

++		 * If too much time has elapsed from the beginning

3048

++		 * of this weight-raising, stop it.

3049

++		 */

3050

++		if (time_is_before_jiffies(bfqq->last_rais_start_finish +

3051

++					   bfqq->raising_cur_max_time)) {

3052

++			bfqq->last_rais_start_finish = jiffies;

3053

++			bfq_log_bfqq(bfqd, bfqq,

3054

++				     "wrais ending at %lu, "

3055

++				     "rais_max_time %u",

3056

++				     bfqq->last_rais_start_finish,

3057

++				     jiffies_to_msecs(bfqq->

3058

++					raising_cur_max_time));

3059

++			bfq_bfqq_end_raising(bfqq);

3060

++			__bfq_entity_update_weight_prio(

3061

++				bfq_entity_service_tree(entity),

3062

++				entity);

3063

++		}

3064

++	}

3065

++}

3066

++

3067

++/*

3068

++ * Dispatch one request from bfqq, moving it to the request queue

3069

++ * dispatch list.

3070

++ */

3071

++static int bfq_dispatch_request(struct bfq_data *bfqd,

3072

++				struct bfq_queue *bfqq)

3073

++{

3074

++	int dispatched = 0;

3075

++	struct request *rq;

3076

++	unsigned long service_to_charge;

3077

++

3078

++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));

3079

++

3080

++	/* Follow expired path, else get first next available. */

3081

++	rq = bfq_check_fifo(bfqq);

3082

++	if (rq == NULL)

3083

++		rq = bfqq->next_rq;

3084

++	service_to_charge = bfq_serv_to_charge(rq, bfqq);

3085

++

3086

++	if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {

3087

++		/*

3088

++		 * This may happen if the next rq is chosen

3089

++		 * in fifo order instead of sector order.

3090

++		 * The budget is properly dimensioned

3091

++		 * to be always sufficient to serve the next request

3092

++		 * only if it is chosen in sector order. The reason is

3093

++		 * that it would be quite inefficient and of little use

3094

++		 * to always make sure that the budget is large enough

3095

++		 * to serve even the possible next rq in fifo order.

3096

++		 * In fact, requests are seldom served in fifo order.

3097

++		 *

3098

++		 * Expire the queue for budget exhaustion, and

3099

++		 * make sure that the next act_budget is enough

3100

++		 * to serve the next request, even if it comes

3101

++		 * from the fifo expired path.

3102

++		 */

3103

++		bfqq->next_rq = rq;

3104

++		/*

3105

++		 * Since this dispatch has failed, make sure that

3106

++		 * a new one will be performed

3107

++		 */

3108

++		if (!bfqd->rq_in_driver)

3109

++			bfq_schedule_dispatch(bfqd);

3110

++		goto expire;

3111

++	}

3112

++

3113

++	/* Finally, insert request into driver dispatch list. */

3114

++	bfq_bfqq_served(bfqq, service_to_charge);

3115

++	bfq_dispatch_insert(bfqd->queue, rq);

3116

++

3117

++	bfq_update_raising_data(bfqd, bfqq);

3118

++

3119

++	bfq_log_bfqq(bfqd, bfqq,

3120

++			"dispatched %u sec req (%llu), budg left %lu",

3121

++			blk_rq_sectors(rq),

3122

++			(long long unsigned)blk_rq_pos(rq),

3123

++			bfq_bfqq_budget_left(bfqq));

3124

++

3125

++	dispatched++;

3126

++

3127

++	if (bfqd->in_service_bic == NULL) {

3128

++		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);

3129

++		bfqd->in_service_bic = RQ_BIC(rq);

3130

++	}

3131

++

3132

++	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&

3133

++	    dispatched >= bfqd->bfq_max_budget_async_rq) ||

3134

++	    bfq_class_idle(bfqq)))

3135

++		goto expire;

3136

++

3137

++	return dispatched;

3138

++

3139

++expire:

3140

++	bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);

3141

++	return dispatched;

3142

++}

3143

++

3144

++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)

3145

++{

3146

++	int dispatched = 0;

3147

++

3148

++	while (bfqq->next_rq != NULL) {

3149

++		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);

3150

++		dispatched++;

3151

++	}

3152

++

3153

++	BUG_ON(!list_empty(&bfqq->fifo));

3154

++	return dispatched;

3155

++}

3156

++

3157

++/*

3158

++ * Drain our current requests.  Used for barriers and when switching

3159

++ * io schedulers on-the-fly.

3160

++ */

3161

++static int bfq_forced_dispatch(struct bfq_data *bfqd)

3162

++{

3163

++	struct bfq_queue *bfqq, *n;

3164

++	struct bfq_service_tree *st;

3165

++	int dispatched = 0;

3166

++

3167

++	bfqq = bfqd->in_service_queue;

3168

++	if (bfqq != NULL)

3169

++		__bfq_bfqq_expire(bfqd, bfqq);

3170

++

3171

++	/*

3172

++	 * Loop through classes, and be careful to leave the scheduler

3173

++	 * in a consistent state, as feedback mechanisms and vtime

3174

++	 * updates cannot be disabled during the process.

3175

++	 */

3176

++	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {

3177

++		st = bfq_entity_service_tree(&bfqq->entity);

3178

++

3179

++		dispatched += __bfq_forced_dispatch_bfqq(bfqq);

3180

++		bfqq->max_budget = bfq_max_budget(bfqd);

3181

++

3182

++		bfq_forget_idle(st);

3183

++	}

3184

++

3185

++	BUG_ON(bfqd->busy_queues != 0);

3186

++

3187

++	return dispatched;

3188

++}

3189

++

3190

++static int bfq_dispatch_requests(struct request_queue *q, int force)

3191

++{

3192

++	struct bfq_data *bfqd = q->elevator->elevator_data;

3193

++	struct bfq_queue *bfqq;

3194

++	int max_dispatch;

3195

++

3196

++	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

3197

++	if (bfqd->busy_queues == 0)

3198

++		return 0;

3199

++

3200

++	if (unlikely(force))

3201

++		return bfq_forced_dispatch(bfqd);

3202

++

3203

++	bfqq = bfq_select_queue(bfqd);

3204

++	if (bfqq == NULL)

3205

++		return 0;

3206

++

3207

++	max_dispatch = bfqd->bfq_quantum;

3208

++	if (bfq_class_idle(bfqq))

3209

++		max_dispatch = 1;

3210

++

3211

++	if (!bfq_bfqq_sync(bfqq))

3212

++		max_dispatch = bfqd->bfq_max_budget_async_rq;

3213

++

3214

++	if (bfqq->dispatched >= max_dispatch) {

3215

++		if (bfqd->busy_queues > 1)

3216

++			return 0;

3217

++		if (bfqq->dispatched >= 4 * max_dispatch)

3218

++			return 0;

3219

++	}

3220

++

3221

++	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))

3222

++		return 0;

3223

++

3224

++	bfq_clear_bfqq_wait_request(bfqq);

3225

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

3226

++

3227

++	if (!bfq_dispatch_request(bfqd, bfqq))

3228

++		return 0;

3229

++

3230

++	bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",

3231

++			bfqq->pid, max_dispatch);

3232

++

3233

++	return 1;

3234

++}

3235

++

3236

++/*

3237

++ * Task holds one reference to the queue, dropped when task exits.  Each rq

3238

++ * in-flight on this queue also holds a reference, dropped when rq is freed.

3239

++ *

3240

++ * Queue lock must be held here.

3241

++ */

3242

++static void bfq_put_queue(struct bfq_queue *bfqq)

3243

++{

3244

++	struct bfq_data *bfqd = bfqq->bfqd;

3245

++

3246

++	BUG_ON(atomic_read(&bfqq->ref) <= 0);

3247

++

3248

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,

3249

++		     atomic_read(&bfqq->ref));

3250

++	if (!atomic_dec_and_test(&bfqq->ref))

3251

++		return;

3252

++

3253

++	BUG_ON(rb_first(&bfqq->sort_list) != NULL);

3254

++	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);

3255

++	BUG_ON(bfqq->entity.tree != NULL);

3256

++	BUG_ON(bfq_bfqq_busy(bfqq));

3257

++	BUG_ON(bfqd->in_service_queue == bfqq);

3258

++

3259

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);

3260

++

3261

++	kmem_cache_free(bfq_pool, bfqq);

3262

++}

3263

++

3264

++static void bfq_put_cooperator(struct bfq_queue *bfqq)

3265

++{

3266

++	struct bfq_queue *__bfqq, *next;

3267

++

3268

++	/*

3269

++	 * If this queue was scheduled to merge with another queue, be

3270

++	 * sure to drop the reference taken on that queue (and others in

3271

++	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.

3272

++	 */

3273

++	__bfqq = bfqq->new_bfqq;

3274

++	while (__bfqq) {

3275

++		if (__bfqq == bfqq) {

3276

++			WARN(1, "bfqq->new_bfqq loop detected.\n");

3277

++			break;

3278

++		}

3279

++		next = __bfqq->new_bfqq;

3280

++		bfq_put_queue(__bfqq);

3281

++		__bfqq = next;

3282

++	}

3283

++}

3284

++

3285

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

3286

++{

3287

++	if (bfqq == bfqd->in_service_queue) {

3288

++		__bfq_bfqq_expire(bfqd, bfqq);

3289

++		bfq_schedule_dispatch(bfqd);

3290

++	}

3291

++

3292

++	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,

3293

++		     atomic_read(&bfqq->ref));

3294

++

3295

++	bfq_put_cooperator(bfqq);

3296

++

3297

++	bfq_put_queue(bfqq);

3298

++}

3299

++

3300

++static void bfq_init_icq(struct io_cq *icq)

3301

++{

3302

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3303

++

3304

++	bic->ttime.last_end_request = jiffies;

3305

++}

3306

++

3307

++static void bfq_exit_icq(struct io_cq *icq)

3308

++{

3309

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3310

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

3311

++

3312

++	if (bic->bfqq[BLK_RW_ASYNC]) {

3313

++		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);

3314

++		bic->bfqq[BLK_RW_ASYNC] = NULL;

3315

++	}

3316

++

3317

++	if (bic->bfqq[BLK_RW_SYNC]) {

3318

++		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

3319

++		bic->bfqq[BLK_RW_SYNC] = NULL;

3320

++	}

3321

++}

3322

++

3323

++/*

3324

++ * Update the entity prio values; note that the new values will not

3325

++ * be used until the next (re)activation.

3326

++ */

3327

++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

3328

++{

3329

++	struct task_struct *tsk = current;

3330

++	int ioprio_class;

3331

++

3332

++	if (!bfq_bfqq_prio_changed(bfqq))

3333

++		return;

3334

++

3335

++	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

3336

++	switch (ioprio_class) {

3337

++	default:

3338

++		dev_err(bfqq->bfqd->queue->backing_dev_info.dev,

3339

++			"bfq: bad prio %x\n", ioprio_class);

3340

++	case IOPRIO_CLASS_NONE:

3341

++		/*

3342

++		 * No prio set, inherit CPU scheduling settings.

3343

++		 */

3344

++		bfqq->entity.new_ioprio = task_nice_ioprio(tsk);

3345

++		bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);

3346

++		break;

3347

++	case IOPRIO_CLASS_RT:

3348

++		bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

3349

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;

3350

++		break;

3351

++	case IOPRIO_CLASS_BE:

3352

++		bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

3353

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;

3354

++		break;

3355

++	case IOPRIO_CLASS_IDLE:

3356

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;

3357

++		bfqq->entity.new_ioprio = 7;

3358

++		bfq_clear_bfqq_idle_window(bfqq);

3359

++		break;

3360

++	}

3361

++

3362

++	bfqq->entity.ioprio_changed = 1;

3363

++

3364

++	/*

3365

++	 * Keep track of original prio settings in case we have to temporarily

3366

++	 * elevate the priority of this queue.

3367

++	 */

3368

++	bfqq->org_ioprio = bfqq->entity.new_ioprio;

3369

++	bfq_clear_bfqq_prio_changed(bfqq);

3370

++}

3371

++

3372

++static void bfq_changed_ioprio(struct bfq_io_cq *bic)

3373

++{

3374

++	struct bfq_data *bfqd;

3375

++	struct bfq_queue *bfqq, *new_bfqq;

3376

++	struct bfq_group *bfqg;

3377

++	unsigned long uninitialized_var(flags);

3378

++	int ioprio = bic->icq.ioc->ioprio;

3379

++

3380

++	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

3381

++				   &flags);

3382

++	/*

3383

++	 * This condition may trigger on a newly created bic, be sure to drop

3384

++	 * the lock before returning.

3385

++	 */

3386

++	if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))

3387

++		goto out;

3388

++

3389

++	bfqq = bic->bfqq[BLK_RW_ASYNC];

3390

++	if (bfqq != NULL) {

3391

++		bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,

3392

++				    sched_data);

3393

++		new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,

3394

++					 GFP_ATOMIC);

3395

++		if (new_bfqq != NULL) {

3396

++			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;

3397

++			bfq_log_bfqq(bfqd, bfqq,

3398

++				     "changed_ioprio: bfqq %p %d",

3399

++				     bfqq, atomic_read(&bfqq->ref));

3400

++			bfq_put_queue(bfqq);

3401

++		}

3402

++	}

3403

++

3404

++	bfqq = bic->bfqq[BLK_RW_SYNC];

3405

++	if (bfqq != NULL)

3406

++		bfq_mark_bfqq_prio_changed(bfqq);

3407

++

3408

++	bic->ioprio = ioprio;

3409

++

3410

++out:

3411

++	bfq_put_bfqd_unlock(bfqd, &flags);

3412

++}

3413

++

3414

++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

3415

++			  pid_t pid, int is_sync)

3416

++{

3417

++	RB_CLEAR_NODE(&bfqq->entity.rb_node);

3418

++	INIT_LIST_HEAD(&bfqq->fifo);

3419

++

3420

++	atomic_set(&bfqq->ref, 0);

3421

++	bfqq->bfqd = bfqd;

3422

++

3423

++	bfq_mark_bfqq_prio_changed(bfqq);

3424

++

3425

++	if (is_sync) {

3426

++		if (!bfq_class_idle(bfqq))

3427

++			bfq_mark_bfqq_idle_window(bfqq);

3428

++		bfq_mark_bfqq_sync(bfqq);

3429

++	}

3430

++

3431

++	/* Tentative initial value to trade off between thr and lat */

3432

++	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;

3433

++	bfqq->pid = pid;

3434

++

3435

++	bfqq->raising_coeff = 1;

3436

++	bfqq->last_rais_start_finish = 0;

3437

++	/*

3438

++	 * Set to the value for which bfqq will not be deemed as

3439

++	 * soft rt when it becomes backlogged.

3440

++	 */

3441

++	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);

3442

++}

3443

++

3444

++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,

3445

++					      struct bfq_group *bfqg,

3446

++					      int is_sync,

3447

++					      struct bfq_io_cq *bic,

3448

++					      gfp_t gfp_mask)

3449

++{

3450

++	struct bfq_queue *bfqq, *new_bfqq = NULL;

3451

++

3452

++retry:

3453

++	/* bic always exists here */

3454

++	bfqq = bic_to_bfqq(bic, is_sync);

3455

++

3456

++	/*

3457

++	 * Always try a new alloc if we fall back to the OOM bfqq

3458

++	 * originally, since it should just be a temporary situation.

3459

++	 */

3460

++	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {

3461

++		bfqq = NULL;

3462

++		if (new_bfqq != NULL) {

3463

++			bfqq = new_bfqq;

3464

++			new_bfqq = NULL;

3465

++		} else if (gfp_mask & __GFP_WAIT) {

3466

++			spin_unlock_irq(bfqd->queue->queue_lock);

3467

++			new_bfqq = kmem_cache_alloc_node(bfq_pool,

3468

++					gfp_mask | __GFP_ZERO,

3469

++					bfqd->queue->node);

3470

++			spin_lock_irq(bfqd->queue->queue_lock);

3471

++			if (new_bfqq != NULL)

3472

++				goto retry;

3473

++		} else {

3474

++			bfqq = kmem_cache_alloc_node(bfq_pool,

3475

++					gfp_mask | __GFP_ZERO,

3476

++					bfqd->queue->node);

3477

++		}

3478

++

3479

++		if (bfqq != NULL) {

3480

++			bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);

3481

++			bfq_log_bfqq(bfqd, bfqq, "allocated");

3482

++		} else {

3483

++			bfqq = &bfqd->oom_bfqq;

3484

++			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");

3485

++		}

3486

++

3487

++		bfq_init_prio_data(bfqq, bic);

3488

++		bfq_init_entity(&bfqq->entity, bfqg);

3489

++	}

3490

++

3491

++	if (new_bfqq != NULL)

3492

++		kmem_cache_free(bfq_pool, new_bfqq);

3493

++

3494

++	return bfqq;

3495

++}

3496

++

3497

++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,

3498

++					       struct bfq_group *bfqg,

3499

++					       int ioprio_class, int ioprio)

3500

++{

3501

++	switch (ioprio_class) {

3502

++	case IOPRIO_CLASS_RT:

3503

++		return &bfqg->async_bfqq[0][ioprio];

3504

++	case IOPRIO_CLASS_NONE:

3505

++		ioprio = IOPRIO_NORM;

3506

++		/* fall through */

3507

++	case IOPRIO_CLASS_BE:

3508

++		return &bfqg->async_bfqq[1][ioprio];

3509

++	case IOPRIO_CLASS_IDLE:

3510

++		return &bfqg->async_idle_bfqq;

3511

++	default:

3512

++		BUG();

3513

++	}

3514

++}

3515

++

3516

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

3517

++				       struct bfq_group *bfqg, int is_sync,

3518

++				       struct bfq_io_cq *bic, gfp_t gfp_mask)

3519

++{

3520

++	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

3521

++	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

3522

++	struct bfq_queue **async_bfqq = NULL;

3523

++	struct bfq_queue *bfqq = NULL;

3524

++

3525

++	if (!is_sync) {

3526

++		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,

3527

++						  ioprio);

3528

++		bfqq = *async_bfqq;

3529

++	}

3530

++

3531

++	if (bfqq == NULL)

3532

++		bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

3533

++

3534

++	/*

3535

++	 * Pin the queue now that it's allocated, scheduler exit will prune it.

3536

++	 */

3537

++	if (!is_sync && *async_bfqq == NULL) {

3538

++		atomic_inc(&bfqq->ref);

3539

++		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",

3540

++			     bfqq, atomic_read(&bfqq->ref));

3541

++		*async_bfqq = bfqq;

3542

++	}

3543

++

3544

++	atomic_inc(&bfqq->ref);

3545

++	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,

3546

++		     atomic_read(&bfqq->ref));

3547

++	return bfqq;

3548

++}

3549

++

3550

++static void bfq_update_io_thinktime(struct bfq_data *bfqd,

3551

++				    struct bfq_io_cq *bic)

3552

++{

3553

++	unsigned long elapsed = jiffies - bic->ttime.last_end_request;

3554

++	unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);

3555

++

3556

++	bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;

3557

++	bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;

3558

++	bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /

3559

++				bic->ttime.ttime_samples;

3560

++}

3561

++

3562

++static void bfq_update_io_seektime(struct bfq_data *bfqd,

3563

++				   struct bfq_queue *bfqq,

3564

++				   struct request *rq)

3565

++{

3566

++	sector_t sdist;

3567

++	u64 total;

3568

++

3569

++	if (bfqq->last_request_pos < blk_rq_pos(rq))

3570

++		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;

3571

++	else

3572

++		sdist = bfqq->last_request_pos - blk_rq_pos(rq);

3573

++

3574

++	/*

3575

++	 * Don't allow the seek distance to get too large from the

3576

++	 * odd fragment, pagein, etc.

3577

++	 */

3578

++	if (bfqq->seek_samples == 0) /* first request, not really a seek */

3579

++		sdist = 0;

3580

++	else if (bfqq->seek_samples <= 60) /* second & third seek */

3581

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);

3582

++	else

3583

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);

3584

++

3585

++	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;

3586

++	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;

3587

++	total = bfqq->seek_total + (bfqq->seek_samples/2);

3588

++	do_div(total, bfqq->seek_samples);

3589

++	bfqq->seek_mean = (sector_t)total;

3590

++

3591

++	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,

3592

++			(u64)bfqq->seek_mean);

3593

++}

3594

++

3595

++/*

3596

++ * Disable idle window if the process thinks too long or seeks so much that

3597

++ * it doesn't matter.

3598

++ */

3599

++static void bfq_update_idle_window(struct bfq_data *bfqd,

3600

++				   struct bfq_queue *bfqq,

3601

++				   struct bfq_io_cq *bic)

3602

++{

3603

++	int enable_idle;

3604

++

3605

++	/* Don't idle for async or idle io prio class. */

3606

++	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

3607

++		return;

3608

++

3609

++	enable_idle = bfq_bfqq_idle_window(bfqq);

3610

++

3611

++	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

3612

++	    bfqd->bfq_slice_idle == 0 ||

3613

++		(bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&

3614

++			bfqq->raising_coeff == 1))

3615

++		enable_idle = 0;

3616

++	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {

3617

++		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&

3618

++			bfqq->raising_coeff == 1)

3619

++			enable_idle = 0;

3620

++		else

3621

++			enable_idle = 1;

3622

++	}

3623

++	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",

3624

++		enable_idle);

3625

++

3626

++	if (enable_idle)

3627

++		bfq_mark_bfqq_idle_window(bfqq);

3628

++	else

3629

++		bfq_clear_bfqq_idle_window(bfqq);

3630

++}

3631

++

3632

++/*

3633

++ * Called when a new fs request (rq) is added to bfqq.  Check if there's

3634

++ * something we should do about it.

3635

++ */

3636

++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

3637

++			    struct request *rq)

3638

++{

3639

++	struct bfq_io_cq *bic = RQ_BIC(rq);

3640

++

3641

++	if (rq->cmd_flags & REQ_META)

3642

++		bfqq->meta_pending++;

3643

++

3644

++	bfq_update_io_thinktime(bfqd, bic);

3645

++	bfq_update_io_seektime(bfqd, bfqq, rq);

3646

++	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

3647

++	    !BFQQ_SEEKY(bfqq))

3648

++		bfq_update_idle_window(bfqd, bfqq, bic);

3649

++

3650

++	bfq_log_bfqq(bfqd, bfqq,

3651

++		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

3652

++		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),

3653

++		     (long long unsigned)bfqq->seek_mean);

3654

++

3655

++	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

3656

++

3657

++	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {

3658

++		int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&

3659

++				blk_rq_sectors(rq) < 32;

3660

++		int budget_timeout = bfq_bfqq_budget_timeout(bfqq);

3661

++

3662

++		/*

3663

++		 * There is just this request queued: if the request

3664

++		 * is small and the queue is not to be expired, then

3665

++		 * just exit.

3666

++		 *

3667

++		 * In this way, if the disk is being idled to wait for

3668

++		 * a new request from the in-service queue, we avoid

3669

++		 * unplugging the device and committing the disk to serve

3670

++		 * just a small request. On the contrary, we wait for

3671

++		 * the block layer to decide when to unplug the device:

3672

++		 * hopefully, new requests will be merged to this one

3673

++		 * quickly, then the device will be unplugged and

3674

++		 * larger requests will be dispatched.

3675

++		 */

3676

++		if (small_req && !budget_timeout)

3677

++			return;

3678

++

3679

++		/*

3680

++		 * A large enough request arrived, or the queue is to

3681

++		 * be expired: in both cases disk idling is to be

3682

++		 * stopped, so clear wait_request flag and reset

3683

++		 * timer.

3684

++		 */

3685

++		bfq_clear_bfqq_wait_request(bfqq);

3686

++		del_timer(&bfqd->idle_slice_timer);

3687

++

3688

++		/*

3689

++		 * The queue is not empty, because a new request just

3690

++		 * arrived. Hence we can safely expire the queue, in

3691

++		 * case of budget timeout, without risking that the

3692

++		 * timestamps of the queue are not updated correctly.

3693

++		 * See [1] for more details.

3694

++		 */

3695

++		if (budget_timeout)

3696

++			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);

3697

++

3698

++		/*

3699

++		 * Let the request rip immediately, or let a new queue be

3700

++		 * selected if bfqq has just been expired.

3701

++		 */

3702

++		__blk_run_queue(bfqd->queue);

3703

++	}

3704

++}

3705

++

3706

++static void bfq_insert_request(struct request_queue *q, struct request *rq)

3707

++{

3708

++	struct bfq_data *bfqd = q->elevator->elevator_data;

3709

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

3710

++

3711

++	assert_spin_locked(bfqd->queue->queue_lock);

3712

++	bfq_init_prio_data(bfqq, RQ_BIC(rq));

3713

++

3714

++	bfq_add_rq_rb(rq);

3715

++

3716

++	rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);

3717

++	list_add_tail(&rq->queuelist, &bfqq->fifo);

3718

++

3719

++	bfq_rq_enqueued(bfqd, bfqq, rq);

3720

++}

3721

++

3722

++static void bfq_update_hw_tag(struct bfq_data *bfqd)

3723

++{

3724

++	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,

3725

++				     bfqd->rq_in_driver);

3726

++

3727

++	if (bfqd->hw_tag == 1)

3728

++		return;

3729

++

3730

++	/*

3731

++	 * This sample is valid if the number of outstanding requests

3732

++	 * is large enough to allow a queueing behavior.  Note that the

3733

++	 * sum is not exact, as it's not taking into account deactivated

3734

++	 * requests.

3735

++	 */

3736

++	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)

3737

++		return;

3738

++

3739

++	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)

3740

++		return;

3741

++

3742

++	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;

3743

++	bfqd->max_rq_in_driver = 0;

3744

++	bfqd->hw_tag_samples = 0;

3745

++}

3746

++

3747

++static void bfq_completed_request(struct request_queue *q, struct request *rq)

3748

++{

3749

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

3750

++	struct bfq_data *bfqd = bfqq->bfqd;

3751

++	const int sync = rq_is_sync(rq);

3752

++

3753

++	bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",

3754

++			blk_rq_sectors(rq), sync);

3755

++

3756

++	bfq_update_hw_tag(bfqd);

3757

++

3758

++	WARN_ON(!bfqd->rq_in_driver);

3759

++	WARN_ON(!bfqq->dispatched);

3760

++	bfqd->rq_in_driver--;

3761

++	bfqq->dispatched--;

3762

++

3763

++	if (bfq_bfqq_sync(bfqq))

3764

++		bfqd->sync_flight--;

3765

++

3766

++	if (sync)

3767

++		RQ_BIC(rq)->ttime.last_end_request = jiffies;

3768

++

3769

++	/*

3770

++	 * If we are waiting to discover whether the request pattern of the

3771

++	 * task associated with the queue is actually isochronous, and

3772

++	 * both requisites for this condition to hold are satisfied, then

3773

++	 * compute soft_rt_next_start (see the comments to the function

3774

++	 * bfq_bfqq_softrt_next_start()).

3775

++	 */

3776

++	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&

3777

++	    RB_EMPTY_ROOT(&bfqq->sort_list))

3778

++		bfqq->soft_rt_next_start =

3779

++			bfq_bfqq_softrt_next_start(bfqd, bfqq);

3780

++

3781

++	/*

3782

++	 * If this is the in-service queue, check if it needs to be expired,

3783

++	 * or if we want to idle in case it has no pending requests.

3784

++	 */

3785

++	if (bfqd->in_service_queue == bfqq) {

3786

++		if (bfq_bfqq_budget_new(bfqq))

3787

++			bfq_set_budget_timeout(bfqd);

3788

++

3789

++		if (bfq_bfqq_must_idle(bfqq)) {

3790

++			bfq_arm_slice_timer(bfqd);

3791

++			goto out;

3792

++		} else if (bfq_may_expire_for_budg_timeout(bfqq))

3793

++			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);

3794

++		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&

3795

++			 (bfqq->dispatched == 0 ||

3796

++			  !bfq_bfqq_must_not_expire(bfqq)))

3797

++			bfq_bfqq_expire(bfqd, bfqq, 0,

3798

++					BFQ_BFQQ_NO_MORE_REQUESTS);

3799

++	}

3800

++

3801

++	if (!bfqd->rq_in_driver)

3802

++		bfq_schedule_dispatch(bfqd);

3803

++

3804

++out:

3805

++	return;

3806

++}

3807

++

3808

++static inline int __bfq_may_queue(struct bfq_queue *bfqq)

3809

++{

3810

++	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {

3811

++		bfq_clear_bfqq_must_alloc(bfqq);

3812

++		return ELV_MQUEUE_MUST;

3813

++	}

3814

++

3815

++	return ELV_MQUEUE_MAY;

3816

++}

3817

++

3818

++static int bfq_may_queue(struct request_queue *q, int rw)

3819

++{

3820

++	struct bfq_data *bfqd = q->elevator->elevator_data;

3821

++	struct task_struct *tsk = current;

3822

++	struct bfq_io_cq *bic;

3823

++	struct bfq_queue *bfqq;

3824

++

3825

++	/*

3826

++	 * Don't force setup of a queue from here, as a call to may_queue

3827

++	 * does not necessarily imply that a request actually will be queued.

3828

++	 * So just lookup a possibly existing queue, or return 'may queue'

3829

++	 * if that fails.

3830

++	 */

3831

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

3832

++	if (bic == NULL)

3833

++		return ELV_MQUEUE_MAY;

3834

++

3835

++	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));

3836

++	if (bfqq != NULL) {

3837

++		bfq_init_prio_data(bfqq, bic);

3838

++

3839

++		return __bfq_may_queue(bfqq);

3840

++	}

3841

++

3842

++	return ELV_MQUEUE_MAY;

3843

++}

3844

++

3845

++/*

3846

++ * Queue lock held here.

3847

++ */

3848

++static void bfq_put_request(struct request *rq)

3849

++{

3850

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

3851

++

3852

++	if (bfqq != NULL) {

3853

++		const int rw = rq_data_dir(rq);

3854

++

3855

++		BUG_ON(!bfqq->allocated[rw]);

3856

++		bfqq->allocated[rw]--;

3857

++

3858

++		rq->elv.priv[0] = NULL;

3859

++		rq->elv.priv[1] = NULL;

3860

++

3861

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",

3862

++			     bfqq, atomic_read(&bfqq->ref));

3863

++		bfq_put_queue(bfqq);

3864

++	}

3865

++}

3866

++

3867

++static struct bfq_queue *

3868

++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

3869

++		struct bfq_queue *bfqq)

3870

++{

3871

++	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

3872

++		(long unsigned)bfqq->new_bfqq->pid);

3873

++	bic_set_bfqq(bic, bfqq->new_bfqq, 1);

3874

++	bfq_mark_bfqq_coop(bfqq->new_bfqq);

3875

++	bfq_put_queue(bfqq);

3876

++	return bic_to_bfqq(bic, 1);

3877

++}

3878

++

3879

++/*

3880

++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

3881

++ * was the last process referring to said bfqq.

3882

++ */

3883

++static struct bfq_queue *

3884

++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

3885

++{

3886

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

3887

++	if (bfqq_process_refs(bfqq) == 1) {

3888

++		bfqq->pid = current->pid;

3889

++		bfq_clear_bfqq_coop(bfqq);

3890

++		bfq_clear_bfqq_split_coop(bfqq);

3891

++		return bfqq;

3892

++	}

3893

++

3894

++	bic_set_bfqq(bic, NULL, 1);

3895

++

3896

++	bfq_put_cooperator(bfqq);

3897

++

3898

++	bfq_put_queue(bfqq);

3899

++	return NULL;

3900

++}

3901

++

3902

++/*

3903

++ * Allocate bfq data structures associated with this request.
++ *
++ * Finds the bfq_queue for rq's bic and sync/async type (getting one
++ * through bfq_get_queue() when the bic has none or only the OOM fallback
++ * queue), applies any pending split or merge, then takes a queue
++ * reference for rq and stores bic and bfqq in rq->elv.priv[].
++ *
++ * Returns 0 on success, or 1 when there is no bic to attach rq to.

3904

++ */

3905

++static int bfq_set_request(struct request_queue *q, struct request *rq,

3906

++			   struct bio *bio, gfp_t gfp_mask)

3907

++{

3908

++	struct bfq_data *bfqd = q->elevator->elevator_data;

3909

++	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);

3910

++	const int rw = rq_data_dir(rq);

3911

++	const int is_sync = rq_is_sync(rq);

3912

++	struct bfq_queue *bfqq;

3913

++	struct bfq_group *bfqg;

3914

++	unsigned long flags;

3915

++

3916

++	might_sleep_if(gfp_mask & __GFP_WAIT);

3917

++
++	/*
++	 * bfq_changed_ioprio() dereferences bic unconditionally (it reads
++	 * bic->icq.ioc->ioprio in its initializer), so it must not be
++	 * called without a bic. The missing-bic case itself is handled
++	 * below, under the queue lock, by jumping to queue_fail.
++	 */
++	if (bic != NULL)

3918

++		bfq_changed_ioprio(bic);

3919

++

3920

++	spin_lock_irqsave(q->queue_lock, flags);

3921

++

3922

++	if (bic == NULL)

3923

++		goto queue_fail;

3924

++

3925

++	bfqg = bfq_bic_update_cgroup(bic);

3926

++

3927

++new_queue:

3928

++	bfqq = bic_to_bfqq(bic, is_sync);

3929

++	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {

3930

++		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

3931

++		bic_set_bfqq(bic, bfqq, is_sync);

3932

++	} else {

3933

++		/*

3934

++		 * If the queue was seeky for too long, break it apart.

3935

++		 */

3936

++		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

3937

++			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

3938

++			bfqq = bfq_split_bfqq(bic, bfqq);

3939

++			if (!bfqq)

3940

++				goto new_queue;

3941

++		}

3942

++

3943

++		/*

3944

++		 * Check to see if this queue is scheduled to merge with

3945

++		 * another closely cooperating queue. The merging of queues

3946

++		 * happens here as it must be done in process context.

3947

++		 * The reference on new_bfqq was taken in merge_bfqqs.

3948

++		 */

3949

++		if (bfqq->new_bfqq != NULL)

3950

++			bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);

3951

++	}

3952

++
++	/*
++	 * Account rq against the queue and pin the queue for it; both are
++	 * undone in bfq_put_request().
++	 */

3953

++	bfqq->allocated[rw]++;

3954

++	atomic_inc(&bfqq->ref);

3955

++	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,

3956

++		     atomic_read(&bfqq->ref));

3957

++

3958

++	rq->elv.priv[0] = bic;

3959

++	rq->elv.priv[1] = bfqq;

3960

++

3961

++	spin_unlock_irqrestore(q->queue_lock, flags);

3962

++

3963

++	return 0;

3964

++

3965

++queue_fail:

3966

++	bfq_schedule_dispatch(bfqd);

3967

++	spin_unlock_irqrestore(q->queue_lock, flags);

3968

++

3969

++	return 1;

3970

++}

3971

++

3972

++static void bfq_kick_queue(struct work_struct *work)

3973

++{

3974

++	struct bfq_data *bfqd =

3975

++		container_of(work, struct bfq_data, unplug_work);

3976

++	struct request_queue *q = bfqd->queue;

3977

++

3978

++	spin_lock_irq(q->queue_lock);

3979

++	__blk_run_queue(q);

3980

++	spin_unlock_irq(q->queue_lock);

3981

++}

3982

++

3983

++/*

3984

++ * Handler of the expiration of the timer running if the in-service queue

3985

++ * is idling inside its time slice.

3986

++ */

3987

++static void bfq_idle_slice_timer(unsigned long data)

3988

++{

3989

++	struct bfq_data *bfqd = (struct bfq_data *)data;

3990

++	struct bfq_queue *bfqq;

3991

++	unsigned long flags;

3992

++	enum bfqq_expiration reason;

3993

++

3994

++	spin_lock_irqsave(bfqd->queue->queue_lock, flags);

3995

++

3996

++	bfqq = bfqd->in_service_queue;

3997

++	/*

3998

++	 * Theoretical race here: the in-service queue can be NULL or different

3999

++	 * from the queue that was idling if the timer handler spins on

4000

++	 * the queue_lock and a new request arrives for the current

4001

++	 * queue and there is a full dispatch cycle that changes the

4002

++	 * in-service queue.  This can hardly happen, but in the worst case

4003

++	 * we just expire a queue too early.

4004

++	 */

4005

++	if (bfqq != NULL) {

4006

++		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");

4007

++		if (bfq_bfqq_budget_timeout(bfqq))

4008

++			/*

4009

++			 * Also here the queue can be safely expired

4010

++			 * for budget timeout without wasting

4011

++			 * guarantees

4012

++			 */

4013

++			reason = BFQ_BFQQ_BUDGET_TIMEOUT;

4014

++		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)

4015

++			/*

4016

++			 * The queue may not be empty upon timer expiration,

4017

++			 * because we may not disable the timer when the first

4018

++			 * request of the in-service queue arrives during

4019

++			 * disk idling

4020

++			 */

4021

++			reason = BFQ_BFQQ_TOO_IDLE;

4022

++		else

4023

++			goto schedule_dispatch;

4024

++

4025

++		bfq_bfqq_expire(bfqd, bfqq, 1, reason);

4026

++	}

4027

++

4028

++schedule_dispatch:

4029

++	bfq_schedule_dispatch(bfqd);

4030

++

4031

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);

4032

++}

4033

++

4034

++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)

4035

++{

4036

++	del_timer_sync(&bfqd->idle_slice_timer);

4037

++	cancel_work_sync(&bfqd->unplug_work);

4038

++}

4039

++

4040

++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,

4041

++					struct bfq_queue **bfqq_ptr)

4042

++{

4043

++	struct bfq_group *root_group = bfqd->root_group;

4044

++	struct bfq_queue *bfqq = *bfqq_ptr;

4045

++

4046

++	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);

4047

++	if (bfqq != NULL) {

4048

++		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);

4049

++		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",

4050

++			     bfqq, atomic_read(&bfqq->ref));

4051

++		bfq_put_queue(bfqq);

4052

++		*bfqq_ptr = NULL;

4053

++	}

4054

++}

4055

++

4056

++/*

4057

++ * Release all the bfqg references to its async queues.  If we are

4058

++ * deallocating the group these queues may still contain requests, so

4059

++ * we reparent them to the root cgroup (i.e., the only one that will

4060

++ * exist for sure until all the requests on a device are gone).

4061

++ */

4062

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)

4063

++{

4064

++	int i, j;

4065

++

4066

++	for (i = 0; i < 2; i++)

4067

++		for (j = 0; j < IOPRIO_BE_NR; j++)

4068

++			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

4069

++

4070

++	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);

4071

++}

4072

++

4073

++static void bfq_exit_queue(struct elevator_queue *e)

4074

++{

4075

++	struct bfq_data *bfqd = e->elevator_data;

4076

++	struct request_queue *q = bfqd->queue;

4077

++	struct bfq_queue *bfqq, *n;

4078

++

4079

++	bfq_shutdown_timer_wq(bfqd);

4080

++

4081

++	spin_lock_irq(q->queue_lock);

4082

++

4083

++	BUG_ON(bfqd->in_service_queue != NULL);

4084

++	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)

4085

++		bfq_deactivate_bfqq(bfqd, bfqq, 0);

4086

++

4087

++	bfq_disconnect_groups(bfqd);

4088

++	spin_unlock_irq(q->queue_lock);

4089

++

4090

++	bfq_shutdown_timer_wq(bfqd);

4091

++

4092

++	synchronize_rcu();

4093

++

4094

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

4095

++

4096

++	bfq_free_root_group(bfqd);

4097

++	kfree(bfqd);

4098

++}

4099

++

4100

++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

4101

++{

4102

++	struct bfq_group *bfqg;

4103

++	struct bfq_data *bfqd;

4104

++	struct elevator_queue *eq;

4105

++

4106

++	eq = elevator_alloc(q, e);

4107

++	if (eq == NULL)

4108

++		return -ENOMEM;

4109

++

4110

++	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);

4111

++	if (bfqd == NULL) {

4112

++		kobject_put(&eq->kobj);

4113

++		return -ENOMEM;

4114

++	}

4115

++	eq->elevator_data = bfqd;

4116

++

4117

++	/*

4118

++	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.

4119

++	 * Grab a permanent reference to it, so that the normal code flow

4120

++	 * will not attempt to free it.

4121

++	 */

4122

++	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);

4123

++	atomic_inc(&bfqd->oom_bfqq.ref);

4124

++

4125

++	bfqd->queue = q;

4126

++

4127

++	spin_lock_irq(q->queue_lock);

4128

++	q->elevator = eq;

4129

++	spin_unlock_irq(q->queue_lock);

4130

++

4131

++	bfqg = bfq_alloc_root_group(bfqd, q->node);

4132

++	if (bfqg == NULL) {

4133

++		kfree(bfqd);

4134

++		kobject_put(&eq->kobj);

4135

++		return -ENOMEM;

4136

++	}

4137

++

4138

++	bfqd->root_group = bfqg;

4139

++

4140

++	init_timer(&bfqd->idle_slice_timer);

4141

++	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

4142

++	bfqd->idle_slice_timer.data = (unsigned long)bfqd;

4143

++

4144

++	bfqd->rq_pos_tree = RB_ROOT;

4145

++

4146

++	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);

4147

++

4148

++	INIT_LIST_HEAD(&bfqd->active_list);

4149

++	INIT_LIST_HEAD(&bfqd->idle_list);

4150

++

4151

++	bfqd->hw_tag = -1;

4152

++

4153

++	bfqd->bfq_max_budget = bfq_default_max_budget;

4154

++

4155

++	bfqd->bfq_quantum = bfq_quantum;

4156

++	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];

4157

++	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];

4158

++	bfqd->bfq_back_max = bfq_back_max;

4159

++	bfqd->bfq_back_penalty = bfq_back_penalty;

4160

++	bfqd->bfq_slice_idle = bfq_slice_idle;

4161

++	bfqd->bfq_class_idle_last_service = 0;

4162

++	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;

4163

++	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;

4164

++	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

4165

++

4166

++	bfqd->low_latency = true;

4167

++

4168

++	bfqd->bfq_raising_coeff = 20;

4169

++	bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);

4170

++	bfqd->bfq_raising_max_time = 0;

4171

++	bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);

4172

++	bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);

4173

++	bfqd->bfq_raising_max_softrt_rate = 7000; /*

4174

++						   * Approximate rate required

4175

++						   * to playback or record a

4176

++						   * high-definition compressed

4177

++						   * video.

4178

++						   */

4179

++	bfqd->raised_busy_queues = 0;

4180

++

4181

++	/* Initially estimate the device's peak rate as the reference rate */

4182

++	if (blk_queue_nonrot(bfqd->queue)) {

4183

++		bfqd->RT_prod = R_nonrot * T_nonrot;

4184

++		bfqd->peak_rate = R_nonrot;

4185

++	} else {

4186

++		bfqd->RT_prod = R_rot * T_rot;

4187

++		bfqd->peak_rate = R_rot;

4188

++	}

4189

++

4190

++	return 0;

4191

++}

4192

++

4193

++/* Destroy the bfq_queue slab cache, if bfq_pool has been set up. */
++static void bfq_slab_kill(void)
++{
++	if (!bfq_pool)
++		return;
++
++	kmem_cache_destroy(bfq_pool);
++}

4198

++

4199

++/*
++ * Create the slab cache used for struct bfq_queue and store it in bfq_pool.
++ * Returns 0 on success, -ENOMEM if the cache could not be created.
++ */
++static int __init bfq_slab_setup(void)
++{
++	bfq_pool = KMEM_CACHE(bfq_queue, 0);
++
++	return bfq_pool ? 0 : -ENOMEM;
++}

4206

++

4207

++/*
++ * Print @var as a decimal number followed by a newline into the sysfs
++ * buffer @page.  Returns the number of characters written.
++ */
++static ssize_t bfq_var_show(unsigned int var, char *page)
++{
++	/* @var is unsigned: "%d" would show values above INT_MAX as negative. */
++	return sprintf(page, "%u\n", var);
++}

4211

++

4212

++/*
++ * Parse the decimal number in @page into *@var.
++ *
++ * Returns @count when the text was parsed successfully.  When kstrtoul()
++ * rejects the input, *@var is left untouched and the negative error code
++ * is returned, so that a malformed write is reported to the writer instead
++ * of being acknowledged as a success.
++ */
++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
++{
++	unsigned long new_val;
++	int ret = kstrtoul(page, 10, &new_val);
++
++	if (ret)
++		return ret;
++
++	*var = new_val;
++
++	return count;
++}

4222

++

4223

++/*
++ * Show the maximum weight-raising time in milliseconds.  When
++ * bfq_raising_max_time is not positive, the value returned by
++ * bfq_wrais_duration() is shown instead.
++ */
++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
++{
++	struct bfq_data *bfqd = e->elevator_data;
++	unsigned int msecs;
++
++	if (bfqd->bfq_raising_max_time > 0)
++		msecs = jiffies_to_msecs(bfqd->bfq_raising_max_time);
++	else
++		msecs = jiffies_to_msecs(bfq_wrais_duration(bfqd));
++
++	/* jiffies_to_msecs() yields an unsigned value: print it with "%u". */
++	return sprintf(page, "%u\n", msecs);
++}

4230

++

4231

++/*
++ * Dump the total number of queued requests followed by one line per queue
++ * on the active and idle lists (pid, weight, queued requests and
++ * weight-raising timing) into the sysfs buffer @page.
++ *
++ * The lists are walked under the request queue lock.  The number of
++ * queues is not bounded, while a sysfs buffer is a single page, so every
++ * write is limited to the space left in the page with scnprintf(); output
++ * that does not fit is truncated instead of overrunning the buffer.
++ *
++ * Returns the number of characters written.
++ */
++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
++{
++	struct bfq_queue *bfqq;
++	struct bfq_data *bfqd = e->elevator_data;
++	ssize_t num_char = 0;
++
++	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
++			      "Tot reqs queued %d\n\n", bfqd->queued);
++
++	spin_lock_irq(bfqd->queue->queue_lock);
++
++	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
++			      "Active:\n");
++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
++		/* Both durations are unsigned millisecond counts. */
++		num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
++				      "pid%d: weight %hu, nr_queued %d %d,"
++				      " dur %u/%u\n",
++				      bfqq->pid,
++				      bfqq->entity.weight,
++				      bfqq->queued[0],
++				      bfqq->queued[1],
++				      jiffies_to_msecs(jiffies -
++					bfqq->last_rais_start_finish),
++				      jiffies_to_msecs(bfqq->raising_cur_max_time));
++	}
++
++	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
++			      "Idle:\n");
++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
++		num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
++				      "pid%d: weight %hu, dur %u/%u\n",
++				      bfqq->pid,
++				      bfqq->entity.weight,
++				      jiffies_to_msecs(jiffies -
++					bfqq->last_rais_start_finish),
++				      jiffies_to_msecs(bfqq->raising_cur_max_time));
++	}
++
++	spin_unlock_irq(bfqd->queue->queue_lock);
++
++	return num_char;
++}

4271

++

4272

++/*
++ * SHOW_FUNCTION - generate the sysfs "show" handler __FUNC.
++ *
++ * The handler reads __VAR (an expression that may use the local bfqd),
++ * converts it from jiffies to milliseconds when __CONV is non-zero and
++ * prints the result through bfq_var_show().
++ */
++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
++static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
++{									\
++	struct bfq_data *bfqd = e->elevator_data;			\
++	unsigned int __val = __VAR;					\
++									\
++	return bfq_var_show((__CONV) ? jiffies_to_msecs(__val) : __val,	\
++			    (page));					\
++}
++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
++	1);
++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
++	bfqd->bfq_raising_min_inter_arr_async,
++	1);
++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
++	bfqd->bfq_raising_max_softrt_rate, 0);
++#undef SHOW_FUNCTION

4302

++

4303

++/*
++ * STORE_FUNCTION - generate the sysfs "store" handler __FUNC.
++ *
++ * The handler parses the written text with bfq_var_store(), clamps the
++ * value to [MIN, MAX] and stores it through __PTR, converting it from
++ * milliseconds to jiffies when __CONV is non-zero.
++ *
++ * bfq_var_store() does not touch its output when the text cannot be
++ * parsed, so __data is seeded with the current setting (in the same unit
++ * as the user-visible value): a malformed write can then never store
++ * uninitialized stack contents.  A negative return value from the helper
++ * is passed back to the writer without modifying the setting.
++ */
++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
++static ssize_t								\
++__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
++{									\
++	struct bfq_data *bfqd = e->elevator_data;			\
++	unsigned long __data = (__CONV) ?				\
++		jiffies_to_msecs(*(__PTR)) : *(__PTR);			\
++	int ret = bfq_var_store(&__data, (page), count);		\
++	if (ret < 0)							\
++		return ret;						\
++	if (__data < (MIN))						\
++		__data = (MIN);						\
++	else if (__data > (MAX))					\
++		__data = (MAX);						\
++	if (__CONV)							\
++		*(__PTR) = msecs_to_jiffies(__data);			\
++	else								\
++		*(__PTR) = __data;					\
++	return ret;							\
++}
++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
++		INT_MAX, 1);
++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
++		INT_MAX, 1);
++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
++		INT_MAX, 0);
++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
++		1, INT_MAX, 0);
++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
++		INT_MAX, 1);
++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
++		INT_MAX, 0);
++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
++		INT_MAX, 1);
++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
++		INT_MAX, 1);
++STORE_FUNCTION(bfq_raising_min_idle_time_store,
++	       &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
++		&bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
++	       &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
++#undef STORE_FUNCTION

4346

++

4347

++/*
++ * Store handler of the "weights" attribute: for the moment the written
++ * data is ignored and the whole write is reported as consumed.
++ */
++static ssize_t bfq_weights_store(struct elevator_queue *e,
++				 const char *page, size_t count)
++{
++	return count;
++}

4353

++

4354

++/*
++ * Return the maximum budget to use when it is not set by the user: the
++ * value computed by bfq_calc_max_budget() from the peak rate and the sync
++ * timeout (in ms) once at least BFQ_PEAK_RATE_SAMPLES samples have been
++ * collected, bfq_default_max_budget otherwise.
++ */
++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
++{
++	u64 timeout;
++
++	if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
++		return bfq_default_max_budget;
++
++	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
++
++	return bfq_calc_max_budget(bfqd->peak_rate, timeout);
++}

4363

++

4364

++/*
++ * Store handler of the "max_budget" attribute.
++ *
++ * Writing 0 selects the estimated budget (bfq_estimated_max_budget());
++ * any other value is used directly, capped at INT_MAX.  The value is also
++ * recorded in bfq_user_max_budget.
++ *
++ * bfq_var_store() does not touch its output when the text cannot be
++ * parsed, so __data starts from the current user setting rather than
++ * from uninitialized stack contents; a negative return value from the
++ * helper is passed back without changing anything.
++ */
++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
++				    const char *page, size_t count)
++{
++	struct bfq_data *bfqd = e->elevator_data;
++	unsigned long __data = bfqd->bfq_user_max_budget;
++	int ret = bfq_var_store(&__data, (page), count);
++
++	if (ret < 0)
++		return ret;
++
++	if (__data == 0)
++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
++	else {
++		if (__data > INT_MAX)
++			__data = INT_MAX;
++		bfqd->bfq_max_budget = __data;
++	}
++
++	bfqd->bfq_user_max_budget = __data;
++
++	return ret;
++}

4383

++

4384

++/*
++ * Store handler of the "timeout_sync" attribute (milliseconds, clamped to
++ * [1, INT_MAX]).  When the user has not set a maximum budget
++ * (bfq_user_max_budget == 0) the budget estimate is refreshed, since it
++ * is computed from this timeout.
++ *
++ * bfq_var_store() does not touch its output when the text cannot be
++ * parsed, so __data starts from the current timeout rather than from
++ * uninitialized stack contents; a negative return value from the helper
++ * is passed back without changing anything.
++ */
++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
++				      const char *page, size_t count)
++{
++	struct bfq_data *bfqd = e->elevator_data;
++	unsigned long __data = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
++	int ret = bfq_var_store(&__data, (page), count);
++
++	if (ret < 0)
++		return ret;
++
++	if (__data < 1)
++		__data = 1;
++	else if (__data > INT_MAX)
++		__data = INT_MAX;
++
++	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
++	if (bfqd->bfq_user_max_budget == 0)
++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
++
++	return ret;
++}

4402

++

4403

++/*
++ * Store handler of the "low_latency" attribute.  Any value above 1 is
++ * treated as 1.  Switching from enabled to disabled calls
++ * bfq_end_raising() before the new value is recorded.
++ *
++ * bfq_var_store() does not touch its output when the text cannot be
++ * parsed, so __data starts from the current setting rather than from
++ * uninitialized stack contents; a negative return value from the helper
++ * is passed back without changing anything.
++ */
++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
++				     const char *page, size_t count)
++{
++	struct bfq_data *bfqd = e->elevator_data;
++	unsigned long __data = bfqd->low_latency;
++	int ret = bfq_var_store(&__data, (page), count);
++
++	if (ret < 0)
++		return ret;
++
++	if (__data > 1)
++		__data = 1;
++	if (__data == 0 && bfqd->low_latency != 0)
++		bfq_end_raising(bfqd);
++	bfqd->low_latency = __data;
++
++	return ret;
++}

4418

++

4419

++/*
++ * BFQ_ATTR(name) - describe the attribute "name": readable by everybody,
++ * writable by the owner, served by bfq_<name>_show()/bfq_<name>_store().
++ */
++#define BFQ_ATTR(name)							\
++	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
++
++/* Attributes of the scheduler, handed to the elevator core via iosched_bfq. */
++static struct elv_fs_entry bfq_attrs[] = {
++	/* dispatch and request-expiration parameters */
++	BFQ_ATTR(quantum),
++	BFQ_ATTR(fifo_expire_sync),
++	BFQ_ATTR(fifo_expire_async),
++	BFQ_ATTR(back_seek_max),
++	BFQ_ATTR(back_seek_penalty),
++	BFQ_ATTR(slice_idle),
++	/* budgets and timeouts */
++	BFQ_ATTR(max_budget),
++	BFQ_ATTR(max_budget_async_rq),
++	BFQ_ATTR(timeout_sync),
++	BFQ_ATTR(timeout_async),
++	/* low-latency mode and weight raising */
++	BFQ_ATTR(low_latency),
++	BFQ_ATTR(raising_coeff),
++	BFQ_ATTR(raising_max_time),
++	BFQ_ATTR(raising_rt_max_time),
++	BFQ_ATTR(raising_min_idle_time),
++	BFQ_ATTR(raising_min_inter_arr_async),
++	BFQ_ATTR(raising_max_softrt_rate),
++	/* per-queue dump */
++	BFQ_ATTR(weights),
++	__ATTR_NULL
++};

4443

++

4444

++/*
++ * Elevator descriptor registered by bfq_init(): the scheduler hooks, the
++ * size/alignment of the per-io-context data, the sysfs attributes and the
++ * scheduler name.
++ */
++static struct elevator_type iosched_bfq = {
++	.ops = {
++		/* merging */
++		.elevator_merge_fn		= bfq_merge,
++		.elevator_merged_fn		= bfq_merged_request,
++		.elevator_merge_req_fn		= bfq_merged_requests,
++		.elevator_allow_merge_fn	= bfq_allow_merge,
++		/* request flow */
++		.elevator_dispatch_fn		= bfq_dispatch_requests,
++		.elevator_add_req_fn		= bfq_insert_request,
++		.elevator_activate_req_fn	= bfq_activate_request,
++		.elevator_deactivate_req_fn	= bfq_deactivate_request,
++		.elevator_completed_req_fn	= bfq_completed_request,
++		.elevator_former_req_fn		= elv_rb_former_request,
++		.elevator_latter_req_fn		= elv_rb_latter_request,
++		/* io-context and request setup/teardown */
++		.elevator_init_icq_fn		= bfq_init_icq,
++		.elevator_exit_icq_fn		= bfq_exit_icq,
++		.elevator_set_req_fn		= bfq_set_request,
++		.elevator_put_req_fn		= bfq_put_request,
++		.elevator_may_queue_fn		= bfq_may_queue,
++		/* per-queue scheduler setup/teardown */
++		.elevator_init_fn		= bfq_init_queue,
++		.elevator_exit_fn		= bfq_exit_queue,
++	},
++	.icq_size	= sizeof(struct bfq_io_cq),
++	.icq_align	= __alignof__(struct bfq_io_cq),
++	.elevator_attrs	= bfq_attrs,
++	.elevator_name	= "bfq",
++	.elevator_owner	= THIS_MODULE,
++};

4471

++

4472

++/*
++ * Module entry point: fix up the default parameters, create the
++ * bfq_queue slab cache and register the scheduler with the elevator core.
++ *
++ * Returns 0 on success, -ENOMEM if the slab cache cannot be created, or
++ * the error returned by elv_register(); in the latter case the slab cache
++ * is destroyed again so that a failed load leaves nothing behind.
++ */
++static int __init bfq_init(void)
++{
++	int ret;
++
++	/*
++	 * Can be 0 on HZ < 1000 setups.
++	 */
++	if (bfq_slice_idle == 0)
++		bfq_slice_idle = 1;
++
++	if (bfq_timeout_async == 0)
++		bfq_timeout_async = 1;
++
++	if (bfq_slab_setup())
++		return -ENOMEM;
++
++	ret = elv_register(&iosched_bfq);
++	if (ret) {
++		/* Registration failed: do not leak the slab cache. */
++		bfq_slab_kill();
++		return ret;
++	}
++
++	/* Terminate the message so it is not merged with the next log line. */
++	pr_info("BFQ I/O-scheduler version: v7r2\n");
++
++	return 0;
++}

4491

++

4492

++/*
++ * Module exit point: unregister the scheduler first, then destroy the
++ * bfq_queue slab cache.
++ */
++static void __exit bfq_exit(void)
++{
++	elv_unregister(&iosched_bfq);
++
++	bfq_slab_kill();
++}

4497

++

4498

++module_init(bfq_init);
++module_exit(bfq_exit);
++
++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
++/*
++ * Declare the license: without it a modular build is treated as
++ * proprietary, which taints the kernel and prevents the use of
++ * GPL-only exported symbols.
++ * NOTE(review): "GPL" is assumed from the CFQ ancestry of this code;
++ * confirm against the license statement at the top of bfq-iosched.c.
++ */
++MODULE_LICENSE("GPL");

4502

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

4503

+new file mode 100644

4504

+index 0000000..999b475

4505

+--- /dev/null

4506

++++ b/block/bfq-sched.c

4507

+@@ -0,0 +1,1078 @@

4508

++/*

4509

++ * BFQ: Hierarchical B-WF2Q+ scheduler.

4510

++ *

4511

++ * Based on ideas and code from CFQ:

4512

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

4513

++ *

4514

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

4515

++ *		      Paolo Valente <paolo.valente@×××××××.it>

4516

++ *

4517

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

4518

++ */

4519

++

4520

++#ifdef CONFIG_CGROUP_BFQIO

4521

++/*
++ * Walk from @entity up through its ancestors via the ->parent pointers.
++ * @entity itself is used as the cursor and is NULL when the walk ends.
++ */
++#define for_each_entity(entity)	\
++	for (; (entity) != NULL; (entity) = (entity)->parent)
++
++/*
++ * Same walk, but the parent is saved in @parent before the loop body
++ * runs, so the body does not need @entity->parent to stay valid.
++ */
++#define for_each_entity_safe(entity, parent) \
++	for (; (entity) && ({ (parent) = (entity)->parent; 1; }); \
++	     (entity) = (parent))
++
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
++						 int extract,
++						 struct bfq_data *bfqd);

4530

++

4531

++/*
++ * Copy the budget of @next_in_service to the entity representing the
++ * group that owns its scheduler data, if that group has one.
++ */
++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
++{
++	struct bfq_group *group;
++	struct bfq_entity *group_entity;
++
++	BUG_ON(next_in_service == NULL);
++
++	group = container_of(next_in_service->sched_data,
++			     struct bfq_group, sched_data);
++
++	/*
++	 * bfq_group's my_entity field is not NULL only if the group
++	 * is not the root group. We must not touch the root entity
++	 * as it must never become an in-service entity.
++	 */
++	group_entity = group->my_entity;
++	if (group_entity == NULL)
++		return;
++
++	group_entity->budget = next_in_service->budget;
++}

4551

++

4552

++/*
++ * Recompute @sd->next_in_service and propagate its budget to the owning
++ * group entity.
++ *
++ * Returns 0, without touching anything, while an entity of @sd is in
++ * service; returns 1 after the next-in-service entity has been looked up
++ * again (even if the lookup found none).
++ */
++static int bfq_update_next_in_service(struct bfq_sched_data *sd)
++{
++	struct bfq_entity *candidate;
++
++	/* will update/requeue at the end of service */
++	if (sd->in_service_entity != NULL)
++		return 0;
++
++	/*
++	 * NOTE: this can be improved in many ways, such as returning
++	 * 1 (and thus propagating upwards the update) only when the
++	 * budget changes, or caching the bfqq that will be scheduled
++	 * next from this subtree.  By now we worry more about
++	 * correctness than about performance...
++	 */
++	candidate = bfq_lookup_next_entity(sd, 0, NULL);
++	sd->next_in_service = candidate;
++	if (candidate != NULL)
++		bfq_update_budget(candidate);
++
++	return 1;
++}

4575

++

4576

++/* Sanity check: @entity must be the next-in-service entity of @sd. */
++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
++					     struct bfq_entity *entity)
++{
++	BUG_ON(entity != sd->next_in_service);
++}

4581

++#else

4582

++/*
++ * Variants used when CONFIG_CGROUP_BFQIO is not set: the loops run at
++ * most once, for @entity itself, and the helpers below do nothing.
++ */
++#define for_each_entity(entity)	\
++	for (; (entity) != NULL; (entity) = NULL)
++
++#define for_each_entity_safe(entity, parent) \
++	for ((parent) = NULL; (entity) != NULL; (entity) = (parent))
++
++/* Nothing to recompute: always reports that no update took place. */
++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
++{
++	return 0;
++}
++
++/* No check is performed in this configuration. */
++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
++					     struct bfq_entity *entity)
++{
++}
++
++/* No budget is propagated in this configuration. */
++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
++{
++}

4601

++#endif

4602

++

4603

++/*

4604

++ * Shift for timestamp calculations.  This actually limits the maximum

4605

++ * service allowed in one timestamp delta (small shift values increase it),

4606

++ * the maximum total weight that can be used for the queues in the system

4607

++ * (big shift values increase it), and the period of virtual time wraparounds.

4608

++ */

4609

++#define WFQ_SERVICE_SHIFT	22

4610

++

4611

++/**
++ * bfq_gt - compare two timestamps.
++ * @a: first ts.
++ * @b: second ts.
++ *
++ * Return @a > @b, dealing with wrapping correctly: the comparison is done
++ * on the sign of the difference rather than on the raw values.
++ */
++static inline int bfq_gt(u64 a, u64 b)
++{
++	s64 diff = (s64)(a - b);
++
++	return diff > 0;
++}

4622

++

4623

++/*
++ * Return the bfq_queue embedding @entity, or NULL when @entity has its
++ * own scheduler data (my_sched_data set) and is therefore not embedded in
++ * a queue.  @entity must not be NULL.
++ */
++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
++{
++	BUG_ON(entity == NULL);
++
++	if (entity->my_sched_data != NULL)
++		return NULL;
++
++	return container_of(entity, struct bfq_queue, entity);
++}

4634

++

4635

++

4636

++/**
++ * bfq_delta - map service into the virtual time domain.
++ * @service: amount of service.
++ * @weight: scale factor (weight of an entity or weight sum).
++ *
++ * Return @service shifted left by WFQ_SERVICE_SHIFT and divided by
++ * @weight.
++ */
++static inline u64 bfq_delta(unsigned long service,
++					unsigned long weight)
++{
++	u64 vdelta = (u64)service << WFQ_SERVICE_SHIFT;
++
++	do_div(vdelta, weight);
++
++	return vdelta;
++}

4649

++

4650

++/**
++ * bfq_calc_finish - assign the finish time to an entity.
++ * @entity: the entity to act upon.
++ * @service: the service to be charged to the entity.
++ *
++ * The finish time is the start time of @entity plus @service mapped into
++ * virtual time with the weight of @entity, which must not be zero.
++ */
++static inline void bfq_calc_finish(struct bfq_entity *entity,
++				   unsigned long service)
++{
++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++	u64 delta;
++
++	BUG_ON(entity->weight == 0);
++
++	delta = bfq_delta(service, entity->weight);
++	entity->finish = entity->start + delta;
++
++	/* Only entities that belong to a queue are logged. */
++	if (bfqq == NULL)
++		return;
++
++	bfq_log_bfqq(bfqq->bfqd, bfqq,
++		"calc_finish: serv %lu, w %d",
++		service, entity->weight);
++	bfq_log_bfqq(bfqq->bfqd, bfqq,
++		"calc_finish: start %llu, finish %llu, delta %llu",
++		entity->start, entity->finish, delta);
++}

4675

++

4676

++/**
++ * bfq_entity_of - get an entity from a node.
++ * @node: the node field of the entity.
++ *
++ * Convert a node pointer to the relative entity, mapping a %NULL node to
++ * a %NULL entity.  This is used only to simplify the logic of some
++ * functions and not as the generic conversion mechanism because, e.g.,
++ * in the tree walking functions, the check for a %NULL value would be
++ * redundant.
++ */
++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
++{
++	if (node == NULL)
++		return NULL;
++
++	return rb_entry(node, struct bfq_entity, rb_node);
++}

4694

++

4695

++/**
++ * bfq_extract - remove an entity from a tree.
++ * @root: the tree root.
++ * @entity: the entity to remove.
++ *
++ * @entity must currently be linked into @root; on return it belongs to
++ * no tree.
++ */
++static inline void bfq_extract(struct rb_root *root,
++			       struct bfq_entity *entity)
++{
++	BUG_ON(entity->tree != root);
++
++	rb_erase(&entity->rb_node, root);
++	entity->tree = NULL;
++}

4708

++

4709

++/**
++ * bfq_idle_extract - extract an entity from the idle tree.
++ * @st: the service tree of the owning @entity.
++ * @entity: the entity being removed.
++ *
++ * Besides unlinking @entity from the idle tree, keep the cached first and
++ * last idle entities of @st up to date and, if @entity belongs to a
++ * queue, unlink that queue from the list it is on.
++ */
++static void bfq_idle_extract(struct bfq_service_tree *st,
++			     struct bfq_entity *entity)
++{
++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++
++	BUG_ON(entity->tree != &st->idle);
++
++	/* The neighbours must be looked up while @entity is still linked. */
++	if (st->first_idle == entity)
++		st->first_idle = bfq_entity_of(rb_next(&entity->rb_node));
++
++	if (st->last_idle == entity)
++		st->last_idle = bfq_entity_of(rb_prev(&entity->rb_node));
++
++	bfq_extract(&st->idle, entity);
++
++	if (bfqq != NULL)
++		list_del(&bfqq->bfqq_list);
++}

4737

++

4738

++/**
++ * bfq_insert - generic tree insertion.
++ * @root: tree root.
++ * @entity: entity to insert.
++ *
++ * This is used for the idle and the active tree, since they are both
++ * ordered by finish time.  @entity must not belong to any tree yet; an
++ * entity whose finish time is not smaller than that of an existing node
++ * is placed to the right of it.
++ */
++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
++{
++	struct rb_node **link = &root->rb_node;
++	struct rb_node *parent = NULL;
++
++	BUG_ON(entity->tree != NULL);
++
++	/* Descend to the empty slot where @entity has to be linked. */
++	while (*link != NULL) {
++		struct bfq_entity *cur;
++
++		parent = *link;
++		cur = rb_entry(parent, struct bfq_entity, rb_node);
++
++		link = bfq_gt(cur->finish, entity->finish) ?
++			&parent->rb_left : &parent->rb_right;
++	}
++
++	rb_link_node(&entity->rb_node, parent, link);
++	rb_insert_color(&entity->rb_node, root);
++
++	entity->tree = root;
++}

4769

++

4770

++/**
++ * bfq_update_min - update the min_start field of an entity.
++ * @entity: the entity to update.
++ * @node: one of its children.
++ *
++ * This function is called when @entity may store an invalid value for
++ * min_start due to updates to the active tree.  The function assumes
++ * that the subtree rooted at @node (which may be its left or its right
++ * child) has a valid min_start value.  A %NULL @node is ignored.
++ */
++static inline void bfq_update_min(struct bfq_entity *entity,
++				  struct rb_node *node)
++{
++	struct bfq_entity *child;
++
++	if (node == NULL)
++		return;
++
++	child = rb_entry(node, struct bfq_entity, rb_node);
++	if (bfq_gt(entity->min_start, child->min_start))
++		entity->min_start = child->min_start;
++}

4791

++

4792

++/**
++ * bfq_update_active_node - recalculate min_start.
++ * @node: the node to update.
++ *
++ * @node may have changed position or one of its children may have moved,
++ * this function updates its min_start value.  The left and right subtrees
++ * are assumed to hold a correct min_start value.
++ */
++static inline void bfq_update_active_node(struct rb_node *node)
++{
++	struct bfq_entity *self = rb_entry(node, struct bfq_entity, rb_node);
++
++	/* Start from the node's own start time, then fold in the children. */
++	self->min_start = self->start;
++
++	bfq_update_min(self, node->rb_right);
++	bfq_update_min(self, node->rb_left);
++}

4808

++

4809

++/**
++ * bfq_update_active_tree - update min_start for the whole active tree.
++ * @node: the starting node.
++ *
++ * @node must be the deepest modified node after an update.  This function
++ * updates its min_start using the values held by its children, assuming
++ * that they did not change, and then updates all the nodes that may have
++ * changed in the path to the root.  The only nodes that may have changed
++ * are the ones in the path or their siblings.
++ */
++static void bfq_update_active_tree(struct rb_node *node)
++{
++	struct rb_node *parent;
++
++	for (;;) {
++		bfq_update_active_node(node);
++
++		parent = rb_parent(node);
++		if (parent == NULL)
++			break;
++
++		/*
++		 * Refresh the sibling of @node.  When @node is the left
++		 * child and there is no right child, the second branch is
++		 * taken and @node itself is refreshed once more.
++		 */
++		if (node == parent->rb_left && parent->rb_right != NULL)
++			bfq_update_active_node(parent->rb_right);
++		else if (parent->rb_left != NULL)
++			bfq_update_active_node(parent->rb_left);
++
++		node = parent;
++	}
++}

4838

++

4839

++/**
++ * bfq_active_insert - insert an entity in the active tree of its group/device.
++ * @st: the service tree of the entity.
++ * @entity: the entity being inserted.
++ *
++ * The active tree is ordered by finish time, but an extra key is kept
++ * per each node, containing the minimum value for the start times of
++ * its children (and the node itself), so it's possible to search for
++ * the eligible node with the lowest finish time in logarithmic time.
++ *
++ * If @entity belongs to a queue, the queue is also added to the active
++ * list of its device.
++ */
++static void bfq_active_insert(struct bfq_service_tree *st,
++			      struct bfq_entity *entity)
++{
++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++	struct rb_node *start = &entity->rb_node;
++
++	bfq_insert(&st->active, entity);
++
++	/* Begin the min_start update from a child of the new node, if any. */
++	if (start->rb_left != NULL)
++		start = start->rb_left;
++	else if (start->rb_right != NULL)
++		start = start->rb_right;
++
++	bfq_update_active_tree(start);
++
++	if (bfqq != NULL)
++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
++}

4867

++

4868

++/**
++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
++ * @ioprio: the ioprio value to convert.
++ *
++ * Return IOPRIO_BE_NR - @ioprio; a warning is emitted when @ioprio is
++ * outside [0, IOPRIO_BE_NR).
++ */
++static unsigned short bfq_ioprio_to_weight(int ioprio)
++{
++	WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
++
++	return IOPRIO_BE_NR - ioprio;
++}

4877

++

4878

++/**
++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
++ * @weight: the weight value to convert.
++ *
++ * To preserve as much as possible the old only-ioprio user interface,
++ * 0 is used as an escape ioprio value for weights (numerically) equal or
++ * larger than IOPRIO_BE_NR.  A warning is emitted when @weight is outside
++ * [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT].
++ */
++static unsigned short bfq_weight_to_ioprio(int weight)
++{
++	int ioprio;
++
++	WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
++
++	ioprio = IOPRIO_BE_NR - weight;
++	if (ioprio < 0)
++		ioprio = 0;
++
++	return ioprio;
++}

4891

++

4892

++/*
++ * Take a reference on the queue @entity belongs to, and log the new
++ * reference count.  Nothing is done for entities that do not belong to a
++ * queue.
++ */
++static inline void bfq_get_entity(struct bfq_entity *entity)
++{
++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++
++	if (bfqq != NULL) {
++		atomic_inc(&bfqq->ref);
++		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
++			     bfqq, atomic_read(&bfqq->ref));
++	}
++}

4904

++

4905

++/**
++ * bfq_find_deepest - find the deepest node that an extraction can modify.
++ * @node: the node being removed.
++ *
++ * Do the first step of an extraction in an rb tree, looking for the
++ * node that will replace @node, and returning the deepest node that
++ * the following modifications to the tree can touch.  If @node is the
++ * last node in the tree return %NULL.
++ */
++static struct rb_node *bfq_find_deepest(struct rb_node *node)
++{
++	struct rb_node *succ;
++
++	/* Leaf: only the parent (NULL for a lone root) can be affected. */
++	if (node->rb_right == NULL && node->rb_left == NULL)
++		return rb_parent(node);
++
++	/* A single child takes the place of @node. */
++	if (node->rb_right == NULL)
++		return node->rb_left;
++	if (node->rb_left == NULL)
++		return node->rb_right;
++
++	/* Two children: the in-order successor replaces @node. */
++	succ = rb_next(node);
++	if (succ->rb_right != NULL)
++		return succ->rb_right;
++	if (rb_parent(succ) != node)
++		return rb_parent(succ);
++
++	return succ;
++}

4934

++

4935

++/**
++ * bfq_active_extract - remove an entity from the active tree.
++ * @st: the service_tree containing the tree.
++ * @entity: the entity being removed.
++ *
++ * After the removal the min_start values are refreshed starting from the
++ * deepest node the extraction can have modified; if @entity belongs to a
++ * queue, the queue is unlinked from the list it is on.
++ */
++static void bfq_active_extract(struct bfq_service_tree *st,
++			       struct bfq_entity *entity)
++{
++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++	/* Must be computed while @entity is still linked in the tree. */
++	struct rb_node *deepest = bfq_find_deepest(&entity->rb_node);
++
++	bfq_extract(&st->active, entity);
++
++	if (deepest != NULL)
++		bfq_update_active_tree(deepest);
++
++	if (bfqq != NULL)
++		list_del(&bfqq->bfqq_list);
++}

4955

++

4956

++/**
++ * bfq_idle_insert - insert an entity into the idle tree.
++ * @st: the service tree containing the tree.
++ * @entity: the entity to insert.
++ *
++ * The cached first and last idle entities of @st are updated when
++ * @entity finishes before the former or after the latter; if @entity
++ * belongs to a queue, the queue is added to the idle list of its device.
++ */
++static void bfq_idle_insert(struct bfq_service_tree *st,
++			    struct bfq_entity *entity)
++{
++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++	struct bfq_entity *oldest = st->first_idle;
++	struct bfq_entity *newest = st->last_idle;
++
++	if (oldest == NULL || bfq_gt(oldest->finish, entity->finish))
++		st->first_idle = entity;
++
++	if (newest == NULL || bfq_gt(entity->finish, newest->finish))
++		st->last_idle = entity;
++
++	bfq_insert(&st->idle, entity);
++
++	if (bfqq != NULL)
++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
++}

4978

++

4979

++/**
++ * bfq_forget_entity - remove an entity from the wfq trees.
++ * @st: the service tree.
++ * @entity: the entity being removed.
++ *
++ * Update the device status and forget everything about @entity, putting
++ * the device reference to it, if it is a queue.  Entities belonging to
++ * groups are not refcounted.
++ *
++ * @entity must be marked as being on a service tree (on_st set); its
++ * weight is subtracted from the weight sum of @st.
++ */
++static void bfq_forget_entity(struct bfq_service_tree *st,
++			      struct bfq_entity *entity)
++{
++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++
++	BUG_ON(!entity->on_st);
++
++	entity->on_st = 0;
++	st->wsum -= entity->weight;
++	if (bfqq != NULL) {
++		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
++			     bfqq, atomic_read(&bfqq->ref));
++		bfq_put_queue(bfqq);
++	}
++}

5005

++

5006

++/**
++ * bfq_put_idle_entity - release the idle tree ref of an entity.
++ * @st: service tree for the entity.
++ * @entity: the entity being released.
++ *
++ * The entity is first taken out of the idle tree and then forgotten by
++ * the service tree.
++ */
++static void bfq_put_idle_entity(struct bfq_service_tree *st,
++				struct bfq_entity *entity)
++{
++	bfq_idle_extract(st, entity);
++
++	bfq_forget_entity(st, entity);
++}

5017

++

5018

++/**
++ * bfq_forget_idle - update the idle tree if necessary.
++ * @st: the service tree to act upon.
++ *
++ * To preserve the global O(log N) complexity we only remove one entry here;
++ * as the idle tree will not grow indefinitely this can be done safely.
++ */
++static void bfq_forget_idle(struct bfq_service_tree *st)
++{
++	struct bfq_entity *oldest = st->first_idle;
++	struct bfq_entity *newest = st->last_idle;
++	int no_active = RB_EMPTY_ROOT(&st->active);
++
++	/*
++	 * Forget the whole idle tree, increasing the vtime past
++	 * the last finish time of idle entities.
++	 */
++	if (no_active && newest != NULL &&
++	    !bfq_gt(newest->finish, st->vtime))
++		st->vtime = newest->finish;
++
++	/* Drop the first idle entity once the vtime has reached its finish. */
++	if (oldest != NULL && !bfq_gt(oldest->finish, st->vtime))
++		bfq_put_idle_entity(st, oldest);
++}

5042

++

5043

++static struct bfq_service_tree *

5044

++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,

5045

++			 struct bfq_entity *entity)

5046

++{

5047

++	struct bfq_service_tree *new_st = old_st;

5048

++

5049

++	if (entity->ioprio_changed) {

5050

++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5051

++

5052

++		BUG_ON(old_st->wsum < entity->weight);

5053

++		old_st->wsum -= entity->weight;

5054

++

5055

++		if (entity->new_weight != entity->orig_weight) {

5056

++			entity->orig_weight = entity->new_weight;

5057

++			entity->ioprio =

5058

++				bfq_weight_to_ioprio(entity->orig_weight);

5059

++		} else if (entity->new_ioprio != entity->ioprio) {

5060

++			entity->ioprio = entity->new_ioprio;

5061

++			entity->orig_weight =

5062

++					bfq_ioprio_to_weight(entity->ioprio);

5063

++		} else

5064

++			entity->new_weight = entity->orig_weight =

5065

++				bfq_ioprio_to_weight(entity->ioprio);

5066

++

5067

++		entity->ioprio_class = entity->new_ioprio_class;

5068

++		entity->ioprio_changed = 0;

5069

++

5070

++		/*

5071

++		 * NOTE: here we may be changing the weight too early,

5072

++		 * this will cause unfairness.  The correct approach

5073

++		 * would have required additional complexity to defer

5074

++		 * weight changes to the proper time instants (i.e.,

5075

++		 * when entity->finish <= old_st->vtime).

5076

++		 */

5077

++		new_st = bfq_entity_service_tree(entity);

5078

++		entity->weight = entity->orig_weight *

5079

++			(bfqq != NULL ? bfqq->raising_coeff : 1);

5080

++		new_st->wsum += entity->weight;

5081

++

5082

++		if (new_st != old_st)

5083

++			entity->start = new_st->vtime;

5084

++	}

5085

++

5086

++	return new_st;

5087

++}

5088

++

5089

++/**

5090

++ * bfq_bfqq_served - update the scheduler status after selection for service.

5091

++ * @bfqq: the queue being served.

5092

++ * @served: bytes to transfer.

5093

++ *

5094

++ * NOTE: this can be optimized, as the timestamps of upper level entities

5095

++ * are synchronized every time a new bfqq is selected for service.  By now,

5096

++ * we keep it to better check consistency.

5097

++ */

5098

++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)

5099

++{

5100

++	struct bfq_entity *entity = &bfqq->entity;

5101

++	struct bfq_service_tree *st;

5102

++

5103

++	for_each_entity(entity) {

5104

++		st = bfq_entity_service_tree(entity);

5105

++

5106

++		entity->service += served;

5107

++		BUG_ON(entity->service > entity->budget);

5108

++		BUG_ON(st->wsum == 0);

5109

++

5110

++		st->vtime += bfq_delta(served, st->wsum);

5111

++		bfq_forget_idle(st);

5112

++	}

5113

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);

5114

++}

5115

++

5116

++/**

5117

++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.

5118

++ * @bfqq: the queue that needs a service update.

5119

++ *

5120

++ * When it's not possible to be fair in the service domain, because

5121

++ * a queue is not consuming its budget fast enough (the meaning of

5122

++ * fast depends on the timeout parameter), we charge it a full

5123

++ * budget.  In this way we should obtain a sort of time-domain

5124

++ * fairness among all the seeky/slow queues.

5125

++ */

5126

++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)

5127

++{

5128

++	struct bfq_entity *entity = &bfqq->entity;

5129

++

5130

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");

5131

++

5132

++	bfq_bfqq_served(bfqq, entity->budget - entity->service);

5133

++}

5134

++

5135

++/**

5136

++ * __bfq_activate_entity - activate an entity.

5137

++ * @entity: the entity being activated.

5138

++ *

5139

++ * Called whenever an entity is activated, i.e., it is not active and one

5140

++ * of its children receives a new request, or has to be reactivated due to

5141

++ * budget exhaustion.  It uses the current budget of the entity (and the

5142

++ * service received if @entity is active) of the queue to calculate its

5143

++ * timestamps.

5144

++ */

5145

++static void __bfq_activate_entity(struct bfq_entity *entity)

5146

++{

5147

++	struct bfq_sched_data *sd = entity->sched_data;

5148

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

5149

++

5150

++	if (entity == sd->in_service_entity) {

5151

++		BUG_ON(entity->tree != NULL);

5152

++		/*

5153

++		 * If we are requeueing the current entity we have

5154

++		 * to take care of not charging to it service it has

5155

++		 * not received.

5156

++		 */

5157

++		bfq_calc_finish(entity, entity->service);

5158

++		entity->start = entity->finish;

5159

++		sd->in_service_entity = NULL;

5160

++	} else if (entity->tree == &st->active) {

5161

++		/*

5162

++		 * Requeueing an entity due to a change of some

5163

++		 * next_in_service entity below it.  We reuse the

5164

++		 * old start time.

5165

++		 */

5166

++		bfq_active_extract(st, entity);

5167

++	} else if (entity->tree == &st->idle) {

5168

++		/*

5169

++		 * Must be on the idle tree, bfq_idle_extract() will

5170

++		 * check for that.

5171

++		 */

5172

++		bfq_idle_extract(st, entity);

5173

++		entity->start = bfq_gt(st->vtime, entity->finish) ?

5174

++				       st->vtime : entity->finish;

5175

++	} else {

5176

++		/*

5177

++		 * The finish time of the entity may be invalid, and

5178

++		 * it is in the past for sure, otherwise the queue

5179

++		 * would have been on the idle tree.

5180

++		 */

5181

++		entity->start = st->vtime;

5182

++		st->wsum += entity->weight;

5183

++		bfq_get_entity(entity);

5184

++

5185

++		BUG_ON(entity->on_st);

5186

++		entity->on_st = 1;

5187

++	}

5188

++

5189

++	st = __bfq_entity_update_weight_prio(st, entity);

5190

++	bfq_calc_finish(entity, entity->budget);

5191

++	bfq_active_insert(st, entity);

5192

++}

5193

++

5194

++/**

5195

++ * bfq_activate_entity - activate an entity and its ancestors if necessary.

5196

++ * @entity: the entity to activate.

5197

++ *

5198

++ * Activate @entity and all the entities on the path from it to the root.

5199

++ */

5200

++static void bfq_activate_entity(struct bfq_entity *entity)

5201

++{

5202

++	struct bfq_sched_data *sd;

5203

++

5204

++	for_each_entity(entity) {

5205

++		__bfq_activate_entity(entity);

5206

++

5207

++		sd = entity->sched_data;

5208

++		if (!bfq_update_next_in_service(sd))

5209

++			/*

5210

++			 * No need to propagate the activation to the

5211

++			 * upper entities, as they will be updated when

5212

++			 * the in-service entity is rescheduled.

5213

++			 */

5214

++			break;

5215

++	}

5216

++}

5217

++

5218

++/**

5219

++ * __bfq_deactivate_entity - deactivate an entity from its service tree.

5220

++ * @entity: the entity to deactivate.

5221

++ * @requeue: if false, the entity will not be put into the idle tree.

5222

++ *

5223

++ * Deactivate an entity, independently from its previous state.  If the

5224

++ * entity was not on a service tree just return, otherwise if it is on

5225

++ * any scheduler tree, extract it from that tree, and if necessary

5226

++ * and if the caller did specify @requeue, put it on the idle tree.

5227

++ *

5228

++ * Return %1 if the caller should update the entity hierarchy, i.e.,

5229

++ * if the entity was under service or if it was the next_in_service for

5230

++ * its sched_data; return %0 otherwise.

5231

++ */

5232

++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

5233

++{

5234

++	struct bfq_sched_data *sd = entity->sched_data;

5235

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

5236

++	int was_in_service = entity == sd->in_service_entity;

5237

++	int ret = 0;

5238

++

5239

++	if (!entity->on_st)

5240

++		return 0;

5241

++

5242

++	BUG_ON(was_in_service && entity->tree != NULL);

5243

++

5244

++	if (was_in_service) {

5245

++		bfq_calc_finish(entity, entity->service);

5246

++		sd->in_service_entity = NULL;

5247

++	} else if (entity->tree == &st->active)

5248

++		bfq_active_extract(st, entity);

5249

++	else if (entity->tree == &st->idle)

5250

++		bfq_idle_extract(st, entity);

5251

++	else if (entity->tree != NULL)

5252

++		BUG();

5253

++

5254

++	if (was_in_service || sd->next_in_service == entity)

5255

++		ret = bfq_update_next_in_service(sd);

5256

++

5257

++	if (!requeue || !bfq_gt(entity->finish, st->vtime))

5258

++		bfq_forget_entity(st, entity);

5259

++	else

5260

++		bfq_idle_insert(st, entity);

5261

++

5262

++	BUG_ON(sd->in_service_entity == entity);

5263

++	BUG_ON(sd->next_in_service == entity);

5264

++

5265

++	return ret;

5266

++}

5267

++

5268

++/**

5269

++ * bfq_deactivate_entity - deactivate an entity.

5270

++ * @entity: the entity to deactivate.

5271

++ * @requeue: true if the entity can be put on the idle tree

5272

++ */

5273

++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

5274

++{

5275

++	struct bfq_sched_data *sd;

5276

++	struct bfq_entity *parent;

5277

++

5278

++	for_each_entity_safe(entity, parent) {

5279

++		sd = entity->sched_data;

5280

++

5281

++		if (!__bfq_deactivate_entity(entity, requeue))

5282

++			/*

5283

++			 * The parent entity is still backlogged, and

5284

++			 * we don't need to update it as it is still

5285

++			 * under service.

5286

++			 */

5287

++			break;

5288

++

5289

++		if (sd->next_in_service != NULL)

5290

++			/*

5291

++			 * The parent entity is still backlogged and

5292

++			 * the budgets on the path towards the root

5293

++			 * need to be updated.

5294

++			 */

5295

++			goto update;

5296

++

5297

++		/*

5298

++		 * If we reach here the parent is no longer backlogged and

5299

++		 * we want to propagate the dequeue upwards.

5300

++		 */

5301

++		requeue = 1;

5302

++	}

5303

++

5304

++	return;

5305

++

5306

++update:

5307

++	entity = parent;

5308

++	for_each_entity(entity) {

5309

++		__bfq_activate_entity(entity);

5310

++

5311

++		sd = entity->sched_data;

5312

++		if (!bfq_update_next_in_service(sd))

5313

++			break;

5314

++	}

5315

++}

5316

++

5317

++/**

5318

++ * bfq_update_vtime - update vtime if necessary.

5319

++ * @st: the service tree to act upon.

5320

++ *

5321

++ * If necessary update the service tree vtime to have at least one

5322

++ * eligible entity, skipping to its start time.  Assumes that the

5323

++ * active tree of the device is not empty.

5324

++ *

5325

++ * NOTE: this hierarchical implementation updates vtimes quite often,

5326

++ * we may end up with reactivated tasks getting timestamps after a

5327

++ * vtime skip done because we needed a ->first_active entity on some

5328

++ * intermediate node.

5329

++ */

5330

++static void bfq_update_vtime(struct bfq_service_tree *st)

5331

++{

5332

++	struct bfq_entity *entry;

5333

++	struct rb_node *node = st->active.rb_node;

5334

++

5335

++	entry = rb_entry(node, struct bfq_entity, rb_node);

5336

++	if (bfq_gt(entry->min_start, st->vtime)) {

5337

++		st->vtime = entry->min_start;

5338

++		bfq_forget_idle(st);

5339

++	}

5340

++}

5341

++

5342

++/**

5343

++ * bfq_first_active_entity - find the eligible entity with

5344

++ *                           the smallest finish time

5345

++ * @st: the service tree to select from.

5346

++ *

5347

++ * This function searches the first schedulable entity, starting from the

5348

++ * root of the tree and going on the left every time on this side there is

5349

++ * a subtree with at least one eligible (start <= vtime) entity.  The path

5350

++ * on the right is followed only if a) the left subtree contains no eligible

5351

++ * entities and b) no eligible entity has been found yet.

5352

++ */

5353

++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)

5354

++{

5355

++	struct bfq_entity *entry, *first = NULL;

5356

++	struct rb_node *node = st->active.rb_node;

5357

++

5358

++	while (node != NULL) {

5359

++		entry = rb_entry(node, struct bfq_entity, rb_node);

5360

++left:

5361

++		if (!bfq_gt(entry->start, st->vtime))

5362

++			first = entry;

5363

++

5364

++		BUG_ON(bfq_gt(entry->min_start, st->vtime));

5365

++

5366

++		if (node->rb_left != NULL) {

5367

++			entry = rb_entry(node->rb_left,

5368

++					 struct bfq_entity, rb_node);

5369

++			if (!bfq_gt(entry->min_start, st->vtime)) {

5370

++				node = node->rb_left;

5371

++				goto left;

5372

++			}

5373

++		}

5374

++		if (first != NULL)

5375

++			break;

5376

++		node = node->rb_right;

5377

++	}

5378

++

5379

++	BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));

5380

++	return first;

5381

++}

5382

++

5383

++/**

5384

++ * __bfq_lookup_next_entity - return the first eligible entity in @st.

5385

++ * @st: the service tree.

5386

++ *

5387

++ * Update the virtual time in @st and return the first eligible entity

5388

++ * it contains.

5389

++ */

5390

++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,

5391

++						   bool force)

5392

++{

5393

++	struct bfq_entity *entity, *new_next_in_service = NULL;

5394

++

5395

++	if (RB_EMPTY_ROOT(&st->active))

5396

++		return NULL;

5397

++

5398

++	bfq_update_vtime(st);

5399

++	entity = bfq_first_active_entity(st);

5400

++	BUG_ON(bfq_gt(entity->start, st->vtime));

5401

++

5402

++	/*

5403

++	 * If the chosen entity does not match with the sched_data's

5404

++	 * next_in_service and we are forcedly serving the IDLE priority

5405

++	 * class tree, bubble up budget update.

5406

++	 */

5407

++	if (unlikely(force && entity != entity->sched_data->next_in_service)) {

5408

++		new_next_in_service = entity;

5409

++		for_each_entity(new_next_in_service)

5410

++			bfq_update_budget(new_next_in_service);

5411

++	}

5412

++

5413

++	return entity;

5414

++}

5415

++

5416

++/**

5417

++ * bfq_lookup_next_entity - return the first eligible entity in @sd.

5418

++ * @sd: the sched_data.

5419

++ * @extract: if true the returned entity will be also extracted from @sd.

5420

++ *

5421

++ * NOTE: since we cache the next_in_service entity at each level of the

5422

++ * hierarchy, the complexity of the lookup can be decreased with

5423

++ * absolutely no effort just returning the cached next_in_service value;

5424

++ * we prefer to do full lookups to test the consistency of the data

5425

++ * structures.

5426

++ */

5427

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

5428

++						 int extract,

5429

++						 struct bfq_data *bfqd)

5430

++{

5431

++	struct bfq_service_tree *st = sd->service_tree;

5432

++	struct bfq_entity *entity;

5433

++	int i = 0;

5434

++

5435

++	BUG_ON(sd->in_service_entity != NULL);

5436

++

5437

++	if (bfqd != NULL &&

5438

++	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {

5439

++		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,

5440

++						  true);

5441

++		if (entity != NULL) {

5442

++			i = BFQ_IOPRIO_CLASSES - 1;

5443

++			bfqd->bfq_class_idle_last_service = jiffies;

5444

++			sd->next_in_service = entity;

5445

++		}

5446

++	}

5447

++	for (; i < BFQ_IOPRIO_CLASSES; i++) {

5448

++		entity = __bfq_lookup_next_entity(st + i, false);

5449

++		if (entity != NULL) {

5450

++			if (extract) {

5451

++				bfq_check_next_in_service(sd, entity);

5452

++				bfq_active_extract(st + i, entity);

5453

++				sd->in_service_entity = entity;

5454

++				sd->next_in_service = NULL;

5455

++			}

5456

++			break;

5457

++		}

5458

++	}

5459

++

5460

++	return entity;

5461

++}

5462

++

5463

++/*

5464

++ * Get next queue for service.

5465

++ */

5466

++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

5467

++{

5468

++	struct bfq_entity *entity = NULL;

5469

++	struct bfq_sched_data *sd;

5470

++	struct bfq_queue *bfqq;

5471

++

5472

++	BUG_ON(bfqd->in_service_queue != NULL);

5473

++

5474

++	if (bfqd->busy_queues == 0)

5475

++		return NULL;

5476

++

5477

++	sd = &bfqd->root_group->sched_data;

5478

++	for (; sd != NULL; sd = entity->my_sched_data) {

5479

++		entity = bfq_lookup_next_entity(sd, 1, bfqd);

5480

++		BUG_ON(entity == NULL);

5481

++		entity->service = 0;

5482

++	}

5483

++

5484

++	bfqq = bfq_entity_to_bfqq(entity);

5485

++	BUG_ON(bfqq == NULL);

5486

++

5487

++	return bfqq;

5488

++}

5489

++

5490

++/*

5491

++ * Forced extraction of the given queue.

5492

++ */

5493

++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,

5494

++				      struct bfq_queue *bfqq)

5495

++{

5496

++	struct bfq_entity *entity;

5497

++	struct bfq_sched_data *sd;

5498

++

5499

++	BUG_ON(bfqd->in_service_queue != NULL);

5500

++

5501

++	entity = &bfqq->entity;

5502

++	/*

5503

++	 * Bubble up extraction/update from the leaf to the root.

5504

++	*/

5505

++	for_each_entity(entity) {

5506

++		sd = entity->sched_data;

5507

++		bfq_update_budget(entity);

5508

++		bfq_update_vtime(bfq_entity_service_tree(entity));

5509

++		bfq_active_extract(bfq_entity_service_tree(entity), entity);

5510

++		sd->active_entity = entity;

5511

++		sd->next_active = NULL;

5512

++		entity->service = 0;

5513

++	}

5514

++

5515

++	return;

5516

++}

5517

++

5518

++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)

5519

++{

5520

++	if (bfqd->in_service_bic != NULL) {

5521

++		put_io_context(bfqd->in_service_bic->icq.ioc);

5522

++		bfqd->in_service_bic = NULL;

5523

++	}

5524

++

5525

++	bfqd->in_service_queue = NULL;

5526

++	del_timer(&bfqd->idle_slice_timer);

5527

++}

5528

++

5529

++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

5530

++				int requeue)

5531

++{

5532

++	struct bfq_entity *entity = &bfqq->entity;

5533

++

5534

++	if (bfqq == bfqd->in_service_queue)

5535

++		__bfq_bfqd_reset_in_service(bfqd);

5536

++

5537

++	bfq_deactivate_entity(entity, requeue);

5538

++}

5539

++

5540

++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

5541

++{

5542

++	struct bfq_entity *entity = &bfqq->entity;

5543

++

5544

++	bfq_activate_entity(entity);

5545

++}

5546

++

5547

++/*

5548

++ * Called when the bfqq no longer has requests pending, remove it from

5549

++ * the service tree.

5550

++ */

5551

++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,

5552

++			      int requeue)

5553

++{

5554

++	BUG_ON(!bfq_bfqq_busy(bfqq));

5555

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

5556

++

5557

++	bfq_log_bfqq(bfqd, bfqq, "del from busy");

5558

++

5559

++	bfq_clear_bfqq_busy(bfqq);

5560

++

5561

++	BUG_ON(bfqd->busy_queues == 0);

5562

++	bfqd->busy_queues--;

5563

++	if (bfqq->raising_coeff > 1)

5564

++		bfqd->raised_busy_queues--;

5565

++

5566

++	bfq_deactivate_bfqq(bfqd, bfqq, requeue);

5567

++}

5568

++

5569

++/*

5570

++ * Called when an inactive queue receives a new request.

5571

++ */

5572

++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)

5573

++{

5574

++	BUG_ON(bfq_bfqq_busy(bfqq));

5575

++	BUG_ON(bfqq == bfqd->in_service_queue);

5576

++

5577

++	bfq_log_bfqq(bfqd, bfqq, "add to busy");

5578

++

5579

++	bfq_activate_bfqq(bfqd, bfqq);

5580

++

5581

++	bfq_mark_bfqq_busy(bfqq);

5582

++	bfqd->busy_queues++;

5583

++	if (bfqq->raising_coeff > 1)

5584

++		bfqd->raised_busy_queues++;

5585

++}

5586

+diff --git a/block/bfq.h b/block/bfq.h

5587

+new file mode 100644

5588

+index 0000000..3ca8482

5589

+--- /dev/null

5590

++++ b/block/bfq.h

5591

+@@ -0,0 +1,622 @@

5592

++/*

5593

++ * BFQ-v7r2 for 3.14.0: data structures and common functions prototypes.

5594

++ *

5595

++ * Based on ideas and code from CFQ:

5596

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

5597

++ *

5598

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

5599

++ *		      Paolo Valente <paolo.valente@×××××××.it>

5600

++ *

5601

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

5602

++ */

5603

++

5604

++#ifndef _BFQ_H

5605

++#define _BFQ_H

5606

++

5607

++#include <linux/blktrace_api.h>

5608

++#include <linux/hrtimer.h>

5609

++#include <linux/ioprio.h>

5610

++#include <linux/rbtree.h>

5611

++

5612

++#define BFQ_IOPRIO_CLASSES	3

5613

++#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

5614

++

5615

++#define BFQ_MIN_WEIGHT	1

5616

++#define BFQ_MAX_WEIGHT	1000

5617

++

5618

++#define BFQ_DEFAULT_GRP_WEIGHT	10

5619

++#define BFQ_DEFAULT_GRP_IOPRIO	0

5620

++#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

5621

++

5622

++struct bfq_entity;

5623

++

5624

++/**

5625

++ * struct bfq_service_tree - per ioprio_class service tree.

5626

++ * @active: tree for active entities (i.e., those backlogged).

5627

++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).

5628

++ * @first_idle: idle entity with minimum F_i.

5629

++ * @last_idle: idle entity with maximum F_i.

5630

++ * @vtime: scheduler virtual time.

5631

++ * @wsum: scheduler weight sum; active and idle entities contribute to it.

5632

++ *

5633

++ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each

5634

++ * ioprio_class has its own independent scheduler, and so its own

5635

++ * bfq_service_tree.  All the fields are protected by the queue lock

5636

++ * of the containing bfqd.

5637

++ */

5638

++struct bfq_service_tree {

5639

++	struct rb_root active;

5640

++	struct rb_root idle;

5641

++

5642

++	struct bfq_entity *first_idle;

5643

++	struct bfq_entity *last_idle;

5644

++

5645

++	u64 vtime;

5646

++	unsigned long wsum;

5647

++};

5648

++

5649

++/**

5650

++ * struct bfq_sched_data - multi-class scheduler.

5651

++ * @in_service_entity: entity under service.

5652

++ * @next_in_service: head-of-the-line entity in the scheduler.

5653

++ * @service_tree: array of service trees, one per ioprio_class.

5654

++ *

5655

++ * bfq_sched_data is the basic scheduler queue.  It supports three

5656

++ * ioprio_classes, and can be used either as a toplevel queue or as

5657

++ * an intermediate queue on a hierarchical setup.

5658

++ * @next_in_service points to the active entity of the sched_data

5659

++ * service trees that will be scheduled next.

5660

++ *

5661

++ * The supported ioprio_classes are the same as in CFQ, in descending

5662

++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.

5663

++ * Requests from higher priority queues are served before all the

5664

++ * requests from lower priority queues; among requests of the same

5665

++ * queue requests are served according to B-WF2Q+.

5666

++ * All the fields are protected by the queue lock of the containing bfqd.

5667

++ */

5668

++struct bfq_sched_data {

5669

++	struct bfq_entity *in_service_entity;

5670

++	struct bfq_entity *next_in_service;

5671

++	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];

5672

++};

5673

++

5674

++/**

5675

++ * struct bfq_entity - schedulable entity.

5676

++ * @rb_node: service_tree member.

5677

++ * @on_st: flag, true if the entity is on a tree (either the active or

5678

++ *         the idle one of its service_tree).

5679

++ * @finish: B-WF2Q+ finish timestamp (aka F_i).

5680

++ * @start: B-WF2Q+ start timestamp (aka S_i).

5681

++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.

5682

++ * @min_start: minimum start time of the (active) subtree rooted at

5683

++ *             this entity; used for O(log N) lookups into active trees.

5684

++ * @service: service received during the last round of service.

5685

++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.

5686

++ * @weight: weight of the queue

5687

++ * @parent: parent entity, for hierarchical scheduling.

5688

++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the

5689

++ *                 associated scheduler queue, %NULL on leaf nodes.

5690

++ * @sched_data: the scheduler queue this entity belongs to.

5691

++ * @ioprio: the ioprio in use.

5692

++ * @new_weight: when a weight change is requested, the new weight value.

5693

++ * @orig_weight: original weight, used to implement weight boosting

5694

++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.

5695

++ * @ioprio_class: the ioprio_class in use.

5696

++ * @new_ioprio_class: when an ioprio_class change is requested, the new

5697

++ *                    ioprio_class value.

5698

++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or

5699

++ *                  ioprio_class change.

5700

++ *

5701

++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the

5702

++ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each

5703

++ * entity belongs to the sched_data of the parent group in the cgroup

5704

++ * hierarchy.  Non-leaf entities have also their own sched_data, stored

5705

++ * in @my_sched_data.

5706

++ *

5707

++ * Each entity stores independently its priority values; this would

5708

++ * allow different weights on different devices, but this

5709

++ * functionality is not exported to userspace by now.  Priorities and

5710

++ * weights are updated lazily, first storing the new values into the

5711

++ * new_* fields, then setting the @ioprio_changed flag.  As soon as

5712

++ * there is a transition in the entity state that allows the priority

5713

++ * update to take place the effective and the requested priority

5714

++ * values are synchronized.

5715

++ *

5716

++ * Unless cgroups are used, the weight value is calculated from the

5717

++ * ioprio to export the same interface as CFQ.  When dealing with

5718

++ * ``well-behaved'' queues (i.e., queues that do not spend too much

5719

++ * time to consume their budget and have true sequential behavior, and

5720

++ * when there are no external factors breaking anticipation) the

5721

++ * relative weights at each level of the cgroups hierarchy should be

5722

++ * guaranteed.  All the fields are protected by the queue lock of the

5723

++ * containing bfqd.

5724

++ */

5725

++struct bfq_entity {

5726

++	struct rb_node rb_node;

5727

++

5728

++	int on_st;

5729

++

5730

++	u64 finish;

5731

++	u64 start;

5732

++

5733

++	struct rb_root *tree;

5734

++

5735

++	u64 min_start;

5736

++

5737

++	unsigned long service, budget;

5738

++	unsigned short weight, new_weight;

5739

++	unsigned short orig_weight;

5740

++

5741

++	struct bfq_entity *parent;

5742

++

5743

++	struct bfq_sched_data *my_sched_data;

5744

++	struct bfq_sched_data *sched_data;

5745

++

5746

++	unsigned short ioprio, new_ioprio;

5747

++	unsigned short ioprio_class, new_ioprio_class;

5748

++

5749

++	int ioprio_changed;

5750

++};

5751

++

5752

++struct bfq_group;

5753

++

5754

++/**

5755

++ * struct bfq_queue - leaf schedulable entity.

5756

++ * @ref: reference counter.

5757

++ * @bfqd: parent bfq_data.

5758

++ * @new_bfqq: shared bfq_queue if queue is cooperating with

5759

++ *           one or more other queues.

5760

++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).

5761

++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).

5762

++ * @sort_list: sorted list of pending requests.

5763

++ * @next_rq: if fifo isn't expired, next request to serve.

5764

++ * @queued: nr of requests queued in @sort_list.

5765

++ * @allocated: currently allocated requests.

5766

++ * @meta_pending: pending metadata requests.

5767

++ * @fifo: fifo list of requests in sort_list.

5768

++ * @entity: entity representing this queue in the scheduler.

5769

++ * @max_budget: maximum budget allowed from the feedback mechanism.

5770

++ * @budget_timeout: budget expiration (in jiffies).

5771

++ * @dispatched: number of requests on the dispatch list or inside driver.

5772

++ * @org_ioprio: saved ioprio during boosted periods.

5773

++ * @flags: status flags.

5774

++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.

5775

++ * @seek_samples: number of seeks sampled

5776

++ * @seek_total: sum of the distances of the seeks sampled

5777

++ * @seek_mean: mean seek distance

5778

++ * @last_request_pos: position of the last request enqueued

5779

++ * @pid: pid of the process owning the queue, used for logging purposes.

5780

++ * @last_rais_start_finish: start time of the current weight-raising period if

5781

++ *                          the @bfq-queue is being weight-raised, otherwise

5782

++ *                          finish time of the last weight-raising period

5783

++ * @raising_cur_max_time: current max raising time for this queue

5784

++ * @soft_rt_next_start: minimum time instant such that, only if a new request

5785

++ *                      is enqueued after this time instant in an idle

5786

++ *                      @bfq_queue with no outstanding requests, then the

5787

++ *                      task associated with the queue is deemed as soft

5788

++ *                      real-time (see the comments to the function

5789

++ *                      bfq_bfqq_softrt_next_start())

5790

++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from

5791

++ *                      idle to backlogged

5792

++ * @service_from_backlogged: cumulative service received from the @bfq_queue

5793

++ *                           since the last transition from idle to backlogged

5794

++ *

5795

++ * A bfq_queue is a leaf request queue; it can be associated with an io_context

5796

++ * or more, if it is async or shared between cooperating processes. @cgroup

5797

++ * holds a reference to the cgroup, to be sure that it does not disappear while

5798

++ * a bfqq still references it (mostly to avoid races between request issuing and

5799

++ * task migration followed by cgroup destruction).

5800

++ * All the fields are protected by the queue lock of the containing bfqd.

5801

++ */

5802

++struct bfq_queue {

5803

++	atomic_t ref;

5804

++	struct bfq_data *bfqd;

5805

++

5806

++	/* fields for cooperating queues handling */

5807

++	struct bfq_queue *new_bfqq;

5808

++	struct rb_node pos_node;

5809

++	struct rb_root *pos_root;

5810

++

5811

++	struct rb_root sort_list;

5812

++	struct request *next_rq;

5813

++	int queued[2];

5814

++	int allocated[2];

5815

++	int meta_pending;

5816

++	struct list_head fifo;

5817

++

5818

++	struct bfq_entity entity;

5819

++

5820

++	unsigned long max_budget;

5821

++	unsigned long budget_timeout;

5822

++

5823

++	int dispatched;

5824

++

5825

++	unsigned short org_ioprio;

5826

++

5827

++	unsigned int flags;

5828

++

5829

++	struct list_head bfqq_list;

5830

++

5831

++	unsigned int seek_samples;

5832

++	u64 seek_total;

5833

++	sector_t seek_mean;

5834

++	sector_t last_request_pos;

5835

++

5836

++	pid_t pid;

5837

++

5838

++	/* weight-raising fields */

5839

++	unsigned long raising_cur_max_time;

5840

++	unsigned long soft_rt_next_start;

5841

++	unsigned long last_rais_start_finish;

5842

++	unsigned int raising_coeff;

5843

++	unsigned long last_idle_bklogged;

5844

++	unsigned long service_from_backlogged;

5845

++};

5846

++

5847

++/**

5848

++ * struct bfq_ttime - per process thinktime stats.

5849

++ * @ttime_total: total process thinktime

5850

++ * @ttime_samples: number of thinktime samples

5851

++ * @ttime_mean: average process thinktime

5852

++ */

5853

++struct bfq_ttime {

5854

++	unsigned long last_end_request;

5855

++

5856

++	unsigned long ttime_total;

5857

++	unsigned long ttime_samples;

5858

++	unsigned long ttime_mean;

5859

++};

5860

++

5861

++/**

5862

++ * struct bfq_io_cq - per (request_queue, io_context) structure.

5863

++ * @icq: associated io_cq structure

5864

++ * @bfqq: array of two process queues, the sync and the async

5865

++ * @ttime: associated @bfq_ttime struct

5866

++ */

5867

++struct bfq_io_cq {

5868

++	struct io_cq icq; /* must be the first member */

5869

++	struct bfq_queue *bfqq[2];

5870

++	struct bfq_ttime ttime;

5871

++	int ioprio;

5872

++};

5873

++

5874

++/**

5875

++ * struct bfq_data - per device data structure.

5876

++ * @queue: request queue for the managed device.

5877

++ * @root_group: root bfq_group for the device.

5878

++ * @rq_pos_tree: rbtree sorted by next_request position,

5879

++ *		used when determining if two or more queues

5880

++ *		have interleaving requests (see bfq_close_cooperator).

5881

++ * @busy_queues: number of bfq_queues containing requests (including the

5882

++ *		 queue under service, even if it is idling).

5883

++ * @raised_busy_queues: number of weight-raised busy bfq_queues.

5884

++ * @queued: number of queued requests.

5885

++ * @rq_in_driver: number of requests dispatched and waiting for completion.

5886

++ * @sync_flight: number of sync requests in the driver.

5887

++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples

5888

++ *		      completed requests.

5889

++ * @hw_tag_samples: nr of samples used to calculate hw_tag.

5890

++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.

5891

++ * @budgets_assigned: number of budgets assigned.

5892

++ * @idle_slice_timer: timer set when idling for the next sequential request

5893

++ *                    from the queue under service.

5894

++ * @unplug_work: delayed work to restart dispatching on the request queue.

5895

++ * @in_service_queue: bfq_queue under service.

5896

++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.

5897

++ * @last_position: on-disk position of the last served request.

5898

++ * @last_budget_start: beginning of the last budget.

5899

++ * @last_idling_start: beginning of the last idle slice.

5900

++ * @peak_rate: peak transfer rate observed for a budget.

5901

++ * @peak_rate_samples: number of samples used to calculate @peak_rate.

5902

++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.

5903

++ * @group_list: list of all the bfq_groups active on the device.

5904

++ * @active_list: list of all the bfq_queues active on the device.

5905

++ * @idle_list: list of all the bfq_queues idle on the device.

5906

++ * @bfq_quantum: max number of requests dispatched per dispatch round.

5907

++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires

5908

++ *                   requests are served in fifo order.

5909

++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.

5910

++ * @bfq_back_max: maximum allowed backward seek.

5911

++ * @bfq_slice_idle: maximum idling time.

5912

++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).

5913

++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to

5914

++ *                           async queues.

5915

++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to

5916

++ *               prevent seeky queues from imposing long latencies on well

5917

++ *               behaved ones (this also implies that seeky queues cannot

5918

++ *               receive guarantees in the service domain; after a timeout

5919

++ *               they are charged for the whole allocated budget, to try

5920

++ *               to preserve a behavior reasonably fair among them, but

5921

++ *               without service-domain guarantees).

5922

++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted

5923

++ *                            queue is multiplied

5924

++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)

5925

++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes

5926

++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising

5927

++ *			       may be reactivated for a queue (in jiffies)

5928

++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals

5929

++ *				     after which weight-raising may be

5930

++ *				     reactivated for an already busy queue

5931

++ *				     (in jiffies)

5932

++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,

5933

++ *			         sectors per second

5934

++ * @RT_prod: cached value of the product R*T used for computing the maximum

5935

++ *	     duration of the weight raising automatically

5936

++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions

5937

++ *

5938

++ * All the fields are protected by the @queue lock.

5939

++ */

5940

++struct bfq_data {

5941

++	struct request_queue *queue;

5942

++

5943

++	struct bfq_group *root_group;

5944

++

5945

++	struct rb_root rq_pos_tree;

5946

++

5947

++	int busy_queues;

5948

++	int raised_busy_queues;

5949

++	int queued;

5950

++	int rq_in_driver;

5951

++	int sync_flight;

5952

++

5953

++	int max_rq_in_driver;

5954

++	int hw_tag_samples;

5955

++	int hw_tag;

5956

++

5957

++	int budgets_assigned;

5958

++

5959

++	struct timer_list idle_slice_timer;

5960

++	struct work_struct unplug_work;

5961

++

5962

++	struct bfq_queue *in_service_queue;

5963

++	struct bfq_io_cq *in_service_bic;

5964

++

5965

++	sector_t last_position;

5966

++

5967

++	ktime_t last_budget_start;

5968

++	ktime_t last_idling_start;

5969

++	int peak_rate_samples;

5970

++	u64 peak_rate;

5971

++	unsigned long bfq_max_budget;

5972

++

5973

++	struct hlist_head group_list;

5974

++	struct list_head active_list;

5975

++	struct list_head idle_list;

5976

++

5977

++	unsigned int bfq_quantum;

5978

++	unsigned int bfq_fifo_expire[2];

5979

++	unsigned int bfq_back_penalty;

5980

++	unsigned int bfq_back_max;

5981

++	unsigned int bfq_slice_idle;

5982

++	u64 bfq_class_idle_last_service;

5983

++

5984

++	unsigned int bfq_user_max_budget;

5985

++	unsigned int bfq_max_budget_async_rq;

5986

++	unsigned int bfq_timeout[2];

5987

++

5988

++	bool low_latency;

5989

++

5990

++	/* parameters of the low_latency heuristics */

5991

++	unsigned int bfq_raising_coeff;

5992

++	unsigned int bfq_raising_max_time;

5993

++	unsigned int bfq_raising_rt_max_time;

5994

++	unsigned int bfq_raising_min_idle_time;

5995

++	unsigned long bfq_raising_min_inter_arr_async;

5996

++	unsigned int bfq_raising_max_softrt_rate;

5997

++	u64 RT_prod;

5998

++

5999

++	struct bfq_queue oom_bfqq;

6000

++};

6001

++

6002

++enum bfqq_state_flags {

6003

++	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is under service */

6004

++	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */

6005

++	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */

6006

++	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */

6007

++	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */

6008

++	BFQ_BFQQ_FLAG_prio_changed,	/* task priority has changed */

6009

++	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

6010

++	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

6011

++	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

6012

++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be splitted */

6013

++	BFQ_BFQQ_FLAG_softrt_update,	/* needs softrt-next-start update */

6014

++};

6015

++

6016

++#define BFQ_BFQQ_FNS(name)						\

6017

++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\

6018

++{									\

6019

++	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\

6020

++}									\

6021

++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)	\

6022

++{									\

6023

++	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\

6024

++}									\

6025

++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\

6026

++{									\

6027

++	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\

6028

++}

6029

++

6030

++BFQ_BFQQ_FNS(busy);

6031

++BFQ_BFQQ_FNS(wait_request);

6032

++BFQ_BFQQ_FNS(must_alloc);

6033

++BFQ_BFQQ_FNS(fifo_expire);

6034

++BFQ_BFQQ_FNS(idle_window);

6035

++BFQ_BFQQ_FNS(prio_changed);

6036

++BFQ_BFQQ_FNS(sync);

6037

++BFQ_BFQQ_FNS(budget_new);

6038

++BFQ_BFQQ_FNS(coop);

6039

++BFQ_BFQQ_FNS(split_coop);

6040

++BFQ_BFQQ_FNS(softrt_update);

6041

++#undef BFQ_BFQQ_FNS

6042

++

6043

++/* Logging facilities. */

6044

++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \

6045

++	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)

6046

++

6047

++#define bfq_log(bfqd, fmt, args...) \

6048

++	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

6049

++

6050

++/* Expiration reasons. */

6051

++enum bfqq_expiration {

6052

++	BFQ_BFQQ_TOO_IDLE = 0,		/* queue has been idling for too long */

6053

++	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */

6054

++	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */

6055

++	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */

6056

++};

6057

++

6058

++#ifdef CONFIG_CGROUP_BFQIO

6059

++/**

6060

++ * struct bfq_group - per (device, cgroup) data structure.

6061

++ * @entity: schedulable entity to insert into the parent group sched_data.

6062

++ * @sched_data: own sched_data, to contain child entities (they may be

6063

++ *              both bfq_queues and bfq_groups).

6064

++ * @group_node: node to be inserted into the bfqio_cgroup->group_data

6065

++ *              list of the containing cgroup's bfqio_cgroup.

6066

++ * @bfqd_node: node to be inserted into the @bfqd->group_list list

6067

++ *             of the groups active on the same device; used for cleanup.

6068

++ * @bfqd: the bfq_data for the device this group acts upon.

6069

++ * @async_bfqq: array of async queues for all the tasks belonging to

6070

++ *              the group, one queue per ioprio value per ioprio_class,

6071

++ *              except for the idle class that has only one queue.

6072

++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).

6073

++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used

6074

++ *             to avoid too many special cases during group creation/migration.

6075

++ *

6076

++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup

6077

++ * there is a set of bfq_groups, each one collecting the lower-level

6078

++ * entities belonging to the group that are acting on the same device.

6079

++ *

6080

++ * Locking works as follows:

6081

++ *    o @group_node is protected by the bfqio_cgroup lock, and is accessed

6082

++ *      via RCU from its readers.

6083

++ *    o @bfqd is protected by the queue lock, RCU is used to access it

6084

++ *      from the readers.

6085

++ *    o All the other fields are protected by the @bfqd queue lock.

6086

++ */

6087

++struct bfq_group {

6088

++	struct bfq_entity entity;

6089

++	struct bfq_sched_data sched_data;

6090

++

6091

++	struct hlist_node group_node;

6092

++	struct hlist_node bfqd_node;

6093

++

6094

++	void *bfqd;

6095

++

6096

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

6097

++	struct bfq_queue *async_idle_bfqq;

6098

++

6099

++	struct bfq_entity *my_entity;

6100

++};

6101

++

6102

++/**

6103

++ * struct bfqio_cgroup - bfq cgroup data structure.

6104

++ * @css: subsystem state for bfq in the containing cgroup.

6105

++ * @online: flag marked when the subsystem is inserted.

6106

++ * @weight: cgroup weight.

6107

++ * @ioprio: cgroup ioprio.

6108

++ * @ioprio_class: cgroup ioprio_class.

6109

++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.

6110

++ * @group_data: list containing the bfq_group belonging to this cgroup.

6111

++ *

6112

++ * @group_data is accessed using RCU, with @lock protecting the updates,

6113

++ * @ioprio and @ioprio_class are protected by @lock.

6114

++ */

6115

++struct bfqio_cgroup {

6116

++	struct cgroup_subsys_state css;

6117

++	bool online;

6118

++

6119

++	unsigned short weight, ioprio, ioprio_class;

6120

++

6121

++	spinlock_t lock;

6122

++	struct hlist_head group_data;

6123

++};

6124

++#else

6125

++struct bfq_group {

6126

++	struct bfq_sched_data sched_data;

6127

++

6128

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

6129

++	struct bfq_queue *async_idle_bfqq;

6130

++};

6131

++#endif

6132

++

6133

++static inline struct bfq_service_tree *

6134

++bfq_entity_service_tree(struct bfq_entity *entity)

6135

++{

6136

++	struct bfq_sched_data *sched_data = entity->sched_data;

6137

++	unsigned int idx = entity->ioprio_class - 1;

6138

++

6139

++	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);

6140

++	BUG_ON(sched_data == NULL);

6141

++

6142

++	return sched_data->service_tree + idx;

6143

++}

6144

++

6145

++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,

6146

++					    int is_sync)

6147

++{

6148

++	return bic->bfqq[!!is_sync];

6149

++}

6150

++

6151

++static inline void bic_set_bfqq(struct bfq_io_cq *bic,

6152

++				struct bfq_queue *bfqq, int is_sync)

6153

++{

6154

++	bic->bfqq[!!is_sync] = bfqq;

6155

++}

6156

++

6157

++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)

6158

++{

6159

++	return bic->icq.q->elevator->elevator_data;

6160

++}

6161

++

6162

++/**

6163

++ * bfq_get_bfqd_locked - get a lock to a bfqd using an RCU-protected pointer.

6164

++ * @ptr: a pointer to a bfqd.

6165

++ * @flags: storage for the flags to be saved.

6166

++ *

6167

++ * This function allows bfqg->bfqd to be protected by the

6168

++ * queue lock of the bfqd they reference; the pointer is dereferenced

6169

++ * under RCU, so the storage for bfqd is assured to be safe as long

6170

++ * as the RCU read side critical section does not end.  After the

6171

++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be

6172

++ * sure that no other writer accessed it.  If we raced with a writer,

6173

++ * the function returns NULL, with the queue unlocked, otherwise it

6174

++ * returns the dereferenced pointer, with the queue locked.

6175

++ */

6176

++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,

6177

++						   unsigned long *flags)

6178

++{

6179

++	struct bfq_data *bfqd;

6180

++

6181

++	rcu_read_lock();

6182

++	bfqd = rcu_dereference(*(struct bfq_data **)ptr);

6183

++

6184

++	if (bfqd != NULL) {

6185

++		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);

6186

++		if (*ptr == bfqd)

6187

++			goto out;

6188

++		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

6189

++	}

6190

++

6191

++	bfqd = NULL;

6192

++out:

6193

++	rcu_read_unlock();

6194

++	return bfqd;

6195

++}

6196

++

6197

++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,

6198

++				       unsigned long *flags)

6199

++{

6200

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

6201

++}

6202

++

6203

++static void bfq_changed_ioprio(struct bfq_io_cq *bic);

6204

++static void bfq_put_queue(struct bfq_queue *bfqq);

6205

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);

6206

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

6207

++				       struct bfq_group *bfqg, int is_sync,

6208

++				       struct bfq_io_cq *bic, gfp_t gfp_mask);

6209

++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,

6210

++					 struct bfq_group *bfqg);

6211

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);

6212

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);

6213

++#endif

6214

+--

6215

+1.9.0

6216

+

6217

6218

Added: genpatches-2.6/trunk/3.14/5003_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r2-for-3.14.0.patch

6219

===================================================================

6220

--- genpatches-2.6/trunk/3.14/5003_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r2-for-3.14.0.patch	                        (rev 0)

6221

+++ genpatches-2.6/trunk/3.14/5003_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r2-for-3.14.0.patch	2014-04-01 12:32:06 UTC (rev 2724)

6222

@@ -0,0 +1,1038 @@

6223

+From 4fbeda28a90d7fccd05d28a89d9fc409b2344e0a Mon Sep 17 00:00:00 2001

6224

+From: Mauro Andreolini <mauro.andreolini@×××××××.it>

6225

+Date: Fri, 14 Feb 2014 12:52:49 +0100

6226

+Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r2 for

6227

+ 3.14.0

6228

+

6229

+A set of processes may happen  to  perform interleaved reads, i.e., requests

6230

+whose union would give rise to a  sequential read  pattern.  There are two

6231

+typical  cases: in the first  case,   processes  read  fixed-size chunks of

6232

+data at a fixed distance from each other, while in the second case processes

6233

+may read variable-size chunks at  variable distances. The latter case occurs

6234

+for  example with  KVM, which  splits the  I/O generated  by the  guest into

6235

+multiple chunks,  and lets these chunks  be served by a  pool of cooperating

6236

+processes,  iteratively  assigning  the  next  chunk of  I/O  to  the first

6237

+available  process. CFQ  uses actual  queue merging  for the  first type of

6238

+rocesses, whereas it  uses preemption to get a sequential  read pattern out

6239

+of the read requests  performed by the second type of  processes. In the end

6240

+it uses  two different  mechanisms to  achieve the  same goal: boosting the

6241

+throughput with interleaved I/O.

6242

+

6243

+This patch introduces  Early Queue Merge (EQM), a unified mechanism to get a

6244

+sequential  read pattern  with both  types of  processes. The  main idea is

6245

+checking newly arrived requests against the next request of the active queue

6246

+both in case of actual request insert and in case of request merge. By doing

6247

+so, both the types of processes can be handled by just merging their queues.

6248

+EQM is  then simpler and  more compact than the  pair of mechanisms used in

6249

+CFQ.

6250

+

6251

+Finally, EQM  also preserves the  typical low-latency properties of BFQ, by

6252

+properly restoring the weight-raising state of  a queue when it gets back to

6253

+a non-merged state.

6254

+

6255

+Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>

6256

+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>

6257

+Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>

6258

+---

6259

+ block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------

6260

+ block/bfq-sched.c   |  28 ---

6261

+ block/bfq.h         |  20 +-

6262

+ 3 files changed, 476 insertions(+), 229 deletions(-)

6263

+

6264

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

6265

+index f5f71e4..0d3503d 100644

6266

+--- a/block/bfq-iosched.c

6267

++++ b/block/bfq-iosched.c

6268

+@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)

6269

+ 	return dur;

6270

+ }

6271

+

6272

++static inline void

6273

++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

6274

++{

6275

++	if (bic->saved_idle_window)

6276

++		bfq_mark_bfqq_idle_window(bfqq);

6277

++	else

6278

++		bfq_clear_bfqq_idle_window(bfqq);

6279

++	if (bic->raising_time_left && bfqq->bfqd->low_latency) {

6280

++		/*

6281

++		 * Start a weight raising period with the duration given by

6282

++		 * the raising_time_left snapshot.

6283

++		 */

6284

++		if (bfq_bfqq_busy(bfqq))

6285

++			bfqq->bfqd->raised_busy_queues++;

6286

++		bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;

6287

++		bfqq->raising_cur_max_time = bic->raising_time_left;

6288

++		bfqq->last_rais_start_finish = jiffies;

6289

++		bfqq->entity.ioprio_changed = 1;

6290

++	}

6291

++	/*

6292

++	 * Clear raising_time_left to prevent bfq_bfqq_save_state() from

6293

++	 * getting confused about the queue's need of a weight-raising

6294

++	 * period.

6295

++	 */

6296

++	bic->raising_time_left = 0;

6297

++}

6298

++

6299

++/*

6300

++ * Must be called with the queue_lock held.

6301

++ */

6302

++static int bfqq_process_refs(struct bfq_queue *bfqq)

6303

++{

6304

++	int process_refs, io_refs;

6305

++

6306

++	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

6307

++	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

6308

++	BUG_ON(process_refs < 0);

6309

++	return process_refs;

6310

++}

6311

++

6312

+ static void bfq_add_rq_rb(struct request *rq)

6313

+ {

6314

+ 	struct bfq_queue *bfqq = RQ_BFQQ(rq);

6315

+@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)

6316

+ 		if (!bfqd->low_latency)

6317

+ 			goto add_bfqq_busy;

6318

+

6319

++		if (bfq_bfqq_just_split(bfqq))

6320

++			goto set_ioprio_changed;

6321

++

6322

+ 		/*

6323

+-		 * If the queue is not being boosted and has been idle

6324

+-		 * for enough time, start a weight-raising period

6325

++		 * If the queue:

6326

++		 * - is not being boosted,

6327

++		 * - has been idle for enough time,

6328

++		 * - is not a sync queue or is linked to a bfq_io_cq (it is

6329

++		 *   shared "for its nature" or it is not shared and its

6330

++		 *   requests have not been redirected to a shared queue)

6331

++		 * start a weight-raising period.

6332

+ 		 */

6333

+-		if (old_raising_coeff == 1 &&

6334

+-		    (idle_for_long_time || soft_rt)) {

6335

++		if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&

6336

++		    (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {

6337

+ 			bfqq->raising_coeff = bfqd->bfq_raising_coeff;

6338

+ 			if (idle_for_long_time)

6339

+ 				bfqq->raising_cur_max_time =

6340

+@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq)

6341

+ 					bfqd->bfq_raising_rt_max_time;

6342

+ 			}

6343

+ 		}

6344

++set_ioprio_changed:

6345

+ 		if (old_raising_coeff != bfqq->raising_coeff)

6346

+ 			entity->ioprio_changed = 1;

6347

+ add_bfqq_busy:

6348

+@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)

6349

+ 	spin_unlock_irq(bfqd->queue->queue_lock);

6350

+ }

6351

+

6352

+-static int bfq_allow_merge(struct request_queue *q, struct request *rq,

6353

+-			   struct bio *bio)

6354

+-{

6355

+-	struct bfq_data *bfqd = q->elevator->elevator_data;

6356

+-	struct bfq_io_cq *bic;

6357

+-	struct bfq_queue *bfqq;

6358

+-

6359

+-	/*

6360

+-	 * Disallow merge of a sync bio into an async request.

6361

+-	 */

6362

+-	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

6363

+-		return 0;

6364

+-

6365

+-	/*

6366

+-	 * Lookup the bfqq that this bio will be queued with. Allow

6367

+-	 * merge only if rq is queued there.

6368

+-	 * Queue lock is held here.

6369

+-	 */

6370

+-	bic = bfq_bic_lookup(bfqd, current->io_context);

6371

+-	if (bic == NULL)

6372

+-		return 0;

6373

+-

6374

+-	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

6375

+-	return bfqq == RQ_BFQQ(rq);

6376

+-}

6377

+-

6378

+-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

6379

+-				       struct bfq_queue *bfqq)

6380

+-{

6381

+-	if (bfqq != NULL) {

6382

+-		bfq_mark_bfqq_must_alloc(bfqq);

6383

+-		bfq_mark_bfqq_budget_new(bfqq);

6384

+-		bfq_clear_bfqq_fifo_expire(bfqq);

6385

+-

6386

+-		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

6387

+-

6388

+-		bfq_log_bfqq(bfqd, bfqq,

6389

+-			     "set_in_service_queue, cur-budget = %lu",

6390

+-			     bfqq->entity.budget);

6391

+-	}

6392

+-

6393

+-	bfqd->in_service_queue = bfqq;

6394

+-}

6395

+-

6396

+-/*

6397

+- * Get and set a new queue for service.

6398

+- */

6399

+-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,

6400

+-						  struct bfq_queue *bfqq)

6401

++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)

6402

+ {

6403

+-	if (!bfqq)

6404

+-		bfqq = bfq_get_next_queue(bfqd);

6405

++	if (request)

6406

++		return blk_rq_pos(io_struct);

6407

+ 	else

6408

+-		bfq_get_next_queue_forced(bfqd, bfqq);

6409

+-

6410

+-	__bfq_set_in_service_queue(bfqd, bfqq);

6411

+-	return bfqq;

6412

++		return ((struct bio *)io_struct)->bi_iter.bi_sector;

6413

+ }

6414

+

6415

+-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,

6416

+-					  struct request *rq)

6417

++static inline sector_t bfq_dist_from(sector_t pos1,

6418

++				     sector_t pos2)

6419

+ {

6420

+-	if (blk_rq_pos(rq) >= bfqd->last_position)

6421

+-		return blk_rq_pos(rq) - bfqd->last_position;

6422

++	if (pos1 >= pos2)

6423

++		return pos1 - pos2;

6424

+ 	else

6425

+-		return bfqd->last_position - blk_rq_pos(rq);

6426

++		return pos2 - pos1;

6427

+ }

6428

+

6429

+-/*

6430

+- * Return true if bfqq has no request pending and rq is close enough to

6431

+- * bfqd->last_position, or if rq is closer to bfqd->last_position than

6432

+- * bfqq->next_rq

6433

+- */

6434

+-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)

6435

++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,

6436

++					 sector_t sector)

6437

+ {

6438

+-	return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;

6439

++	return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=

6440

++	       BFQQ_SEEK_THR;

6441

+ }

6442

+

6443

+-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

6444

++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)

6445

+ {

6446

+ 	struct rb_root *root = &bfqd->rq_pos_tree;

6447

+ 	struct rb_node *parent, *node;

6448

+ 	struct bfq_queue *__bfqq;

6449

+-	sector_t sector = bfqd->last_position;

6450

+

6451

+ 	if (RB_EMPTY_ROOT(root))

6452

+ 		return NULL;

6453

+@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

6454

+ 	 * position).

6455

+ 	 */

6456

+ 	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);

6457

+-	if (bfq_rq_close(bfqd, __bfqq->next_rq))

6458

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

6459

+ 		return __bfqq;

6460

+

6461

+ 	if (blk_rq_pos(__bfqq->next_rq) < sector)

6462

+@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

6463

+ 		return NULL;

6464

+

6465

+ 	__bfqq = rb_entry(node, struct bfq_queue, pos_node);

6466

+-	if (bfq_rq_close(bfqd, __bfqq->next_rq))

6467

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

6468

+ 		return __bfqq;

6469

+

6470

+ 	return NULL;

6471

+@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

6472

+ /*

6473

+  * bfqd - obvious

6474

+  * cur_bfqq - passed in so that we don't decide that the current queue

6475

+- *            is closely cooperating with itself.

6476

+- *

6477

+- * We are assuming that cur_bfqq has dispatched at least one request,

6478

+- * and that bfqd->last_position reflects a position on the disk associated

6479

+- * with the I/O issued by cur_bfqq.

6480

++ *            is closely cooperating with itself

6481

++ * sector - used as a reference point to search for a close queue

6482

+  */

6483

+ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

6484

+-					      struct bfq_queue *cur_bfqq)

6485

++					      struct bfq_queue *cur_bfqq,

6486

++					      sector_t sector)

6487

+ {

6488

+ 	struct bfq_queue *bfqq;

6489

+

6490

+@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

6491

+ 	 * working closely on the same area of the disk. In that case,

6492

+ 	 * we can group them together and don't waste time idling.

6493

+ 	 */

6494

+-	bfqq = bfqq_close(bfqd);

6495

++	bfqq = bfqq_close(bfqd, sector);

6496

+ 	if (bfqq == NULL || bfqq == cur_bfqq)

6497

+ 		return NULL;

6498

+

6499

+@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

6500

+ 	return bfqq;

6501

+ }

6502

+

6503

++static struct bfq_queue *

6504

++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

6505

++{

6506

++	int process_refs, new_process_refs;

6507

++	struct bfq_queue *__bfqq;

6508

++

6509

++	/*

6510

++	 * If there are no process references on the new_bfqq, then it is

6511

++	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

6512

++	 * may have dropped their last reference (not just their last process

6513

++	 * reference).

6514

++	 */

6515

++	if (!bfqq_process_refs(new_bfqq))

6516

++		return NULL;

6517

++

6518

++	/* Avoid a circular list and skip interim queue merges. */

6519

++	while ((__bfqq = new_bfqq->new_bfqq)) {

6520

++		if (__bfqq == bfqq)

6521

++			return NULL;

6522

++		new_bfqq = __bfqq;

6523

++	}

6524

++

6525

++	process_refs = bfqq_process_refs(bfqq);

6526

++	new_process_refs = bfqq_process_refs(new_bfqq);

6527

++	/*

6528

++	 * If the process for the bfqq has gone away, there is no

6529

++	 * sense in merging the queues.

6530

++	 */

6531

++	if (process_refs == 0 || new_process_refs == 0)

6532

++		return NULL;

6533

++

6534

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

6535

++		new_bfqq->pid);

6536

++

6537

++	/*

6538

++	 * Merging is just a redirection: the requests of the process owning

6539

++	 * one of the two queues are redirected to the other queue. The latter

6540

++	 * queue, in its turn, is set as shared if this is the first time that

6541

++	 * the requests of some process are redirected to it.

6542

++	 *

6543

++	 * We redirect bfqq to new_bfqq and not the opposite, because we

6544

++	 * are in the context of the process owning bfqq, hence we have the

6545

++	 * io_cq of this process. So we can immediately configure this io_cq

6546

++	 * to redirect the requests of the process to new_bfqq.

6547

++	 *

6548

++	 * NOTE, even if new_bfqq coincides with the in-service queue, the

6549

++	 * io_cq of new_bfqq is not available, because, if the in-service queue

6550

++	 * is shared, bfqd->in_service_bic may not point to the io_cq of the

6551

++	 * in-service queue.

6552

++	 * Redirecting the requests of the process owning bfqq to the currently

6553

++	 * in-service queue is in any case the best option, as we feed the

6554

++	 * in-service queue with new requests close to the last request served

6555

++	 * and, by doing so, hopefully increase the throughput.

6556

++	 */

6557

++	bfqq->new_bfqq = new_bfqq;

6558

++	atomic_add(process_refs, &new_bfqq->ref);

6559

++	return new_bfqq;

6560

++}

6561

++

6562

++/*

6563

++ * Attempt to schedule a merge of bfqq with the currently in-service queue or

6564

++ * with a close queue among the scheduled queues.

6565

++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue

6566

++ * structure otherwise.

6567

++ */

6568

++static struct bfq_queue *

6569

++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6570

++		     void *io_struct, bool request)

6571

++{

6572

++	struct bfq_queue *in_service_bfqq, *new_bfqq;

6573

++

6574

++	if (bfqq->new_bfqq)

6575

++		return bfqq->new_bfqq;

6576

++

6577

++	if (!io_struct)

6578

++		return NULL;

6579

++

6580

++	in_service_bfqq = bfqd->in_service_queue;

6581

++

6582

++	if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||

6583

++	    !bfqd->in_service_bic)

6584

++		goto check_scheduled;

6585

++

6586

++	if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))

6587

++		goto check_scheduled;

6588

++

6589

++	if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))

6590

++		goto check_scheduled;

6591

++

6592

++	if (in_service_bfqq->entity.parent != bfqq->entity.parent)

6593

++		goto check_scheduled;

6594

++

6595

++	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&

6596

++	    bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {

6597

++		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);

6598

++		if (new_bfqq != NULL)

6599

++			return new_bfqq; /* Merge with the in-service queue */

6600

++	}

6601

++

6602

++	/*

6603

++	 * Check whether there is a cooperator among currently scheduled

6604

++	 * queues. The only thing we need is that the bio/request is not

6605

++	 * NULL, as we need it to establish whether a cooperator exists.

6606

++	 */

6607

++check_scheduled:

6608

++	new_bfqq = bfq_close_cooperator(bfqd, bfqq,

6609

++					bfq_io_struct_pos(io_struct, request));

6610

++	if (new_bfqq)

6611

++		return bfq_setup_merge(bfqq, new_bfqq);

6612

++

6613

++	return NULL;

6614

++}

6615

++

6616

++static inline void

6617

++bfq_bfqq_save_state(struct bfq_queue *bfqq)

6618

++{

6619

++	/*

6620

++	 * If bfqq->bic == NULL, the queue is already shared or its requests

6621

++	 * have already been redirected to a shared queue; both idle window

6622

++	 * and weight raising state have already been saved. Do nothing.

6623

++	 */

6624

++	if (bfqq->bic == NULL)

6625

++		return;

6626

++	if (bfqq->bic->raising_time_left)

6627

++		/*

6628

++		 * This is the queue of a just-started process, and would

6629

++		 * deserve weight raising: we set raising_time_left to the full

6630

++		 * weight-raising duration to trigger weight-raising when and

6631

++		 * if the queue is split and the first request of the queue

6632

++		 * is enqueued.

6633

++		 */

6634

++		bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);

6635

++	else if (bfqq->raising_coeff > 1) {

6636

++		unsigned long wrais_duration =

6637

++			jiffies - bfqq->last_rais_start_finish;

6638

++		/*

6639

++		 * It may happen that a queue's weight raising period lasts

6640

++		 * longer than its raising_cur_max_time, as weight raising is

6641

++		 * handled only when a request is enqueued or dispatched (it

6642

++		 * does not use any timer). If the weight raising period is

6643

++		 * already over, don't save it.

6644

++		 */

6645

++		if (bfqq->raising_cur_max_time <= wrais_duration)

6646

++			bfqq->bic->raising_time_left = 0;

6647

++		else

6648

++			bfqq->bic->raising_time_left =

6649

++				bfqq->raising_cur_max_time - wrais_duration;

6650

++		/*

6651

++		 * The bfq_queue is becoming shared or the requests of the

6652

++		 * process owning the queue are being redirected to a shared

6653

++		 * queue. Stop the weight raising period of the queue, as in

6654

++		 * both cases it should not be owned by an interactive or soft

6655

++		 * real-time application.

6656

++		 */

6657

++		bfq_bfqq_end_raising(bfqq);

6658

++	} else

6659

++		bfqq->bic->raising_time_left = 0;

6660

++	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);

6661

++}

6662

++

6663

++static inline void

6664

++bfq_get_bic_reference(struct bfq_queue *bfqq)

6665

++{

6666

++	/*

6667

++	 * If bfqq->bic has a non-NULL value, the bic to which it belongs

6668

++	 * is about to begin using a shared bfq_queue.

6669

++	 */

6670

++	if (bfqq->bic)

6671

++		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);

6672

++}

6673

++

6674

++static void

6675

++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

6676

++		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

6677

++{

6678

++	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

6679

++		(long unsigned)new_bfqq->pid);

6680

++	/* Save weight raising and idle window of the merged queues */

6681

++	bfq_bfqq_save_state(bfqq);

6682

++	bfq_bfqq_save_state(new_bfqq);

6683

++	/*

6684

++	 * Grab a reference to the bic, to prevent it from being destroyed

6685

++	 * before being possibly touched by a bfq_split_bfqq().

6686

++	 */

6687

++	bfq_get_bic_reference(bfqq);

6688

++	bfq_get_bic_reference(new_bfqq);

6689

++	/* Merge queues (that is, let bic redirect its requests to new_bfqq) */

6690

++	bic_set_bfqq(bic, new_bfqq, 1);

6691

++	bfq_mark_bfqq_coop(new_bfqq);

6692

++	/*

6693

++	 * new_bfqq now belongs to at least two bics (it is a shared queue): set

6694

++	 * new_bfqq->bic to NULL. bfqq either:

6695

++	 * - does not belong to any bic any more, and hence bfqq->bic must

6696

++	 *   be set to NULL, or

6697

++	 * - is a queue whose owning bics have already been redirected to a

6698

++	 *   different queue, hence the queue is destined to not belong to any

6699

++	 *   bic soon and bfqq->bic is already NULL (therefore the next

6700

++	 *   assignment causes no harm).

6701

++	 */

6702

++	new_bfqq->bic = NULL;

6703

++	bfqq->bic = NULL;

6704

++	bfq_put_queue(bfqq);

6705

++}

6706

++

6707

++static int bfq_allow_merge(struct request_queue *q, struct request *rq,

6708

++			   struct bio *bio)

6709

++{

6710

++	struct bfq_data *bfqd = q->elevator->elevator_data;

6711

++	struct bfq_io_cq *bic;

6712

++	struct bfq_queue *bfqq, *new_bfqq;

6713

++

6714

++	/*

6715

++	 * Disallow merge of a sync bio into an async request.

6716

++	 */

6717

++	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

6718

++		return 0;

6719

++

6720

++	/*

6721

++	 * Lookup the bfqq that this bio will be queued with. Allow

6722

++	 * merge only if rq is queued there.

6723

++	 * Queue lock is held here.

6724

++	 */

6725

++	bic = bfq_bic_lookup(bfqd, current->io_context);

6726

++	if (bic == NULL)

6727

++		return 0;

6728

++

6729

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

6730

++	/*

6731

++	 * We take advantage of this function to perform an early merge

6732

++	 * of the queues of possible cooperating processes.

6733

++	 */

6734

++	if (bfqq != NULL) {

6735

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);

6736

++		if (new_bfqq != NULL) {

6737

++			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);

6738

++			/*

6739

++			 * If we get here, the bio will be queued in the shared

6740

++			 * queue, i.e., new_bfqq, so use new_bfqq to decide

6741

++			 * whether bio and rq can be merged.

6742

++			 */

6743

++			bfqq = new_bfqq;

6744

++		}

6745

++	}

6746

++

6747

++	return bfqq == RQ_BFQQ(rq);

6748

++}

6749

++

6750

++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

6751

++				       struct bfq_queue *bfqq)

6752

++{

6753

++	if (bfqq != NULL) {

6754

++		bfq_mark_bfqq_must_alloc(bfqq);

6755

++		bfq_mark_bfqq_budget_new(bfqq);

6756

++		bfq_clear_bfqq_fifo_expire(bfqq);

6757

++

6758

++		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

6759

++

6760

++		bfq_log_bfqq(bfqd, bfqq,

6761

++			     "set_in_service_queue, cur-budget = %lu",

6762

++			     bfqq->entity.budget);

6763

++	}

6764

++

6765

++	bfqd->in_service_queue = bfqq;

6766

++}

6767

++

6768

++/*

6769

++ * Get and set a new queue for service.

6770

++ */

6771

++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)

6772

++{

6773

++	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

6774

++

6775

++	__bfq_set_in_service_queue(bfqd, bfqq);

6776

++	return bfqq;

6777

++}

6778

++

6779

+ /*

6780

+  * If enough samples have been computed, return the current max budget

6781

+  * stored in bfqd, which is dynamically updated according to the

6782

+@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

6783

+ 	return rq;

6784

+ }

6785

+

6786

+-/*

6787

+- * Must be called with the queue_lock held.

6788

+- */

6789

+-static int bfqq_process_refs(struct bfq_queue *bfqq)

6790

+-{

6791

+-	int process_refs, io_refs;

6792

+-

6793

+-	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

6794

+-	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

6795

+-	BUG_ON(process_refs < 0);

6796

+-	return process_refs;

6797

+-}

6798

+-

6799

+-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

6800

+-{

6801

+-	int process_refs, new_process_refs;

6802

+-	struct bfq_queue *__bfqq;

6803

+-

6804

+-	/*

6805

+-	 * If there are no process references on the new_bfqq, then it is

6806

+-	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

6807

+-	 * may have dropped their last reference (not just their last process

6808

+-	 * reference).

6809

+-	 */

6810

+-	if (!bfqq_process_refs(new_bfqq))

6811

+-		return;

6812

+-

6813

+-	/* Avoid a circular list and skip interim queue merges. */

6814

+-	while ((__bfqq = new_bfqq->new_bfqq)) {

6815

+-		if (__bfqq == bfqq)

6816

+-			return;

6817

+-		new_bfqq = __bfqq;

6818

+-	}

6819

+-

6820

+-	process_refs = bfqq_process_refs(bfqq);

6821

+-	new_process_refs = bfqq_process_refs(new_bfqq);

6822

+-	/*

6823

+-	 * If the process for the bfqq has gone away, there is no

6824

+-	 * sense in merging the queues.

6825

+-	 */

6826

+-	if (process_refs == 0 || new_process_refs == 0)

6827

+-		return;

6828

+-

6829

+-	/*

6830

+-	 * Merge in the direction of the lesser amount of work.

6831

+-	 */

6832

+-	if (new_process_refs >= process_refs) {

6833

+-		bfqq->new_bfqq = new_bfqq;

6834

+-		atomic_add(process_refs, &new_bfqq->ref);

6835

+-	} else {

6836

+-		new_bfqq->new_bfqq = bfqq;

6837

+-		atomic_add(new_process_refs, &bfqq->ref);

6838

+-	}

6839

+-	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

6840

+-		new_bfqq->pid);

6841

+-}

6842

+-

6843

+ static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)

6844

+ {

6845

+ 	struct bfq_entity *entity = &bfqq->entity;

6846

+@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)

6847

+  */

6848

+ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

6849

+ {

6850

+-	struct bfq_queue *bfqq, *new_bfqq = NULL;

6851

++	struct bfq_queue *bfqq;

6852

+ 	struct request *next_rq;

6853

+ 	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

6854

+

6855

+@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

6856

+

6857

+ 	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

6858

+

6859

+-	/*

6860

+-         * If another queue has a request waiting within our mean seek

6861

+-         * distance, let it run. The expire code will check for close

6862

+-         * cooperators and put the close queue at the front of the

6863

+-         * service tree. If possible, merge the expiring queue with the

6864

+-         * new bfqq.

6865

+-         */

6866

+-        new_bfqq = bfq_close_cooperator(bfqd, bfqq);

6867

+-        if (new_bfqq != NULL && bfqq->new_bfqq == NULL)

6868

+-                bfq_setup_merge(bfqq, new_bfqq);

6869

+-

6870

+ 	if (bfq_may_expire_for_budg_timeout(bfqq) &&

6871

+ 	    !timer_pending(&bfqd->idle_slice_timer) &&

6872

+ 	    !bfq_bfqq_must_idle(bfqq))

6873

+@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

6874

+ 				bfq_clear_bfqq_wait_request(bfqq);

6875

+ 				del_timer(&bfqd->idle_slice_timer);

6876

+ 			}

6877

+-			if (new_bfqq == NULL)

6878

+-				goto keep_queue;

6879

+-			else

6880

+-				goto expire;

6881

++			goto keep_queue;

6882

+ 		}

6883

+ 	}

6884

+

6885

+ 	/*

6886

+-	 * No requests pending.  If the in-service queue has no cooperator and

6887

+-	 * still has requests in flight (possibly waiting for a completion)

6888

+-	 * or is idling for a new request, then keep it.

6889

++	 * No requests pending.  If the in-service queue still has requests in

6890

++	 * flight (possibly waiting for a completion) or is idling for a new

6891

++	 * request, then keep it.

6892

+ 	 */

6893

+-	if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||

6894

+-	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {

6895

++	if (timer_pending(&bfqd->idle_slice_timer) ||

6896

++	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {

6897

+ 		bfqq = NULL;

6898

+ 		goto keep_queue;

6899

+-	} else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {

6900

+-		/*

6901

+-		 * Expiring the queue because there is a close cooperator,

6902

+-		 * cancel timer.

6903

+-		 */

6904

+-		bfq_clear_bfqq_wait_request(bfqq);

6905

+-		del_timer(&bfqd->idle_slice_timer);

6906

+ 	}

6907

+

6908

+ 	reason = BFQ_BFQQ_NO_MORE_REQUESTS;

6909

+ expire:

6910

+ 	bfq_bfqq_expire(bfqd, bfqq, 0, reason);

6911

+ new_queue:

6912

+-	bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);

6913

++	bfqq = bfq_set_in_service_queue(bfqd);

6914

+ 	bfq_log(bfqd, "select_queue: new queue %d returned",

6915

+ 		bfqq != NULL ? bfqq->pid : 0);

6916

+ keep_queue:

6917

+@@ -1825,9 +2015,8 @@ keep_queue:

6918

+ static void bfq_update_raising_data(struct bfq_data *bfqd,

6919

+ 				    struct bfq_queue *bfqq)

6920

+ {

6921

++	struct bfq_entity *entity = &bfqq->entity;

6922

+ 	if (bfqq->raising_coeff > 1) { /* queue is being boosted */

6923

+-		struct bfq_entity *entity = &bfqq->entity;

6924

+-

6925

+ 		bfq_log_bfqq(bfqd, bfqq,

6926

+ 			"raising period dur %u/%u msec, "

6927

+ 			"old raising coeff %u, w %d(%d)",

6928

+@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,

6929

+ 			"WARN: pending prio change");

6930

+ 		/*

6931

+ 		 * If too much time has elapsed from the beginning

6932

+-		 * of this weight-raising, stop it.

6933

++		 * of this weight-raising period, stop it.

6934

+ 		 */

6935

+ 		if (time_is_before_jiffies(bfqq->last_rais_start_finish +

6936

+ 					   bfqq->raising_cur_max_time)) {

6937

+@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,

6938

+ 				     jiffies_to_msecs(bfqq->

6939

+ 					raising_cur_max_time));

6940

+ 			bfq_bfqq_end_raising(bfqq);

6941

+-			__bfq_entity_update_weight_prio(

6942

+-				bfq_entity_service_tree(entity),

6943

+-				entity);

6944

+ 		}

6945

+ 	}

6946

++	/* Update weight both if it must be raised and if it must be lowered */

6947

++	if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))

6948

++		__bfq_entity_update_weight_prio(

6949

++			bfq_entity_service_tree(entity),

6950

++			entity);

6951

+ }

6952

+

6953

+ /*

6954

+@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq)

6955

+ 	struct bfq_io_cq *bic = icq_to_bic(icq);

6956

+

6957

+ 	bic->ttime.last_end_request = jiffies;

6958

++	/*

6959

++	 * A newly created bic indicates that the process has just

6960

++	 * started doing I/O, and is probably mapping into memory its

6961

++	 * executable and libraries: it definitely needs weight raising.

6962

++	 * There is however the possibility that the process performs,

6963

++	 * for a while, I/O close to some other process. EQM intercepts

6964

++	 * this behavior and may merge the queue corresponding to the

6965

++	 * process with some other queue, BEFORE the weight of the queue

6966

++	 * is raised. Merged queues are not weight-raised (they are assumed

6967

++	 * to belong to processes that benefit only from high throughput).

6968

++	 * If the merge is basically the consequence of an accident, then

6969

++	 * the queue will be split soon and will get back its old weight.

6970

++	 * It is then important to write down somewhere that this queue

6971

++	 * does need weight raising, even if it did not manage to get its

6972

++	 * weight raised before being merged. To this purpose, we overload

6973

++	 * the field raising_time_left and assign 1 to it, to mark the queue

6974

++	 * as needing weight raising.

6975

++	 */

6976

++	bic->raising_time_left = 1;

6977

+ }

6978

+

6979

+ static void bfq_exit_icq(struct io_cq *icq)

6980

+@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq)

6981

+ 	}

6982

+

6983

+ 	if (bic->bfqq[BLK_RW_SYNC]) {

6984

++		/*

6985

++		 * If the bic is using a shared queue, put the reference

6986

++		 * taken on the io_context when the bic started using a

6987

++		 * shared bfq_queue.

6988

++		 */

6989

++		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))

6990

++			put_io_context(icq->ioc);

6991

+ 		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

6992

+ 		bic->bfqq[BLK_RW_SYNC] = NULL;

6993

+ 	}

6994

+@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,

6995

+ 	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

6996

+ 		return;

6997

+

6998

++	/* Idle window just restored, statistics are meaningless. */

6999

++	if (bfq_bfqq_just_split(bfqq))

7000

++		return;

7001

++

7002

+ 	enable_idle = bfq_bfqq_idle_window(bfqq);

7003

+

7004

+ 	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

7005

+@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

7006

+ 	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

7007

+ 	    !BFQQ_SEEKY(bfqq))

7008

+ 		bfq_update_idle_window(bfqd, bfqq, bic);

7009

++	bfq_clear_bfqq_just_split(bfqq);

7010

+

7011

+ 	bfq_log_bfqq(bfqd, bfqq,

7012

+ 		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

7013

+@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

7014

+ static void bfq_insert_request(struct request_queue *q, struct request *rq)

7015

+ {

7016

+ 	struct bfq_data *bfqd = q->elevator->elevator_data;

7017

+-	struct bfq_queue *bfqq = RQ_BFQQ(rq);

7018

++	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;

7019

+

7020

+ 	assert_spin_locked(bfqd->queue->queue_lock);

7021

++

7022

++	/*

7023

++	 * An unplug may trigger a requeue of a request from the device

7024

++	 * driver: make sure we are in process context while trying to

7025

++	 * merge two bfq_queues.

7026

++	 */

7027

++	if (!in_interrupt()) {

7028

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

7029

++		if (new_bfqq != NULL) {

7030

++			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)

7031

++				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);

7032

++			/*

7033

++			 * Release the request's reference to the old bfqq

7034

++			 * and make sure one is taken to the shared queue.

7035

++			 */

7036

++			new_bfqq->allocated[rq_data_dir(rq)]++;

7037

++			bfqq->allocated[rq_data_dir(rq)]--;

7038

++			atomic_inc(&new_bfqq->ref);

7039

++			bfq_put_queue(bfqq);

7040

++			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)

7041

++				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),

7042

++						bfqq, new_bfqq);

7043

++			rq->elv.priv[1] = new_bfqq;

7044

++			bfqq = new_bfqq;

7045

++		}

7046

++	}

7047

++

7048

+ 	bfq_init_prio_data(bfqq, RQ_BIC(rq));

7049

+

7050

+ 	bfq_add_rq_rb(rq);

7051

+

7052

++	/*

7053

++	 * Here a newly-created bfq_queue has already started a weight-raising

7054

++	 * period: clear raising_time_left to prevent bfq_bfqq_save_state()

7055

++	 * from assigning it a full weight-raising period. See the detailed

7056

++	 * comments about this field in bfq_init_icq().

7057

++	 */

7058

++	if (bfqq->bic != NULL)

7059

++		bfqq->bic->raising_time_left = 0;

7060

+ 	rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);

7061

+ 	list_add_tail(&rq->queuelist, &bfqq->fifo);

7062

+

7063

+@@ -2663,18 +2920,6 @@ static void bfq_put_request(struct request *rq)

7064

+ 	}

7065

+ }

7066

+

7067

+-static struct bfq_queue *

7068

+-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

7069

+-		struct bfq_queue *bfqq)

7070

+-{

7071

+-	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

7072

+-		(long unsigned)bfqq->new_bfqq->pid);

7073

+-	bic_set_bfqq(bic, bfqq->new_bfqq, 1);

7074

+-	bfq_mark_bfqq_coop(bfqq->new_bfqq);

7075

+-	bfq_put_queue(bfqq);

7076

+-	return bic_to_bfqq(bic, 1);

7077

+-}

7078

+-

7079

+ /*

7080

+  * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

7081

+  * was the last process referring to said bfqq.

7082

+@@ -2683,6 +2928,9 @@ static struct bfq_queue *

7083

+ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

7084

+ {

7085

+ 	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

7086

++

7087

++	put_io_context(bic->icq.ioc);

7088

++

7089

+ 	if (bfqq_process_refs(bfqq) == 1) {

7090

+ 		bfqq->pid = current->pid;

7091

+ 		bfq_clear_bfqq_coop(bfqq);

7092

+@@ -2711,6 +2959,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

7093

+ 	struct bfq_queue *bfqq;

7094

+ 	struct bfq_group *bfqg;

7095

+ 	unsigned long flags;

7096

++	bool split = false;

7097

+

7098

+ 	might_sleep_if(gfp_mask & __GFP_WAIT);

7099

+

7100

+@@ -2729,24 +2978,14 @@ new_queue:

7101

+ 		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

7102

+ 		bic_set_bfqq(bic, bfqq, is_sync);

7103

+ 	} else {

7104

+-		/*

7105

+-		 * If the queue was seeky for too long, break it apart.

7106

+-		 */

7107

++		/* If the queue was seeky for too long, break it apart. */

7108

+ 		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

7109

+ 			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

7110

+ 			bfqq = bfq_split_bfqq(bic, bfqq);

7111

++			split = true;

7112

+ 			if (!bfqq)

7113

+ 				goto new_queue;

7114

+ 		}

7115

+-

7116

+-		/*

7117

+-		 * Check to see if this queue is scheduled to merge with

7118

+-		 * another closely cooperating queue. The merging of queues

7119

+-		 * happens here as it must be done in process context.

7120

+-		 * The reference on new_bfqq was taken in merge_bfqqs.

7121

+-		 */

7122

+-		if (bfqq->new_bfqq != NULL)

7123

+-			bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);

7124

+ 	}

7125

+

7126

+ 	bfqq->allocated[rw]++;

7127

+@@ -2757,6 +2996,26 @@ new_queue:

7128

+ 	rq->elv.priv[0] = bic;

7129

+ 	rq->elv.priv[1] = bfqq;

7130

+

7131

++	/*

7132

++	 * If a bfq_queue has only one process reference, it is owned

7133

++	 * by only one bfq_io_cq: we can set the bic field of the

7134

++	 * bfq_queue to the address of that structure. Also, if the

7135

++	 * queue has just been split, mark a flag so that the

7136

++	 * information is available to the other scheduler hooks.

7137

++	 */

7138

++	if (bfqq_process_refs(bfqq) == 1) {

7139

++		bfqq->bic = bic;

7140

++		if (split) {

7141

++			bfq_mark_bfqq_just_split(bfqq);

7142

++			/*

7143

++			 * If the queue has just been split from a shared queue,

7144

++			 * restore the idle window and the possible weight

7145

++			 * raising period.

7146

++			 */

7147

++			bfq_bfqq_resume_state(bfqq, bic);

7148

++		}

7149

++	}

7150

++

7151

+ 	spin_unlock_irqrestore(q->queue_lock, flags);

7152

+

7153

+ 	return 0;

7154

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

7155

+index 999b475..e54ea33 100644

7156

+--- a/block/bfq-sched.c

7157

++++ b/block/bfq-sched.c

7158

+@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

7159

+ 	return bfqq;

7160

+ }

7161

+

7162

+-/*

7163

+- * Forced extraction of the given queue.

7164

+- */

7165

+-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,

7166

+-				      struct bfq_queue *bfqq)

7167

+-{

7168

+-	struct bfq_entity *entity;

7169

+-	struct bfq_sched_data *sd;

7170

+-

7171

+-	BUG_ON(bfqd->in_service_queue != NULL);

7172

+-

7173

+-	entity = &bfqq->entity;

7174

+-	/*

7175

+-	 * Bubble up extraction/update from the leaf to the root.

7176

+-	*/

7177

+-	for_each_entity(entity) {

7178

+-		sd = entity->sched_data;

7179

+-		bfq_update_budget(entity);

7180

+-		bfq_update_vtime(bfq_entity_service_tree(entity));

7181

+-		bfq_active_extract(bfq_entity_service_tree(entity), entity);

7182

+-		sd->active_entity = entity;

7183

+-		sd->next_active = NULL;

7184

+-		entity->service = 0;

7185

+-	}

7186

+-

7187

+-	return;

7188

+-}

7189

+-

7190

+ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)

7191

+ {

7192

+ 	if (bfqd->in_service_bic != NULL) {

7193

+diff --git a/block/bfq.h b/block/bfq.h

7194

+index 3ca8482..c278796 100644

7195

+--- a/block/bfq.h

7196

++++ b/block/bfq.h

7197

+@@ -200,6 +200,8 @@ struct bfq_group;

7198

+  *                      idle to backlogged

7199

+  * @service_from_backlogged: cumulative service received from the @bfq_queue

7200

+  *                           since the last transition from idle to backlogged

7201

++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the

7202

++ *	 queue is shared

7203

+  *

7204

+  * A bfq_queue is a leaf request queue; it can be associated with an io_context

7205

+  * or more, if it is async or shared between cooperating processes. @cgroup

7206

+@@ -243,6 +245,7 @@ struct bfq_queue {

7207

+ 	sector_t last_request_pos;

7208

+

7209

+ 	pid_t pid;

7210

++	struct bfq_io_cq *bic;

7211

+

7212

+ 	/* weight-raising fields */

7213

+ 	unsigned long raising_cur_max_time;

7214

+@@ -272,12 +275,23 @@ struct bfq_ttime {

7215

+  * @icq: associated io_cq structure

7216

+  * @bfqq: array of two process queues, the sync and the async

7217

+  * @ttime: associated @bfq_ttime struct

7218

++ * @raising_time_left: snapshot of the time left before weight raising ends

7219

++ *		       for the sync queue associated with this process; this

7220

++ *		       snapshot is taken to remember this value while the weight

7221

++ *		       raising is suspended because the queue is merged with a

7222

++ *		       shared queue, and is used to set @raising_cur_max_time

7223

++ *		       when the queue is split from the shared queue and its

7224

++ *		       weight is raised again

7225

++ * @saved_idle_window: same purpose as the previous field for the idle window

7226

+  */

7227

+ struct bfq_io_cq {

7228

+ 	struct io_cq icq; /* must be the first member */

7229

+ 	struct bfq_queue *bfqq[2];

7230

+ 	struct bfq_ttime ttime;

7231

+ 	int ioprio;

7232

++

7233

++	unsigned int raising_time_left;

7234

++	unsigned int saved_idle_window;

7235

+ };

7236

+

7237

+ /**

7238

+@@ -418,8 +432,9 @@ enum bfqq_state_flags {

7239

+ 	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

7240

+ 	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

7241

+ 	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

7242

+-	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be splitted */

7243

+-	BFQ_BFQQ_FLAG_softrt_update,	/* needs softrt-next-start update */

7244

++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */

7245

++	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */

7246

++	BFQ_BFQQ_FLAG_softrt_update,	/* may need softrt-next-start update */

7247

+ };

7248

+

7249

+ #define BFQ_BFQQ_FNS(name)						\

7250

+@@ -446,6 +461,7 @@ BFQ_BFQQ_FNS(sync);

7251

+ BFQ_BFQQ_FNS(budget_new);

7252

+ BFQ_BFQQ_FNS(coop);

7253

+ BFQ_BFQQ_FNS(split_coop);

7254

++BFQ_BFQQ_FNS(just_split);

7255

+ BFQ_BFQQ_FNS(softrt_update);

7256

+ #undef BFQ_BFQQ_FNS

7257

+

7258

+--

7259

+1.9.0

7260

+

Gentoo Archives: gentoo-commits