[gentoo-commits] proj/linux-patches:4.7 commit in: / - gentoo-commits

From:	Mike Pagano <mpagano@g.o>
To:	gentoo-commits@l.g.o
Subject:	[gentoo-commits] proj/linux-patches:4.7 commit in: /
Date:	Thu, 28 Jul 2016 00:04:09
Message-Id:	`1469664213.543cfadc9443b9cfdbfea73dfcd2b7eb82dec66e.mpagano@gentoo`

1

commit:     543cfadc9443b9cfdbfea73dfcd2b7eb82dec66e

2

Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>

3

AuthorDate: Thu Jul 28 00:03:33 2016 +0000

4

Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>

5

CommitDate: Thu Jul 28 00:03:33 2016 +0000

6

URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=543cfadc

7

8

BFQ patches for 4.7. See http://algogroup.unimore.it/people/paolo/disk_sched/patches/4.7.0-v8/

9

10

 0000_README                                        |   16 +

11

 ...oups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch |  103 +

12

 ...ntroduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1 | 7097 ++++++++++++++++++++

13

 ...arly-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch | 1101 +++

14

 ...rn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1 | 6361 ++++++++++++++++++

15

 5 files changed, 14678 insertions(+)

16

17

diff --git a/0000_README b/0000_README

18

index 0530209..1b5179e 100644

19

--- a/0000_README

20

+++ b/0000_README

21

@@ -67,6 +67,22 @@ Patch:  5000_enable-additional-cpu-optimizations-for-gcc.patch

22

 From:   https://github.com/graysky2/kernel_gcc_patch/

23

 Desc:   Kernel patch enables gcc < v4.9 optimizations for additional CPUs.

24

25

+Patch:  5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch

26

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

27

+Desc:   BFQ v7r11 patch 1 for 4.7: Build, cgroups and kconfig bits

28

+

29

+Patch:  5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1

30

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

31

+Desc:   BFQ v7r11 patch 2 for 4.7: BFQ Scheduler

32

+

33

+Patch:  5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch

34

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

35

+Desc:   BFQ v7r11 patch 3 for 4.7: Early Queue Merge (EQM)

36

+

37

+Patch:  5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1

38

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

39

+Desc:   BFQ v8 patch 4 for 4.7: Turn BFQ-v7r11 into BFQ-v8

40

+

41

 Patch:  5010_enable-additional-cpu-optimizations-for-gcc-4.9.patch

42

 From:   https://github.com/graysky2/kernel_gcc_patch/

43

 Desc:   Kernel patch enables gcc >= v4.9 optimizations for additional CPUs.

44

45

diff --git a/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch

46

new file mode 100644

47

index 0000000..45d0b07

48

--- /dev/null

49

+++ b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch

50

@@ -0,0 +1,103 @@

51

+From feb58b4dd1e8fd895f28ba4c759e92febe316cb2 Mon Sep 17 00:00:00 2001

52

+From: Paolo Valente <paolo.valente@×××××××.it>

53

+Date: Tue, 7 Apr 2015 13:39:12 +0200

54

+Subject: [PATCH 1/4] block: cgroups, kconfig, build bits for BFQ-v7r11-4.7.0

55

+

56

+Update Kconfig.iosched and do the related Makefile changes to include

57

+kernel configuration options for BFQ. Also increase the number of

58

+policies supported by the blkio controller so that BFQ can add its

59

+own.

60

+

61

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

62

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

63

+---

64

+ block/Kconfig.iosched  | 32 ++++++++++++++++++++++++++++++++

65

+ block/Makefile         |  1 +

66

+ include/linux/blkdev.h |  2 +-

67

+ 3 files changed, 34 insertions(+), 1 deletion(-)

68

+

69

+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched

70

+index 421bef9..0ee5f0f 100644

71

+--- a/block/Kconfig.iosched

72

++++ b/block/Kconfig.iosched

73

+@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED

74

+ 	---help---

75

+ 	  Enable group IO scheduling in CFQ.

76

+

77

++config IOSCHED_BFQ

78

++	tristate "BFQ I/O scheduler"

79

++	default n

80

++	---help---

81

++	  The BFQ I/O scheduler tries to distribute bandwidth among

82

++	  all processes according to their weights.

83

++	  It aims at distributing the bandwidth as desired, independently of

84

++	  the disk parameters and with any workload. It also tries to

85

++	  guarantee low latency to interactive and soft real-time

86

++	  applications. If compiled built-in (saying Y here), BFQ can

87

++	  be configured to support hierarchical scheduling.

88

++

89

++config CGROUP_BFQIO

90

++	bool "BFQ hierarchical scheduling support"

91

++	depends on CGROUPS && IOSCHED_BFQ=y

92

++	default n

93

++	---help---

94

++	  Enable hierarchical scheduling in BFQ, using the cgroups

95

++	  filesystem interface.  The name of the subsystem will be

96

++	  bfqio.

97

++

98

+ choice

99

+ 	prompt "Default I/O scheduler"

100

+ 	default DEFAULT_CFQ

101

+@@ -52,6 +73,16 @@ choice

102

+ 	config DEFAULT_CFQ

103

+ 		bool "CFQ" if IOSCHED_CFQ=y

104

+

105

++	config DEFAULT_BFQ

106

++		bool "BFQ" if IOSCHED_BFQ=y

107

++		help

108

++		  Selects BFQ as the default I/O scheduler which will be

109

++		  used by default for all block devices.

110

++		  The BFQ I/O scheduler aims at distributing the bandwidth

111

++		  as desired, independently of the disk parameters and with

112

++		  any workload. It also tries to guarantee low latency to

113

++		  interactive and soft real-time applications.

114

++

115

+ 	config DEFAULT_NOOP

116

+ 		bool "No-op"

117

+

118

+@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED

119

+ 	string

120

+ 	default "deadline" if DEFAULT_DEADLINE

121

+ 	default "cfq" if DEFAULT_CFQ

122

++	default "bfq" if DEFAULT_BFQ

123

+ 	default "noop" if DEFAULT_NOOP

124

+

125

+ endmenu

126

+diff --git a/block/Makefile b/block/Makefile

127

+index 9eda232..4a36683 100644

128

+--- a/block/Makefile

129

++++ b/block/Makefile

130

+@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o

131

+ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o

132

+ obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o

133

+ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o

134

++obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o

135

+

136

+ obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o

137

+ obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o

138

+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

139

+index 3d9cf32..8d862a0 100644

140

+--- a/include/linux/blkdev.h

141

++++ b/include/linux/blkdev.h

142

+@@ -45,7 +45,7 @@ struct pr_ops;

143

+  * Maximum number of blkcg policies allowed to be registered concurrently.

144

+  * Defined here to simplify include dependency.

145

+  */

146

+-#define BLKCG_MAX_POLS		2

147

++#define BLKCG_MAX_POLS		3

148

+

149

+ struct request;

150

+ typedef void (rq_end_io_fn)(struct request *, int);

151

+--

152

+1.9.1

153

+

154

155

diff --git a/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1 b/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1

156

new file mode 100644

157

index 0000000..8a67a4b

158

--- /dev/null

159

+++ b/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1

160

@@ -0,0 +1,7097 @@

161

+From 1f07b3f666e6da78d10e62cfb9696242e5b3005e Mon Sep 17 00:00:00 2001

162

+From: Paolo Valente <paolo.valente@×××××××.it>

163

+Date: Thu, 9 May 2013 19:10:02 +0200

164

+Subject: [PATCH 2/4] block: introduce the BFQ-v7r11 I/O sched for 4.7.0

165

+

166

+The general structure is borrowed from CFQ, as much of the code for

167

+handling I/O contexts. Over time, several useful features have been

168

+ported from CFQ as well (details in the changelog in README.BFQ). A

169

+(bfq_)queue is associated to each task doing I/O on a device, and each

170

+time a scheduling decision has to be made a queue is selected and served

171

+until it expires.

172

+

173

+    - Slices are given in the service domain: tasks are assigned

174

+      budgets, measured in number of sectors. Once it has got the disk, a task

175

+      must however consume its assigned budget within a configurable

176

+      maximum time (by default, the maximum possible value of the

177

+      budgets is automatically computed to comply with this timeout).

178

+      This allows the desired latency vs "throughput boosting" tradeoff

179

+      to be set.

180

+

181

+    - Budgets are scheduled according to a variant of WF2Q+, implemented

182

+      using an augmented rb-tree to take eligibility into account while

183

+      preserving an O(log N) overall complexity.

184

+

185

+    - A low-latency tunable is provided; if enabled, both interactive

186

+      and soft real-time applications are guaranteed a very low latency.

187

+

188

+    - Latency guarantees are preserved also in the presence of NCQ.

189

+

190

+    - Also with flash-based devices, a high throughput is achieved

191

+      while still preserving latency guarantees.

192

+

193

+    - BFQ features Early Queue Merge (EQM), a sort of fusion of the

194

+      cooperating-queue-merging and the preemption mechanisms present

195

+      in CFQ. EQM is in fact a unified mechanism that tries to get a

196

+      sequential read pattern, and hence a high throughput, with any

197

+      set of processes performing interleaved I/O over a contiguous

198

+      sequence of sectors.

199

+

200

+    - BFQ supports full hierarchical scheduling, exporting a cgroups

201

+      interface.  Since each node has a full scheduler, each group can

202

+      be assigned its own weight.

203

+

204

+    - If the cgroups interface is not used, only I/O priorities can be

205

+      assigned to processes, with ioprio values mapped to weights

206

+      with the relation weight = IOPRIO_BE_NR - ioprio.

207

+

208

+    - ioprio classes are served in strict priority order, i.e., lower

209

+      priority queues are not served as long as there are higher

210

+      priority queues.  Among queues in the same class the bandwidth is

211

+      distributed in proportion to the weight of each queue. A very

212

+      thin extra bandwidth is however guaranteed to the Idle class, to

213

+      prevent it from starving.

214

+

215

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

216

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

217

+---

218

+ block/Kconfig.iosched |    6 +-

219

+ block/bfq-cgroup.c    | 1182 ++++++++++++++++

220

+ block/bfq-ioc.c       |   36 +

221

+ block/bfq-iosched.c   | 3754 +++++++++++++++++++++++++++++++++++++++++++++++++

222

+ block/bfq-sched.c     | 1200 ++++++++++++++++

223

+ block/bfq.h           |  801 +++++++++++

224

+ 6 files changed, 6975 insertions(+), 4 deletions(-)

225

+ create mode 100644 block/bfq-cgroup.c

226

+ create mode 100644 block/bfq-ioc.c

227

+ create mode 100644 block/bfq-iosched.c

228

+ create mode 100644 block/bfq-sched.c

229

+ create mode 100644 block/bfq.h

230

+

231

+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched

232

+index 0ee5f0f..f78cd1a 100644

233

+--- a/block/Kconfig.iosched

234

++++ b/block/Kconfig.iosched

235

+@@ -51,14 +51,12 @@ config IOSCHED_BFQ

236

+ 	  applications. If compiled built-in (saying Y here), BFQ can

237

+ 	  be configured to support hierarchical scheduling.

238

+

239

+-config CGROUP_BFQIO

240

++config BFQ_GROUP_IOSCHED

241

+ 	bool "BFQ hierarchical scheduling support"

242

+ 	depends on CGROUPS && IOSCHED_BFQ=y

243

+ 	default n

244

+ 	---help---

245

+-	  Enable hierarchical scheduling in BFQ, using the cgroups

246

+-	  filesystem interface.  The name of the subsystem will be

247

+-	  bfqio.

248

++	  Enable hierarchical scheduling in BFQ, using the blkio controller.

249

+

250

+ choice

251

+ 	prompt "Default I/O scheduler"

252

+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

253

+new file mode 100644

254

+index 0000000..8610cd6

255

+--- /dev/null

256

++++ b/block/bfq-cgroup.c

257

+@@ -0,0 +1,1182 @@

258

++/*

259

++ * BFQ: CGROUPS support.

260

++ *

261

++ * Based on ideas and code from CFQ:

262

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

263

++ *

264

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

265

++ *		      Paolo Valente <paolo.valente@×××××××.it>

266

++ *

267

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

268

++ *

269

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

270

++ * file.

271

++ */

272

++

273

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

274

++

275

++/* bfqg stats flags */

276

++enum bfqg_stats_flags {

277

++	BFQG_stats_waiting = 0,

278

++	BFQG_stats_idling,

279

++	BFQG_stats_empty,

280

++};

281

++

282

++#define BFQG_FLAG_FNS(name)						\

283

++static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\

284

++{									\

285

++	stats->flags |= (1 << BFQG_stats_##name);			\

286

++}									\

287

++static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\

288

++{									\

289

++	stats->flags &= ~(1 << BFQG_stats_##name);			\

290

++}									\

291

++static int bfqg_stats_##name(struct bfqg_stats *stats)		\

292

++{									\

293

++	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\

294

++}									\

295

++

296

++BFQG_FLAG_FNS(waiting)

297

++BFQG_FLAG_FNS(idling)

298

++BFQG_FLAG_FNS(empty)

299

++#undef BFQG_FLAG_FNS

300

++

301

++/* This should be called with the queue_lock held. */

302

++static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)

303

++{

304

++	unsigned long long now;

305

++

306

++	if (!bfqg_stats_waiting(stats))

307

++		return;

308

++

309

++	now = sched_clock();

310

++	if (time_after64(now, stats->start_group_wait_time))

311

++		blkg_stat_add(&stats->group_wait_time,

312

++			      now - stats->start_group_wait_time);

313

++	bfqg_stats_clear_waiting(stats);

314

++}

315

++

316

++/* This should be called with the queue_lock held. */

317

++static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,

318

++						 struct bfq_group *curr_bfqg)

319

++{

320

++	struct bfqg_stats *stats = &bfqg->stats;

321

++

322

++	if (bfqg_stats_waiting(stats))

323

++		return;

324

++	if (bfqg == curr_bfqg)

325

++		return;

326

++	stats->start_group_wait_time = sched_clock();

327

++	bfqg_stats_mark_waiting(stats);

328

++}

329

++

330

++/* This should be called with the queue_lock held. */

331

++static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)

332

++{

333

++	unsigned long long now;

334

++

335

++	if (!bfqg_stats_empty(stats))

336

++		return;

337

++

338

++	now = sched_clock();

339

++	if (time_after64(now, stats->start_empty_time))

340

++		blkg_stat_add(&stats->empty_time,

341

++			      now - stats->start_empty_time);

342

++	bfqg_stats_clear_empty(stats);

343

++}

344

++

345

++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)

346

++{

347

++	blkg_stat_add(&bfqg->stats.dequeue, 1);

348

++}

349

++

350

++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)

351

++{

352

++	struct bfqg_stats *stats = &bfqg->stats;

353

++

354

++	if (blkg_rwstat_total(&stats->queued))

355

++		return;

356

++

357

++	/*

358

++	 * group is already marked empty. This can happen if bfqq got new

359

++	 * request in parent group and moved to this group while being added

360

++	 * to service tree. Just ignore the event and move on.

361

++	 */

362

++	if (bfqg_stats_empty(stats))

363

++		return;

364

++

365

++	stats->start_empty_time = sched_clock();

366

++	bfqg_stats_mark_empty(stats);

367

++}

368

++

369

++static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)

370

++{

371

++	struct bfqg_stats *stats = &bfqg->stats;

372

++

373

++	if (bfqg_stats_idling(stats)) {

374

++		unsigned long long now = sched_clock();

375

++

376

++		if (time_after64(now, stats->start_idle_time))

377

++			blkg_stat_add(&stats->idle_time,

378

++				      now - stats->start_idle_time);

379

++		bfqg_stats_clear_idling(stats);

380

++	}

381

++}

382

++

383

++static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)

384

++{

385

++	struct bfqg_stats *stats = &bfqg->stats;

386

++

387

++	stats->start_idle_time = sched_clock();

388

++	bfqg_stats_mark_idling(stats);

389

++}

390

++

391

++static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)

392

++{

393

++	struct bfqg_stats *stats = &bfqg->stats;

394

++

395

++	blkg_stat_add(&stats->avg_queue_size_sum,

396

++		      blkg_rwstat_total(&stats->queued));

397

++	blkg_stat_add(&stats->avg_queue_size_samples, 1);

398

++	bfqg_stats_update_group_wait_time(stats);

399

++}

400

++

401

++static struct blkcg_policy blkcg_policy_bfq;

402

++

403

++/*

404

++ * blk-cgroup policy-related handlers

405

++ * The following functions help in converting between blk-cgroup

406

++ * internal structures and BFQ-specific structures.

407

++ */

408

++

409

++static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)

410

++{

411

++	return pd ? container_of(pd, struct bfq_group, pd) : NULL;

412

++}

413

++

414

++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)

415

++{

416

++	return pd_to_blkg(&bfqg->pd);

417

++}

418

++

419

++static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)

420

++{

421

++	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);

422

++	BUG_ON(!pd);

423

++	return pd_to_bfqg(pd);

424

++}

425

++

426

++/*

427

++ * bfq_group handlers

428

++ * The following functions help in navigating the bfq_group hierarchy

429

++ * by allowing to find the parent of a bfq_group or the bfq_group

430

++ * associated to a bfq_queue.

431

++ */

432

++

433

++static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)

434

++{

435

++	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;

436

++

437

++	return pblkg ? blkg_to_bfqg(pblkg) : NULL;

438

++}

439

++

440

++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)

441

++{

442

++	struct bfq_entity *group_entity = bfqq->entity.parent;

443

++

444

++	return group_entity ? container_of(group_entity, struct bfq_group,

445

++					   entity) :

446

++			      bfqq->bfqd->root_group;

447

++}

448

++

449

++/*

450

++ * The following two functions handle get and put of a bfq_group by

451

++ * wrapping the related blk-cgroup hooks.

452

++ */

453

++

454

++static void bfqg_get(struct bfq_group *bfqg)

455

++{

456

++	return blkg_get(bfqg_to_blkg(bfqg));

457

++}

458

++

459

++static void bfqg_put(struct bfq_group *bfqg)

460

++{

461

++	return blkg_put(bfqg_to_blkg(bfqg));

462

++}

463

++

464

++static void bfqg_stats_update_io_add(struct bfq_group *bfqg,

465

++				     struct bfq_queue *bfqq,

466

++				     int rw)

467

++{

468

++	blkg_rwstat_add(&bfqg->stats.queued, rw, 1);

469

++	bfqg_stats_end_empty_time(&bfqg->stats);

470

++	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))

471

++		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));

472

++}

473

++

474

++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw)

475

++{

476

++	blkg_rwstat_add(&bfqg->stats.queued, rw, -1);

477

++}

478

++

479

++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)

480

++{

481

++	blkg_rwstat_add(&bfqg->stats.merged, rw, 1);

482

++}

483

++

484

++static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,

485

++					      uint64_t bytes, int rw)

486

++{

487

++	blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);

488

++	blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);

489

++	blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);

490

++}

491

++

492

++static void bfqg_stats_update_completion(struct bfq_group *bfqg,

493

++			uint64_t start_time, uint64_t io_start_time, int rw)

494

++{

495

++	struct bfqg_stats *stats = &bfqg->stats;

496

++	unsigned long long now = sched_clock();

497

++

498

++	if (time_after64(now, io_start_time))

499

++		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);

500

++	if (time_after64(io_start_time, start_time))

501

++		blkg_rwstat_add(&stats->wait_time, rw,

502

++				io_start_time - start_time);

503

++}

504

++

505

++/* @stats = 0 */

506

++static void bfqg_stats_reset(struct bfqg_stats *stats)

507

++{

508

++	if (!stats)

509

++		return;

510

++

511

++	/* queued stats shouldn't be cleared */

512

++	blkg_rwstat_reset(&stats->service_bytes);

513

++	blkg_rwstat_reset(&stats->serviced);

514

++	blkg_rwstat_reset(&stats->merged);

515

++	blkg_rwstat_reset(&stats->service_time);

516

++	blkg_rwstat_reset(&stats->wait_time);

517

++	blkg_stat_reset(&stats->time);

518

++	blkg_stat_reset(&stats->unaccounted_time);

519

++	blkg_stat_reset(&stats->avg_queue_size_sum);

520

++	blkg_stat_reset(&stats->avg_queue_size_samples);

521

++	blkg_stat_reset(&stats->dequeue);

522

++	blkg_stat_reset(&stats->group_wait_time);

523

++	blkg_stat_reset(&stats->idle_time);

524

++	blkg_stat_reset(&stats->empty_time);

525

++}

526

++

527

++/* @to += @from */

528

++static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)

529

++{

530

++	if (!to || !from)

531

++		return;

532

++

533

++	/* queued and sectors are not merged */

534

++	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);

535

++	blkg_rwstat_add_aux(&to->serviced, &from->serviced);

536

++	blkg_rwstat_add_aux(&to->merged, &from->merged);

537

++	blkg_rwstat_add_aux(&to->service_time, &from->service_time);

538

++	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);

539

++	blkg_stat_add_aux(&to->time, &from->time);

540

++	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);

541

++	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);

542

++	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);

543

++	blkg_stat_add_aux(&to->dequeue, &from->dequeue);

544

++	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);

545

++	blkg_stat_add_aux(&to->idle_time, &from->idle_time);

546

++	blkg_stat_add_aux(&to->empty_time, &from->empty_time);

547

++}

548

++

549

++/*

550

++ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'

551

++ * recursive stats can still account for the amount used by this bfqg after

552

++ * it's gone.

553

++ */

554

++static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)

555

++{

556

++	struct bfq_group *parent;

557

++

558

++	if (!bfqg) /* root_group */

559

++		return;

560

++

561

++	parent = bfqg_parent(bfqg);

562

++

563

++	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);

564

++

565

++	if (unlikely(!parent))

566

++		return;

567

++

568

++	bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);

569

++	bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);

570

++	bfqg_stats_reset(&bfqg->stats);

571

++	bfqg_stats_reset(&bfqg->dead_stats);

572

++}

573

++

574

++static void bfq_init_entity(struct bfq_entity *entity,

575

++			    struct bfq_group *bfqg)

576

++{

577

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

578

++

579

++	entity->weight = entity->new_weight;

580

++	entity->orig_weight = entity->new_weight;

581

++	if (bfqq) {

582

++		bfqq->ioprio = bfqq->new_ioprio;

583

++		bfqq->ioprio_class = bfqq->new_ioprio_class;

584

++		bfqg_get(bfqg);

585

++	}

586

++	entity->parent = bfqg->my_entity;

587

++	entity->sched_data = &bfqg->sched_data;

588

++}

589

++

590

++static void bfqg_stats_exit(struct bfqg_stats *stats)

591

++{

592

++	blkg_rwstat_exit(&stats->service_bytes);

593

++	blkg_rwstat_exit(&stats->serviced);

594

++	blkg_rwstat_exit(&stats->merged);

595

++	blkg_rwstat_exit(&stats->service_time);

596

++	blkg_rwstat_exit(&stats->wait_time);

597

++	blkg_rwstat_exit(&stats->queued);

598

++	blkg_stat_exit(&stats->sectors);

599

++	blkg_stat_exit(&stats->time);

600

++	blkg_stat_exit(&stats->unaccounted_time);

601

++	blkg_stat_exit(&stats->avg_queue_size_sum);

602

++	blkg_stat_exit(&stats->avg_queue_size_samples);

603

++	blkg_stat_exit(&stats->dequeue);

604

++	blkg_stat_exit(&stats->group_wait_time);

605

++	blkg_stat_exit(&stats->idle_time);

606

++	blkg_stat_exit(&stats->empty_time);

607

++}

608

++

609

++static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)

610

++{

611

++	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||

612

++	    blkg_rwstat_init(&stats->serviced, gfp) ||

613

++	    blkg_rwstat_init(&stats->merged, gfp) ||

614

++	    blkg_rwstat_init(&stats->service_time, gfp) ||

615

++	    blkg_rwstat_init(&stats->wait_time, gfp) ||

616

++	    blkg_rwstat_init(&stats->queued, gfp) ||

617

++	    blkg_stat_init(&stats->sectors, gfp) ||

618

++	    blkg_stat_init(&stats->time, gfp) ||

619

++	    blkg_stat_init(&stats->unaccounted_time, gfp) ||

620

++	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||

621

++	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||

622

++	    blkg_stat_init(&stats->dequeue, gfp) ||

623

++	    blkg_stat_init(&stats->group_wait_time, gfp) ||

624

++	    blkg_stat_init(&stats->idle_time, gfp) ||

625

++	    blkg_stat_init(&stats->empty_time, gfp)) {

626

++		bfqg_stats_exit(stats);

627

++		return -ENOMEM;

628

++	}

629

++

630

++	return 0;

631

++}

632

++

633

++static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)

634

++ {

635

++	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;

636

++ }

637

++

638

++static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)

639

++{

640

++	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));

641

++}

642

++

643

++static void bfq_cpd_init(struct blkcg_policy_data *cpd)

644

++{

645

++	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

646

++

647

++	d->weight = BFQ_DEFAULT_GRP_WEIGHT;

648

++}

649

++

650

++static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)

651

++{

652

++	struct bfq_group *bfqg;

653

++

654

++	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);

655

++	if (!bfqg)

656

++		return NULL;

657

++

658

++	if (bfqg_stats_init(&bfqg->stats, gfp) ||

659

++	    bfqg_stats_init(&bfqg->dead_stats, gfp)) {

660

++		kfree(bfqg);

661

++		return NULL;

662

++	}

663

++

664

++	return &bfqg->pd;

665

++}

666

++

667

++static void bfq_group_set_parent(struct bfq_group *bfqg,

668

++					struct bfq_group *parent)

669

++{

670

++	struct bfq_entity *entity;

671

++

672

++	BUG_ON(!parent);

673

++	BUG_ON(!bfqg);

674

++	BUG_ON(bfqg == parent);

675

++

676

++	entity = &bfqg->entity;

677

++	entity->parent = parent->my_entity;

678

++	entity->sched_data = &parent->sched_data;

679

++}

680

++

681

++static void bfq_pd_init(struct blkg_policy_data *pd)

682

++{

683

++	struct blkcg_gq *blkg = pd_to_blkg(pd);

684

++	struct bfq_group *bfqg = blkg_to_bfqg(blkg);

685

++	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;

686

++	struct bfq_entity *entity = &bfqg->entity;

687

++	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

688

++

689

++	entity->orig_weight = entity->weight = entity->new_weight = d->weight;

690

++	entity->my_sched_data = &bfqg->sched_data;

691

++	bfqg->my_entity = entity; /*

692

++				   * the root_group's will be set to NULL

693

++				   * in bfq_init_queue()

694

++				   */

695

++	bfqg->bfqd = bfqd;

696

++	bfqg->active_entities = 0;

697

++}

698

++

699

++static void bfq_pd_free(struct blkg_policy_data *pd)

700

++{

701

++	struct bfq_group *bfqg = pd_to_bfqg(pd);

702

++

703

++	bfqg_stats_exit(&bfqg->stats);

704

++	bfqg_stats_exit(&bfqg->dead_stats);

705

++

706

++	return kfree(bfqg);

707

++}

708

++

709

++/* offset delta from bfqg->stats to bfqg->dead_stats */

710

++static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -

711

++					offsetof(struct bfq_group, stats);

712

++

713

++/* to be used by recursive prfill, sums live and dead stats recursively */

714

++static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)

715

++{

716

++	u64 sum = 0;

717

++

718

++	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);

719

++	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,

720

++				       off + dead_stats_off_delta);

721

++	return sum;

722

++}

723

++

724

++/* to be used by recursive prfill, sums live and dead rwstats recursively */

725

++static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,

726

++						       int off)

727

++{

728

++	struct blkg_rwstat a, b;

729

++

730

++	a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);

731

++	b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,

732

++				      off + dead_stats_off_delta);

733

++	blkg_rwstat_add_aux(&a, &b);

734

++	return a;

735

++}

736

++

737

++static void bfq_pd_reset_stats(struct blkg_policy_data *pd)

738

++{

739

++	struct bfq_group *bfqg = pd_to_bfqg(pd);

740

++

741

++	bfqg_stats_reset(&bfqg->stats);

742

++	bfqg_stats_reset(&bfqg->dead_stats);

743

++}

744

++

745

++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

746

++					      struct blkcg *blkcg)

747

++{

748

++	struct request_queue *q = bfqd->queue;

749

++	struct bfq_group *bfqg = NULL, *parent;

750

++	struct bfq_entity *entity = NULL;

751

++

752

++	assert_spin_locked(bfqd->queue->queue_lock);

753

++

754

++	/* avoid lookup for the common case where there's no blkcg */

755

++	if (blkcg == &blkcg_root) {

756

++		bfqg = bfqd->root_group;

757

++	} else {

758

++		struct blkcg_gq *blkg;

759

++

760

++		blkg = blkg_lookup_create(blkcg, q);

761

++		if (!IS_ERR(blkg))

762

++			bfqg = blkg_to_bfqg(blkg);

763

++		else /* fallback to root_group */

764

++			bfqg = bfqd->root_group;

765

++	}

766

++

767

++	BUG_ON(!bfqg);

768

++

769

++	/*

770

++	 * Update chain of bfq_groups as we might be handling a leaf group

771

++	 * which, along with some of its relatives, has not been hooked yet

772

++	 * to the private hierarchy of BFQ.

773

++	 */

774

++	entity = &bfqg->entity;

775

++	for_each_entity(entity) {

776

++		bfqg = container_of(entity, struct bfq_group, entity);

777

++		BUG_ON(!bfqg);

778

++		if (bfqg != bfqd->root_group) {

779

++			parent = bfqg_parent(bfqg);

780

++			if (!parent)

781

++				parent = bfqd->root_group;

782

++			BUG_ON(!parent);

783

++			bfq_group_set_parent(bfqg, parent);

784

++		}

785

++	}

786

++

787

++	return bfqg;

788

++}

789

++

790

++/**

791

++ * bfq_bfqq_move - migrate @bfqq to @bfqg.

792

++ * @bfqd: queue descriptor.

793

++ * @bfqq: the queue to move.

794

++ * @entity: @bfqq's entity.

795

++ * @bfqg: the group to move to.

796

++ *

797

++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating

798

++ * it on the new one.  Avoid putting the entity on the old group idle tree.

799

++ *

800

++ * Must be called under the queue lock; the cgroup owning @bfqg must

801

++ * not disappear (by now this just means that we are called under

802

++ * rcu_read_lock()).

803

++ */

804

++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

805

++			  struct bfq_entity *entity, struct bfq_group *bfqg)

806

++{

807

++	int busy, resume;

808

++

809

++	busy = bfq_bfqq_busy(bfqq);

810

++	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);

811

++

812

++	BUG_ON(resume && !entity->on_st);

813

++	BUG_ON(busy && !resume && entity->on_st &&

814

++	       bfqq != bfqd->in_service_queue);

815

++

816

++	if (busy) {

817

++		BUG_ON(atomic_read(&bfqq->ref) < 2);

818

++

819

++		if (!resume)

820

++			bfq_del_bfqq_busy(bfqd, bfqq, 0);

821

++		else

822

++			bfq_deactivate_bfqq(bfqd, bfqq, 0);

823

++	} else if (entity->on_st)

824

++		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);

825

++	bfqg_put(bfqq_group(bfqq));

826

++

827

++	/*

828

++	 * Here we use a reference to bfqg.  We don't need a refcounter

829

++	 * as the cgroup reference will not be dropped, so that its

830

++	 * destroy() callback will not be invoked.

831

++	 */

832

++	entity->parent = bfqg->my_entity;

833

++	entity->sched_data = &bfqg->sched_data;

834

++	bfqg_get(bfqg);

835

++

836

++	if (busy) {

837

++		if (resume)

838

++			bfq_activate_bfqq(bfqd, bfqq);

839

++	}

840

++

841

++	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)

842

++		bfq_schedule_dispatch(bfqd);

843

++}

844

++

845

++/**

846

++ * __bfq_bic_change_cgroup - move @bic to @blkcg.

847

++ * @bfqd: the queue descriptor.

848

++ * @bic: the bic to move.

849

++ * @blkcg: the blk-cgroup to move to.

850

++ *

851

++ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller

852

++ * has to make sure that the reference to cgroup is valid across the call.

853

++ *

854

++ * NOTE: an alternative approach might have been to store the current

855

++ * cgroup in bfqq and get a reference to it, reducing the lookup

856

++ * time here, at the price of slightly more complex code.

857

++ */

858

++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,

859

++						struct bfq_io_cq *bic,

860

++						struct blkcg *blkcg)

861

++{

862

++	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);

863

++	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);

864

++	struct bfq_group *bfqg;

865

++	struct bfq_entity *entity;

866

++

867

++	lockdep_assert_held(bfqd->queue->queue_lock);

868

++

869

++	bfqg = bfq_find_alloc_group(bfqd, blkcg);

870

++	if (async_bfqq) {

871

++		entity = &async_bfqq->entity;

872

++

873

++		if (entity->sched_data != &bfqg->sched_data) {

874

++			bic_set_bfqq(bic, NULL, 0);

875

++			bfq_log_bfqq(bfqd, async_bfqq,

876

++				     "bic_change_group: %p %d",

877

++				     async_bfqq, atomic_read(&async_bfqq->ref));

878

++			bfq_put_queue(async_bfqq);

879

++		}

880

++	}

881

++

882

++	if (sync_bfqq) {

883

++		entity = &sync_bfqq->entity;

884

++		if (entity->sched_data != &bfqg->sched_data)

885

++			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);

886

++	}

887

++

888

++	return bfqg;

889

++}

890

++

891

++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)

892

++{

893

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

894

++	struct blkcg *blkcg;

895

++	struct bfq_group *bfqg = NULL;

896

++	uint64_t id;

897

++

898

++	rcu_read_lock();

899

++	blkcg = bio_blkcg(bio);

900

++	id = blkcg->css.serial_nr;

901

++	rcu_read_unlock();

902

++

903

++	/*

904

++	 * Check whether blkcg has changed.  The condition may trigger

905

++	 * spuriously on a newly created cic but there's no harm.

906

++	 */

907

++	if (unlikely(!bfqd) || likely(bic->blkcg_id == id))

908

++		return;

909

++

910

++	bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);

911

++	BUG_ON(!bfqg);

912

++	bic->blkcg_id = id;

913

++}

914

++

915

++/**

916

++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.

917

++ * @st: the service tree being flushed.

918

++ */

919

++static void bfq_flush_idle_tree(struct bfq_service_tree *st)

920

++{

921

++	struct bfq_entity *entity = st->first_idle;

922

++

923

++	for (; entity ; entity = st->first_idle)

924

++		__bfq_deactivate_entity(entity, 0);

925

++}

926

++

927

++/**

928

++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.

929

++ * @bfqd: the device data structure with the root group.

930

++ * @entity: the entity to move.

931

++ */

932

++static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,

933

++				     struct bfq_entity *entity)

934

++{

935

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

936

++

937

++	BUG_ON(!bfqq);

938

++	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);

939

++	return;

940

++}

941

++

942

++/**

943

++ * bfq_reparent_active_entities - move to the root group all active

944

++ *                                entities.

945

++ * @bfqd: the device data structure with the root group.

946

++ * @bfqg: the group to move from.

947

++ * @st: the service tree with the entities.

948

++ *

949

++ * Needs queue_lock to be taken and reference to be valid over the call.

950

++ */

951

++static void bfq_reparent_active_entities(struct bfq_data *bfqd,

952

++					 struct bfq_group *bfqg,

953

++					 struct bfq_service_tree *st)

954

++{

955

++	struct rb_root *active = &st->active;

956

++	struct bfq_entity *entity = NULL;

957

++

958

++	if (!RB_EMPTY_ROOT(&st->active))

959

++		entity = bfq_entity_of(rb_first(active));

960

++

961

++	for (; entity ; entity = bfq_entity_of(rb_first(active)))

962

++		bfq_reparent_leaf_entity(bfqd, entity);

963

++

964

++	if (bfqg->sched_data.in_service_entity)

965

++		bfq_reparent_leaf_entity(bfqd,

966

++			bfqg->sched_data.in_service_entity);

967

++

968

++	return;

969

++}

970

++

971

++/**

972

++ * bfq_pd_offline - take offline the group associated with @pd.

973

++ * @pd: policy data of the group being taken offline.

974

++ *

975

++ * Offline the group of @pd, making sure it is not referenced from its parent.

976

++ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic

977

++ */

978

++static void bfq_pd_offline(struct blkg_policy_data *pd)

979

++{

980

++	struct bfq_service_tree *st;

981

++	struct bfq_group *bfqg;

982

++	struct bfq_data *bfqd;

983

++	struct bfq_entity *entity;

984

++	int i;

985

++

986

++	BUG_ON(!pd);

987

++	bfqg = pd_to_bfqg(pd);

988

++	BUG_ON(!bfqg);

989

++	bfqd = bfqg->bfqd;

990

++	BUG_ON(bfqd && !bfqd->root_group);

991

++

992

++	entity = bfqg->my_entity;

993

++

994

++	if (!entity) /* root group */

995

++		return;

996

++

997

++	/*

998

++	 * Empty all service_trees belonging to this group before

999

++	 * deactivating the group itself.

1000

++	 */

1001

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {

1002

++		BUG_ON(!bfqg->sched_data.service_tree);

1003

++		st = bfqg->sched_data.service_tree + i;

1004

++		/*

1005

++		 * The idle tree may still contain bfq_queues belonging

1006

++		 * to exited tasks because they never migrated to a different

1007

++		 * cgroup from the one being destroyed now.  No one else

1008

++		 * can access them so it's safe to act without any lock.

1009

++		 */

1010

++		bfq_flush_idle_tree(st);

1011

++

1012

++		/*

1013

++		 * It may happen that some queues are still active

1014

++		 * (busy) upon group destruction (if the corresponding

1015

++		 * processes have been forced to terminate). We move

1016

++		 * all the leaf entities corresponding to these queues

1017

++		 * to the root_group.

1018

++		 * Also, it may happen that the group has an entity

1019

++		 * in service, which is disconnected from the active

1020

++		 * tree: it must be moved, too.

1021

++		 * There is no need to put the sync queues, as the

1022

++		 * scheduler has taken no reference.

1023

++		 */

1024

++		bfq_reparent_active_entities(bfqd, bfqg, st);

1025

++		BUG_ON(!RB_EMPTY_ROOT(&st->active));

1026

++		BUG_ON(!RB_EMPTY_ROOT(&st->idle));

1027

++	}

1028

++	BUG_ON(bfqg->sched_data.next_in_service);

1029

++	BUG_ON(bfqg->sched_data.in_service_entity);

1030

++

1031

++	__bfq_deactivate_entity(entity, 0);

1032

++	bfq_put_async_queues(bfqd, bfqg);

1033

++	BUG_ON(entity->tree);

1034

++

1035

++	bfqg_stats_xfer_dead(bfqg);

1036

++}

1037

++

1038

++static void bfq_end_wr_async(struct bfq_data *bfqd)

1039

++{

1040

++	struct blkcg_gq *blkg;

1041

++

1042

++	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {

1043

++		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

1044

++

1045

++		bfq_end_wr_async_queues(bfqd, bfqg);

1046

++	}

1047

++	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

1048

++}

1049

++

1050

++static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,

1051

++				       struct cftype *cftype)

1052

++{

1053

++	struct blkcg *blkcg = css_to_blkcg(css);

1054

++	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

1055

++	int ret = -EINVAL;

1056

++

1057

++	spin_lock_irq(&blkcg->lock);

1058

++	ret = bfqgd->weight;

1059

++	spin_unlock_irq(&blkcg->lock);

1060

++

1061

++	return ret;

1062

++}

1063

++

1064

++static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v)

1065

++{

1066

++	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));

1067

++	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

1068

++

1069

++	spin_lock_irq(&blkcg->lock);

1070

++	seq_printf(sf, "%u\n", bfqgd->weight);

1071

++	spin_unlock_irq(&blkcg->lock);

1072

++

1073

++	return 0;

1074

++}

1075

++

1076

++static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,

1077

++					struct cftype *cftype,

1078

++					u64 val)

1079

++{

1080

++	struct blkcg *blkcg = css_to_blkcg(css);

1081

++	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

1082

++	struct blkcg_gq *blkg;

1083

++	int ret = -EINVAL;

1084

++

1085

++	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)

1086

++		return ret;

1087

++

1088

++	ret = 0;

1089

++	spin_lock_irq(&blkcg->lock);

1090

++	bfqgd->weight = (unsigned short)val;

1091

++	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {

1092

++		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

1093

++		if (!bfqg)

1094

++			continue;

1095

++		/*

1096

++		 * Setting the prio_changed flag of the entity

1097

++		 * to 1 with new_weight == weight would re-set

1098

++		 * the value of the weight to its ioprio mapping.

1099

++		 * Set the flag only if necessary.

1100

++		 */

1101

++		if ((unsigned short)val != bfqg->entity.new_weight) {

1102

++			bfqg->entity.new_weight = (unsigned short)val;

1103

++			/*

1104

++			 * Make sure that the above new value has been

1105

++			 * stored in bfqg->entity.new_weight before

1106

++			 * setting the prio_changed flag. In fact,

1107

++			 * this flag may be read asynchronously (in

1108

++			 * critical sections protected by a different

1109

++			 * lock than that held here), and finding this

1110

++			 * flag set may cause the execution of the code

1111

++			 * for updating parameters whose value may

1112

++			 * depend also on bfqg->entity.new_weight (in

1113

++			 * __bfq_entity_update_weight_prio).

1114

++			 * This barrier makes sure that the new value

1115

++			 * of bfqg->entity.new_weight is correctly

1116

++			 * seen in that code.

1117

++			 */

1118

++			smp_wmb();

1119

++			bfqg->entity.prio_changed = 1;

1120

++		}

1121

++	}

1122

++	spin_unlock_irq(&blkcg->lock);

1123

++

1124

++	return ret;

1125

++}

1126

++

1127

++static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of,

1128

++					     char *buf, size_t nbytes,

1129

++					     loff_t off)

1130

++{

1131

++	/* First unsigned long found in the file is used */

1132

++	return bfqio_cgroup_weight_write(of_css(of), NULL,

1133

++					 simple_strtoull(strim(buf), NULL, 0));

1134

++}

1135

++

1136

++static int bfqg_print_stat(struct seq_file *sf, void *v)

1137

++{

1138

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,

1139

++			  &blkcg_policy_bfq, seq_cft(sf)->private, false);

1140

++	return 0;

1141

++}

1142

++

1143

++static int bfqg_print_rwstat(struct seq_file *sf, void *v)

1144

++{

1145

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,

1146

++			  &blkcg_policy_bfq, seq_cft(sf)->private, true);

1147

++	return 0;

1148

++}

1149

++

1150

++static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,

1151

++				      struct blkg_policy_data *pd, int off)

1152

++{

1153

++	u64 sum = bfqg_stat_pd_recursive_sum(pd, off);

1154

++

1155

++	return __blkg_prfill_u64(sf, pd, sum);

1156

++}

1157

++

1158

++static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,

1159

++					struct blkg_policy_data *pd, int off)

1160

++{

1161

++	struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);

1162

++

1163

++	return __blkg_prfill_rwstat(sf, pd, &sum);

1164

++}

1165

++

1166

++static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)

1167

++{

1168

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),

1169

++			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,

1170

++			  seq_cft(sf)->private, false);

1171

++	return 0;

1172

++}

1173

++

1174

++static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)

1175

++{

1176

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),

1177

++			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,

1178

++			  seq_cft(sf)->private, true);

1179

++	return 0;

1180

++}

1181

++

1182

++static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,

1183

++				      struct blkg_policy_data *pd, int off)

1184

++{

1185

++	struct bfq_group *bfqg = pd_to_bfqg(pd);

1186

++	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);

1187

++	u64 v = 0;

1188

++

1189

++	if (samples) {

1190

++		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);

1191

++		v = div64_u64(v, samples);

1192

++	}

1193

++	__blkg_prfill_u64(sf, pd, v);

1194

++	return 0;

1195

++}

1196

++

1197

++/* print avg_queue_size */

1198

++static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)

1199

++{

1200

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),

1201

++			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,

1202

++			  0, false);

1203

++	return 0;

1204

++}

1205

++

1206

++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)

1207

++{

1208

++	int ret;

1209

++

1210

++	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);

1211

++	if (ret)

1212

++		return NULL;

1213

++

1214

++        return blkg_to_bfqg(bfqd->queue->root_blkg);

1215

++}

1216

++

1217

++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)

1218

++{

1219

++        struct bfq_group_data *bgd;

1220

++

1221

++        bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);

1222

++        if (!bgd)

1223

++                return NULL;

1224

++        return &bgd->pd;

1225

++}

1226

++

1227

++static void bfq_cpd_free(struct blkcg_policy_data *cpd)

1228

++{

1229

++        kfree(cpd_to_bfqgd(cpd));

1230

++}

1231

++

1232

++static struct cftype bfqio_files_dfl[] = {

1233

++	{

1234

++		.name = "weight",

1235

++		.flags = CFTYPE_NOT_ON_ROOT,

1236

++		.seq_show = bfqio_cgroup_weight_read_dfl,

1237

++		.write = bfqio_cgroup_weight_write_dfl,

1238

++	},

1239

++	{} /* terminate */

1240

++};

1241

++

1242

++static struct cftype bfqio_files[] = {

1243

++	{

1244

++		.name = "bfq.weight",

1245

++		.read_u64 = bfqio_cgroup_weight_read,

1246

++		.write_u64 = bfqio_cgroup_weight_write,

1247

++	},

1248

++	/* statistics, cover only the tasks in the bfqg */

1249

++	{

1250

++		.name = "bfq.time",

1251

++		.private = offsetof(struct bfq_group, stats.time),

1252

++		.seq_show = bfqg_print_stat,

1253

++	},

1254

++	{

1255

++		.name = "bfq.sectors",

1256

++		.private = offsetof(struct bfq_group, stats.sectors),

1257

++		.seq_show = bfqg_print_stat,

1258

++	},

1259

++	{

1260

++		.name = "bfq.io_service_bytes",

1261

++		.private = offsetof(struct bfq_group, stats.service_bytes),

1262

++		.seq_show = bfqg_print_rwstat,

1263

++	},

1264

++	{

1265

++		.name = "bfq.io_serviced",

1266

++		.private = offsetof(struct bfq_group, stats.serviced),

1267

++		.seq_show = bfqg_print_rwstat,

1268

++	},

1269

++	{

1270

++		.name = "bfq.io_service_time",

1271

++		.private = offsetof(struct bfq_group, stats.service_time),

1272

++		.seq_show = bfqg_print_rwstat,

1273

++	},

1274

++	{

1275

++		.name = "bfq.io_wait_time",

1276

++		.private = offsetof(struct bfq_group, stats.wait_time),

1277

++		.seq_show = bfqg_print_rwstat,

1278

++	},

1279

++	{

1280

++		.name = "bfq.io_merged",

1281

++		.private = offsetof(struct bfq_group, stats.merged),

1282

++		.seq_show = bfqg_print_rwstat,

1283

++	},

1284

++	{

1285

++		.name = "bfq.io_queued",

1286

++		.private = offsetof(struct bfq_group, stats.queued),

1287

++		.seq_show = bfqg_print_rwstat,

1288

++	},

1289

++

1290

++	/* the same statistics which cover the bfqg and its descendants */

1291

++	{

1292

++		.name = "bfq.time_recursive",

1293

++		.private = offsetof(struct bfq_group, stats.time),

1294

++		.seq_show = bfqg_print_stat_recursive,

1295

++	},

1296

++	{

1297

++		.name = "bfq.sectors_recursive",

1298

++		.private = offsetof(struct bfq_group, stats.sectors),

1299

++		.seq_show = bfqg_print_stat_recursive,

1300

++	},

1301

++	{

1302

++		.name = "bfq.io_service_bytes_recursive",

1303

++		.private = offsetof(struct bfq_group, stats.service_bytes),

1304

++		.seq_show = bfqg_print_rwstat_recursive,

1305

++	},

1306

++	{

1307

++		.name = "bfq.io_serviced_recursive",

1308

++		.private = offsetof(struct bfq_group, stats.serviced),

1309

++		.seq_show = bfqg_print_rwstat_recursive,

1310

++	},

1311

++	{

1312

++		.name = "bfq.io_service_time_recursive",

1313

++		.private = offsetof(struct bfq_group, stats.service_time),

1314

++		.seq_show = bfqg_print_rwstat_recursive,

1315

++	},

1316

++	{

1317

++		.name = "bfq.io_wait_time_recursive",

1318

++		.private = offsetof(struct bfq_group, stats.wait_time),

1319

++		.seq_show = bfqg_print_rwstat_recursive,

1320

++	},

1321

++	{

1322

++		.name = "bfq.io_merged_recursive",

1323

++		.private = offsetof(struct bfq_group, stats.merged),

1324

++		.seq_show = bfqg_print_rwstat_recursive,

1325

++	},

1326

++	{

1327

++		.name = "bfq.io_queued_recursive",

1328

++		.private = offsetof(struct bfq_group, stats.queued),

1329

++		.seq_show = bfqg_print_rwstat_recursive,

1330

++	},

1331

++	{

1332

++		.name = "bfq.avg_queue_size",

1333

++		.seq_show = bfqg_print_avg_queue_size,

1334

++	},

1335

++	{

1336

++		.name = "bfq.group_wait_time",

1337

++		.private = offsetof(struct bfq_group, stats.group_wait_time),

1338

++		.seq_show = bfqg_print_stat,

1339

++	},

1340

++	{

1341

++		.name = "bfq.idle_time",

1342

++		.private = offsetof(struct bfq_group, stats.idle_time),

1343

++		.seq_show = bfqg_print_stat,

1344

++	},

1345

++	{

1346

++		.name = "bfq.empty_time",

1347

++		.private = offsetof(struct bfq_group, stats.empty_time),

1348

++		.seq_show = bfqg_print_stat,

1349

++	},

1350

++	{

1351

++		.name = "bfq.dequeue",

1352

++		.private = offsetof(struct bfq_group, stats.dequeue),

1353

++		.seq_show = bfqg_print_stat,

1354

++	},

1355

++	{

1356

++		.name = "bfq.unaccounted_time",

1357

++		.private = offsetof(struct bfq_group, stats.unaccounted_time),

1358

++		.seq_show = bfqg_print_stat,

1359

++	},

1360

++	{ }	/* terminate */

1361

++};

1362

++

1363

++static struct blkcg_policy blkcg_policy_bfq = {

1364

++       .dfl_cftypes            = bfqio_files_dfl,

1365

++       .legacy_cftypes         = bfqio_files,

1366

++

1367

++       .pd_alloc_fn            = bfq_pd_alloc,

1368

++       .pd_init_fn             = bfq_pd_init,

1369

++       .pd_offline_fn          = bfq_pd_offline,

1370

++       .pd_free_fn             = bfq_pd_free,

1371

++       .pd_reset_stats_fn      = bfq_pd_reset_stats,

1372

++

1373

++       .cpd_alloc_fn           = bfq_cpd_alloc,

1374

++       .cpd_init_fn            = bfq_cpd_init,

1375

++       .cpd_bind_fn	       = bfq_cpd_init,

1376

++       .cpd_free_fn            = bfq_cpd_free,

1377

++

1378

++};

1379

++

1380

++#else

1381

++

1382

++static void bfq_init_entity(struct bfq_entity *entity,

1383

++			    struct bfq_group *bfqg)

1384

++{

1385

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

1386

++	entity->weight = entity->new_weight;

1387

++	entity->orig_weight = entity->new_weight;

1388

++	if (bfqq) {

1389

++		bfqq->ioprio = bfqq->new_ioprio;

1390

++		bfqq->ioprio_class = bfqq->new_ioprio_class;

1391

++	}

1392

++	entity->sched_data = &bfqg->sched_data;

1393

++}

1394

++

1395

++static struct bfq_group *

1396

++bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)

1397

++{

1398

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

1399

++	return bfqd->root_group;

1400

++}

1401

++

1402

++static void bfq_bfqq_move(struct bfq_data *bfqd,

1403

++			  struct bfq_queue *bfqq,

1404

++			  struct bfq_entity *entity,

1405

++			  struct bfq_group *bfqg)

1406

++{

1407

++}

1408

++

1409

++static void bfq_end_wr_async(struct bfq_data *bfqd)

1410

++{

1411

++	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

1412

++}

1413

++

1414

++static void bfq_disconnect_groups(struct bfq_data *bfqd)

1415

++{

1416

++	bfq_put_async_queues(bfqd, bfqd->root_group);

1417

++}

1418

++

1419

++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

1420

++                                              struct blkcg *blkcg)

1421

++{

1422

++	return bfqd->root_group;

1423

++}

1424

++

1425

++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)

1426

++{

1427

++	struct bfq_group *bfqg;

1428

++	int i;

1429

++

1430

++	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);

1431

++	if (!bfqg)

1432

++		return NULL;

1433

++

1434

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

1435

++		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

1436

++

1437

++	return bfqg;

1438

++}

1439

++#endif

1440

+diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c

1441

+new file mode 100644

1442

+index 0000000..fb7bb8f

1443

+--- /dev/null

1444

++++ b/block/bfq-ioc.c

1445

+@@ -0,0 +1,36 @@

1446

++/*

1447

++ * BFQ: I/O context handling.

1448

++ *

1449

++ * Based on ideas and code from CFQ:

1450

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1451

++ *

1452

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1453

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1454

++ *

1455

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1456

++ */

1457

++

1458

++/**

1459

++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.

1460

++ * @icq: the iocontext queue.

1461

++ */

1462

++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)

1463

++{

1464

++	/* bic->icq is the first member, %NULL will convert to %NULL */

1465

++	return container_of(icq, struct bfq_io_cq, icq);

1466

++}

1467

++

1468

++/**

1469

++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.

1470

++ * @bfqd: the lookup key.

1471

++ * @ioc: the io_context of the process doing I/O.

1472

++ *

1473

++ * Queue lock must be held.

1474

++ */

1475

++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,

1476

++					struct io_context *ioc)

1477

++{

1478

++	if (ioc)

1479

++		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));

1480

++	return NULL;

1481

++}

1482

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

1483

+new file mode 100644

1484

+index 0000000..f9787a6

1485

+--- /dev/null

1486

++++ b/block/bfq-iosched.c

1487

+@@ -0,0 +1,3754 @@

1488

++/*

1489

++ * Budget Fair Queueing (BFQ) disk scheduler.

1490

++ *

1491

++ * Based on ideas and code from CFQ:

1492

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1493

++ *

1494

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1495

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1496

++ *

1497

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1498

++ *

1499

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

1500

++ * file.

1501

++ *

1502

++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on

1503

++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,

1504

++ * measured in number of sectors, to processes instead of time slices. The

1505

++ * device is not granted to the in-service process for a given time slice,

1506

++ * but until it has exhausted its assigned budget. This change from the time

1507

++ * to the service domain allows BFQ to distribute the device throughput

1508

++ * among processes as desired, without any distortion due to ZBR, workload

1509

++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,

1510

++ * called B-WF2Q+, to schedule processes according to their budgets. More

1511

++ * precisely, BFQ schedules queues associated to processes. Thanks to the

1512

++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to

1513

++ * I/O-bound processes issuing sequential requests (to boost the

1514

++ * throughput), and yet guarantee a low latency to interactive and soft

1515

++ * real-time applications.

1516

++ *

1517

++ * BFQ is described in [1], where also a reference to the initial, more

1518

++ * theoretical paper on BFQ can be found. The interested reader can find

1519

++ * in the latter paper full details on the main algorithm, as well as

1520

++ * formulas of the guarantees and formal proofs of all the properties.

1521

++ * With respect to the version of BFQ presented in these papers, this

1522

++ * implementation adds a few more heuristics, such as the one that

1523

++ * guarantees a low latency to soft real-time applications, and a

1524

++ * hierarchical extension based on H-WF2Q+.

1525

++ *

1526

++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with

1527

++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)

1528

++ * complexity derives from the one introduced with EEVDF in [3].

1529

++ *

1530

++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness

1531

++ *     with the BFQ Disk I/O Scheduler'',

1532

++ *     Proceedings of the 5th Annual International Systems and Storage

1533

++ *     Conference (SYSTOR '12), June 2012.

1534

++ *

1535

++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf

1536

++ *

1537

++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing

1538

++ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,

1539

++ *     Oct 1997.

1540

++ *

1541

++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz

1542

++ *

1543

++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline

1544

++ *     First: A Flexible and Accurate Mechanism for Proportional Share

1545

++ *     Resource Allocation,'' technical report.

1546

++ *

1547

++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf

1548

++ */

1549

++#include <linux/module.h>

1550

++#include <linux/slab.h>

1551

++#include <linux/blkdev.h>

1552

++#include <linux/cgroup.h>

1553

++#include <linux/elevator.h>

1554

++#include <linux/jiffies.h>

1555

++#include <linux/rbtree.h>

1556

++#include <linux/ioprio.h>

1557

++#include "bfq.h"

1558

++#include "blk.h"

1559

++

1560

++/* Expiration time of sync (0) and async (1) requests, in jiffies. */

1561

++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };

1562

++

1563

++/* Maximum backwards seek, in KiB. */

1564

++static const int bfq_back_max = 16 * 1024;

1565

++

1566

++/* Penalty of a backwards seek, in number of sectors. */

1567

++static const int bfq_back_penalty = 2;

1568

++

1569

++/* Idling period duration, in jiffies. */

1570

++static int bfq_slice_idle = HZ / 125;

1571

++

1572

++/* Minimum number of assigned budgets for which stats are safe to compute. */

1573

++static const int bfq_stats_min_budgets = 194;

1574

++

1575

++/* Default maximum budget values, in sectors and number of requests. */

1576

++static const int bfq_default_max_budget = 16 * 1024;

1577

++static const int bfq_max_budget_async_rq = 4;

1578

++

1579

++/*

1580

++ * Async to sync throughput distribution is controlled as follows:

1581

++ * when an async request is served, the entity is charged the number

1582

++ * of sectors of the request, multiplied by the factor below

1583

++ */

1584

++static const int bfq_async_charge_factor = 10;

1585

++

1586

++/* Default timeout values, in jiffies, approximating CFQ defaults. */

1587

++static const int bfq_timeout_sync = HZ / 8;

1588

++static int bfq_timeout_async = HZ / 25;

1589

++

1590

++struct kmem_cache *bfq_pool;

1591

++

1592

++/* Below this threshold (in ms), we consider thinktime immediate. */

1593

++#define BFQ_MIN_TT		2

1594

++

1595

++/* hw_tag detection: parallel requests threshold and min samples needed. */

1596

++#define BFQ_HW_QUEUE_THRESHOLD	4

1597

++#define BFQ_HW_QUEUE_SAMPLES	32

1598

++

1599

++#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)

1600

++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)

1601

++

1602

++/* Min samples used for peak rate estimation (for autotuning). */

1603

++#define BFQ_PEAK_RATE_SAMPLES	32

1604

++

1605

++/* Shift used for peak rate fixed precision calculations. */

1606

++#define BFQ_RATE_SHIFT		16

1607

++

1608

++/*

1609

++ * By default, BFQ computes the duration of the weight raising for

1610

++ * interactive applications automatically, using the following formula:

1611

++ * duration = (R / r) * T, where r is the peak rate of the device, and

1612

++ * R and T are two reference parameters.

1613

++ * In particular, R is the peak rate of the reference device (see below),

1614

++ * and T is a reference time: given the systems that are likely to be

1615

++ * installed on the reference device according to its speed class, T is

1616

++ * about the maximum time needed, under BFQ and while reading two files in

1617

++ * parallel, to load typical large applications on these systems.

1618

++ * In practice, the slower/faster the device at hand is, the more/less it

1619

++ * takes to load applications with respect to the reference device.

1620

++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive

1621

++ * applications.

1622

++ *

1623

++ * BFQ uses four different reference pairs (R, T), depending on:

1624

++ * . whether the device is rotational or non-rotational;

1625

++ * . whether the device is slow, such as old or portable HDDs, as well as

1626

++ *   SD cards, or fast, such as newer HDDs and SSDs.

1627

++ *

1628

++ * The device's speed class is dynamically (re)detected in

1629

++ * bfq_update_peak_rate() every time the estimated peak rate is updated.

1630

++ *

1631

++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]

1632

++ * are the reference values for a slow/fast rotational device, whereas

1633

++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for

1634

++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the

1635

++ * thresholds used to switch between speed classes.

1636

++ * Both the reference peak rates and the thresholds are measured in

1637

++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.

1638

++ */

1639

++static int R_slow[2] = {1536, 10752};

1640

++static int R_fast[2] = {17415, 34791};

1641

++/*

1642

++ * To improve readability, a conversion function is used to initialize the

1643

++ * following arrays, which entails that they can be initialized only in a

1644

++ * function.

1645

++ */

1646

++static int T_slow[2];

1647

++static int T_fast[2];

1648

++static int device_speed_thresh[2];

1649

++

1650

++#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\

1651

++				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

1652

++

1653

++#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])

1654

++#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

1655

++

1656

++static void bfq_schedule_dispatch(struct bfq_data *bfqd);

1657

++

1658

++#include "bfq-ioc.c"

1659

++#include "bfq-sched.c"

1660

++#include "bfq-cgroup.c"

1661

++

1662

++#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)

1663

++#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

1664

++

1665

++#define bfq_sample_valid(samples)	((samples) > 80)

1666

++

1667

++/*

1668

++ * We regard a request as SYNC, if either it's a read or has the SYNC bit

1669

++ * set (in which case it could also be a direct WRITE).

1670

++ */

1671

++static int bfq_bio_sync(struct bio *bio)

1672

++{

1673

++	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))

1674

++		return 1;

1675

++

1676

++	return 0;

1677

++}

1678

++

1679

++/*

1680

++ * Schedule a run of the queue if there are requests pending and no one in the

1681

++ * driver that will restart queueing.

1682

++ */

1683

++static void bfq_schedule_dispatch(struct bfq_data *bfqd)

1684

++{

1685

++	if (bfqd->queued != 0) {

1686

++		bfq_log(bfqd, "schedule dispatch");

1687

++		kblockd_schedule_work(&bfqd->unplug_work);

1688

++	}

1689

++}

1690

++

1691

++/*

1692

++ * Lifted from AS - choose which of rq1 and rq2 is best served now.

1693

++ * We choose the request that is closer to the head right now.  Distance

1694

++ * behind the head is penalized and only allowed to a certain extent.

1695

++ */

1696

++static struct request *bfq_choose_req(struct bfq_data *bfqd,

1697

++				      struct request *rq1,

1698

++				      struct request *rq2,

1699

++				      sector_t last)

1700

++{

1701

++	sector_t s1, s2, d1 = 0, d2 = 0;

1702

++	unsigned long back_max;

1703

++#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */

1704

++#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */

1705

++	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

1706

++

1707

++	if (!rq1 || rq1 == rq2)

1708

++		return rq2;

1709

++	if (!rq2)

1710

++		return rq1;

1711

++

1712

++	if (rq_is_sync(rq1) && !rq_is_sync(rq2))

1713

++		return rq1;

1714

++	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))

1715

++		return rq2;

1716

++	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))

1717

++		return rq1;

1718

++	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))

1719

++		return rq2;

1720

++

1721

++	s1 = blk_rq_pos(rq1);

1722

++	s2 = blk_rq_pos(rq2);

1723

++

1724

++	/*

1725

++	 * By definition, 1KiB is 2 sectors.

1726

++	 */

1727

++	back_max = bfqd->bfq_back_max * 2;

1728

++

1729

++	/*

1730

++	 * Strict one way elevator _except_ in the case where we allow

1731

++	 * short backward seeks which are biased as twice the cost of a

1732

++	 * similar forward seek.

1733

++	 */

1734

++	if (s1 >= last)

1735

++		d1 = s1 - last;

1736

++	else if (s1 + back_max >= last)

1737

++		d1 = (last - s1) * bfqd->bfq_back_penalty;

1738

++	else

1739

++		wrap |= BFQ_RQ1_WRAP;

1740

++

1741

++	if (s2 >= last)

1742

++		d2 = s2 - last;

1743

++	else if (s2 + back_max >= last)

1744

++		d2 = (last - s2) * bfqd->bfq_back_penalty;

1745

++	else

1746

++		wrap |= BFQ_RQ2_WRAP;

1747

++

1748

++	/* Found required data */

1749

++

1750

++	/*

1751

++	 * By doing switch() on the bit mask "wrap" we avoid having to

1752

++	 * check two variables for all permutations: --> faster!

1753

++	 */

1754

++	switch (wrap) {

1755

++	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */

1756

++		if (d1 < d2)

1757

++			return rq1;

1758

++		else if (d2 < d1)

1759

++			return rq2;

1760

++		else {

1761

++			if (s1 >= s2)

1762

++				return rq1;

1763

++			else

1764

++				return rq2;

1765

++		}

1766

++

1767

++	case BFQ_RQ2_WRAP:

1768

++		return rq1;

1769

++	case BFQ_RQ1_WRAP:

1770

++		return rq2;

1771

++	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */

1772

++	default:

1773

++		/*

1774

++		 * Since both rqs are wrapped,

1775

++		 * start with the one that's further behind head

1776

++		 * (--> only *one* back seek required),

1777

++		 * since back seek takes more time than forward.

1778

++		 */

1779

++		if (s1 <= s2)

1780

++			return rq1;

1781

++		else

1782

++			return rq2;

1783

++	}

1784

++}

1785

++

1786

++/*

1787

++ * Tell whether there are active queues or groups with differentiated weights.

1788

++ */

1789

++static bool bfq_differentiated_weights(struct bfq_data *bfqd)

1790

++{

1791

++	/*

1792

++	 * For weights to differ, at least one of the trees must contain

1793

++	 * at least two nodes.

1794

++	 */

1795

++	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&

1796

++		(bfqd->queue_weights_tree.rb_node->rb_left ||

1797

++		 bfqd->queue_weights_tree.rb_node->rb_right)

1798

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

1799

++	       ) ||

1800

++	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&

1801

++		(bfqd->group_weights_tree.rb_node->rb_left ||

1802

++		 bfqd->group_weights_tree.rb_node->rb_right)

1803

++#endif

1804

++	       );

1805

++}

1806

++

1807

++/*

1808

++ * The following function returns true if every queue must receive the

1809

++ * same share of the throughput (this condition is used when deciding

1810

++ * whether idling may be disabled, see the comments in the function

1811

++ * bfq_bfqq_may_idle()).

1812

++ *

1813

++ * Such a scenario occurs when:

1814

++ * 1) all active queues have the same weight,

1815

++ * 2) all active groups at the same level in the groups tree have the same

1816

++ *    weight,

1817

++ * 3) all active groups at the same level in the groups tree have the same

1818

++ *    number of children.

1819

++ *

1820

++ * Unfortunately, keeping the necessary state for evaluating exactly the

1821

++ * above symmetry conditions would be quite complex and time-consuming.

1822

++ * Therefore this function evaluates, instead, the following stronger

1823

++ * sub-conditions, for which it is much easier to maintain the needed

1824

++ * state:

1825

++ * 1) all active queues have the same weight,

1826

++ * 2) all active groups have the same weight,

1827

++ * 3) all active groups have at most one active child each.

1828

++ * In particular, the last two conditions are always true if hierarchical

1829

++ * support and the cgroups interface are not enabled, thus no state needs

1830

++ * to be maintained in this case.

1831

++ */

1832

++static bool bfq_symmetric_scenario(struct bfq_data *bfqd)

1833

++{

1834

++	return

1835

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

1836

++		!bfqd->active_numerous_groups &&

1837

++#endif

1838

++		!bfq_differentiated_weights(bfqd);

1839

++}

1840

++

1841

++/*

1842

++ * If the weight-counter tree passed as input contains no counter for

1843

++ * the weight of the input entity, then add that counter; otherwise just

1844

++ * increment the existing counter.

1845

++ *

1846

++ * Note that weight-counter trees contain few nodes in mostly symmetric

1847

++ * scenarios. For example, if all queues have the same weight, then the

1848

++ * weight-counter tree for the queues may contain at most one node.

1849

++ * This holds even if low_latency is on, because weight-raised queues

1850

++ * are not inserted in the tree.

1851

++ * In most scenarios, the rate at which nodes are created/destroyed

1852

++ * should be low too.

1853

++ */

1854

++static void bfq_weights_tree_add(struct bfq_data *bfqd,

1855

++				 struct bfq_entity *entity,

1856

++				 struct rb_root *root)

1857

++{

1858

++	struct rb_node **new = &(root->rb_node), *parent = NULL;

1859

++

1860

++	/*

1861

++	 * Do not insert if the entity is already associated with a

1862

++	 * counter, which happens if:

1863

++	 *   1) the entity is associated with a queue,

1864

++	 *   2) a request arrival has caused the queue to become both

1865

++	 *      non-weight-raised, and hence change its weight, and

1866

++	 *      backlogged; in this respect, each of the two events

1867

++	 *      causes an invocation of this function,

1868

++	 *   3) this is the invocation of this function caused by the

1869

++	 *      second event. This second invocation is actually useless,

1870

++	 *      and we handle this fact by exiting immediately. More

1871

++	 *      efficient or clearer solutions might possibly be adopted.

1872

++	 */

1873

++	if (entity->weight_counter)

1874

++		return;

1875

++

1876

++	while (*new) {

1877

++		struct bfq_weight_counter *__counter = container_of(*new,

1878

++						struct bfq_weight_counter,

1879

++						weights_node);

1880

++		parent = *new;

1881

++

1882

++		if (entity->weight == __counter->weight) {

1883

++			entity->weight_counter = __counter;

1884

++			goto inc_counter;

1885

++		}

1886

++		if (entity->weight < __counter->weight)

1887

++			new = &((*new)->rb_left);

1888

++		else

1889

++			new = &((*new)->rb_right);

1890

++	}

1891

++

1892

++	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),

1893

++					 GFP_ATOMIC);

1894

++	entity->weight_counter->weight = entity->weight;

1895

++	rb_link_node(&entity->weight_counter->weights_node, parent, new);

1896

++	rb_insert_color(&entity->weight_counter->weights_node, root);

1897

++

1898

++inc_counter:

1899

++	entity->weight_counter->num_active++;

1900

++}

1901

++

1902

++/*

1903

++ * Decrement the weight counter associated with the entity, and, if the

1904

++ * counter reaches 0, remove the counter from the tree.

1905

++ * See the comments to the function bfq_weights_tree_add() for considerations

1906

++ * about overhead.

1907

++ */

1908

++static void bfq_weights_tree_remove(struct bfq_data *bfqd,

1909

++				    struct bfq_entity *entity,

1910

++				    struct rb_root *root)

1911

++{

1912

++	if (!entity->weight_counter)

1913

++		return;

1914

++

1915

++	BUG_ON(RB_EMPTY_ROOT(root));

1916

++	BUG_ON(entity->weight_counter->weight != entity->weight);

1917

++

1918

++	BUG_ON(!entity->weight_counter->num_active);

1919

++	entity->weight_counter->num_active--;

1920

++	if (entity->weight_counter->num_active > 0)

1921

++		goto reset_entity_pointer;

1922

++

1923

++	rb_erase(&entity->weight_counter->weights_node, root);

1924

++	kfree(entity->weight_counter);

1925

++

1926

++reset_entity_pointer:

1927

++	entity->weight_counter = NULL;

1928

++}

1929

++

1930

++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,

1931

++					struct bfq_queue *bfqq,

1932

++					struct request *last)

1933

++{

1934

++	struct rb_node *rbnext = rb_next(&last->rb_node);

1935

++	struct rb_node *rbprev = rb_prev(&last->rb_node);

1936

++	struct request *next = NULL, *prev = NULL;

1937

++

1938

++	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

1939

++

1940

++	if (rbprev)

1941

++		prev = rb_entry_rq(rbprev);

1942

++

1943

++	if (rbnext)

1944

++		next = rb_entry_rq(rbnext);

1945

++	else {

1946

++		rbnext = rb_first(&bfqq->sort_list);

1947

++		if (rbnext && rbnext != &last->rb_node)

1948

++			next = rb_entry_rq(rbnext);

1949

++	}

1950

++

1951

++	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));

1952

++}

1953

++

1954

++/* see the definition of bfq_async_charge_factor for details */

1955

++static unsigned long bfq_serv_to_charge(struct request *rq,

1956

++					struct bfq_queue *bfqq)

1957

++{

1958

++	return blk_rq_sectors(rq) *

1959

++		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *

1960

++		bfq_async_charge_factor));

1961

++}

1962

++

1963

++/**

1964

++ * bfq_updated_next_req - update the queue after a new next_rq selection.

1965

++ * @bfqd: the device data the queue belongs to.

1966

++ * @bfqq: the queue to update.

1967

++ *

1968

++ * If the first request of a queue changes we make sure that the queue

1969

++ * has enough budget to serve at least its first request (if the

1970

++ * request has grown).  We do this because if the queue has not enough

1971

++ * budget for its first request, it has to go through two dispatch

1972

++ * rounds to actually get it dispatched.

1973

++ */

1974

++static void bfq_updated_next_req(struct bfq_data *bfqd,

1975

++				 struct bfq_queue *bfqq)

1976

++{

1977

++	struct bfq_entity *entity = &bfqq->entity;

1978

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

1979

++	struct request *next_rq = bfqq->next_rq;

1980

++	unsigned long new_budget;

1981

++

1982

++	if (!next_rq)

1983

++		return;

1984

++

1985

++	if (bfqq == bfqd->in_service_queue)

1986

++		/*

1987

++		 * In order not to break guarantees, budgets cannot be

1988

++		 * changed after an entity has been selected.

1989

++		 */

1990

++		return;

1991

++

1992

++	BUG_ON(entity->tree != &st->active);

1993

++	BUG_ON(entity == entity->sched_data->in_service_entity);

1994

++

1995

++	new_budget = max_t(unsigned long, bfqq->max_budget,

1996

++			   bfq_serv_to_charge(next_rq, bfqq));

1997

++	if (entity->budget != new_budget) {

1998

++		entity->budget = new_budget;

1999

++		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",

2000

++					 new_budget);

2001

++		bfq_activate_bfqq(bfqd, bfqq);

2002

++	}

2003

++}

2004

++

2005

++static unsigned int bfq_wr_duration(struct bfq_data *bfqd)

2006

++{

2007

++	u64 dur;

2008

++

2009

++	if (bfqd->bfq_wr_max_time > 0)

2010

++		return bfqd->bfq_wr_max_time;

2011

++

2012

++	dur = bfqd->RT_prod;

2013

++	do_div(dur, bfqd->peak_rate);

2014

++

2015

++	return dur;

2016

++}

2017

++

2018

++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */

2019

++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2020

++{

2021

++	struct bfq_queue *item;

2022

++	struct hlist_node *n;

2023

++

2024

++	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)

2025

++		hlist_del_init(&item->burst_list_node);

2026

++	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

2027

++	bfqd->burst_size = 1;

2028

++}

2029

++

2030

++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */

2031

++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2032

++{

2033

++	/* Increment burst size to take into account also bfqq */

2034

++	bfqd->burst_size++;

2035

++

2036

++	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {

2037

++		struct bfq_queue *pos, *bfqq_item;

2038

++		struct hlist_node *n;

2039

++

2040

++		/*

2041

++		 * Enough queues have been activated shortly after each

2042

++		 * other to consider this burst as large.

2043

++		 */

2044

++		bfqd->large_burst = true;

2045

++

2046

++		/*

2047

++		 * We can now mark all queues in the burst list as

2048

++		 * belonging to a large burst.

2049

++		 */

2050

++		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,

2051

++				     burst_list_node)

2052

++		        bfq_mark_bfqq_in_large_burst(bfqq_item);

2053

++		bfq_mark_bfqq_in_large_burst(bfqq);

2054

++

2055

++		/*

2056

++		 * From now on, and until the current burst finishes, any

2057

++		 * new queue being activated shortly after the last queue

2058

++		 * was inserted in the burst can be immediately marked as

2059

++		 * belonging to a large burst. So the burst list is not

2060

++		 * needed any more. Remove it.

2061

++		 */

2062

++		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,

2063

++					  burst_list_node)

2064

++			hlist_del_init(&pos->burst_list_node);

2065

++	} else /* burst not yet large: add bfqq to the burst list */

2066

++		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

2067

++}

2068

++

2069

++/*

2070

++ * If many queues happen to become active shortly after each other, then,

2071

++ * to help the processes associated to these queues get their job done as

2072

++ * soon as possible, it is usually better to not grant either weight-raising

2073

++ * or device idling to these queues. In this comment we describe, firstly,

2074

++ * the reasons why this fact holds, and, secondly, the next function, which

2075

++ * implements the main steps needed to properly mark these queues so that

2076

++ * they can then be treated in a different way.

2077

++ *

2078

++ * As for the terminology, we say that a queue becomes active, i.e.,

2079

++ * switches from idle to backlogged, either when it is created (as a

2080

++ * consequence of the arrival of an I/O request), or, if already existing,

2081

++ * when a new request for the queue arrives while the queue is idle.

2082

++ * Bursts of activations, i.e., activations of different queues occurring

2083

++ * shortly after each other, are typically caused by services or applications

2084

++ * that spawn or reactivate many parallel threads/processes. Examples are

2085

++ * systemd during boot or git grep.

2086

++ *

2087

++ * These services or applications benefit mostly from a high throughput:

2088

++ * the quicker the requests of the activated queues are cumulatively served,

2089

++ * the sooner the target job of these queues gets completed. As a consequence,

2090

++ * weight-raising any of these queues, which also implies idling the device

2091

++ * for it, is almost always counterproductive: in most cases it just lowers

2092

++ * throughput.

2093

++ *

2094

++ * On the other hand, a burst of activations may be also caused by the start

2095

++ * of an application that does not consist of a lot of parallel I/O-bound

2096

++ * threads. In fact, with a complex application, the burst may be just a

2097

++ * consequence of the fact that several processes need to be executed to

2098

++ * start up the application. To start an application as quickly as possible,

2099

++ * the best thing to do is to privilege the I/O related to the application

2100

++ * with respect to all other I/O. Therefore, the best strategy to start as

2101

++ * quickly as possible an application that causes a burst of activations is

2102

++ * to weight-raise all the queues activated during the burst. This is the

2103

++ * exact opposite of the best strategy for the other type of bursts.

2104

++ *

2105

++ * In the end, to take the best action for each of the two cases, the two

2106

++ * types of bursts need to be distinguished. Fortunately, this seems

2107

++ * relatively easy to do, by looking at the sizes of the bursts. In

2108

++ * particular, we found a threshold such that bursts with a larger size

2109

++ * than that threshold are apparently caused only by services or commands

2110

++ * such as systemd or git grep. For brevity, hereafter we call these bursts

2111

++ * just 'large'. BFQ *does not* weight-raise queues whose activations occur

2112

++ * in a large burst. In addition, for each of these queues BFQ performs or

2113

++ * does not perform idling depending on which choice boosts the throughput

2114

++ * most. The exact choice depends on the device and request pattern at

2115

++ * hand.

2116

++ *

2117

++ * Turning back to the next function, it implements all the steps needed

2118

++ * to detect the occurrence of a large burst and to properly mark all the

2119

++ * queues belonging to it (so that they can then be treated in a different

2120

++ * way). This goal is achieved by maintaining a special "burst list" that

2121

++ * holds, temporarily, the queues that belong to the burst in progress. The

2122

++ * list is then used to mark these queues as belonging to a large burst if

2123

++ * the burst does become large. The main steps are the following.

2124

++ *

2125

++ * . when the very first queue is activated, the queue is inserted into the

2126

++ *   list (as it could be the first queue in a possible burst)

2127

++ *

2128

++ * . if the current burst has not yet become large, and a queue Q that does

2129

++ *   not yet belong to the burst is activated shortly after the last time

2130

++ *   at which a new queue entered the burst list, then the function appends

2131

++ *   Q to the burst list

2132

++ *

2133

++ * . if, as a consequence of the previous step, the burst size reaches

2134

++ *   the large-burst threshold, then

2135

++ *

2136

++ *     . all the queues in the burst list are marked as belonging to a

2137

++ *       large burst

2138

++ *

2139

++ *     . the burst list is deleted; in fact, the burst list already served

2140

++ *       its purpose (keeping temporarily track of the queues in a burst,

2141

++ *       so as to be able to mark them as belonging to a large burst in the

2142

++ *       previous sub-step), and now is not needed any more

2143

++ *

2144

++ *     . the device enters a large-burst mode

2145

++ *

2146

++ * . if a queue Q that does not belong to the burst is activated while

2147

++ *   the device is in large-burst mode and shortly after the last time

2148

++ *   at which a queue either entered the burst list or was marked as

2149

++ *   belonging to the current large burst, then Q is immediately marked

2150

++ *   as belonging to a large burst.

2151

++ *

2152

++ * . if a queue Q that does not belong to the burst is activated a while

2153

++ *   later, i.e., not shortly after, than the last time at which a queue

2154

++ *   either entered the burst list or was marked as belonging to the

2155

++ *   current large burst, then the current burst is deemed as finished and:

2156

++ *

2157

++ *        . the large-burst mode is reset if set

2158

++ *

2159

++ *        . the burst list is emptied

2160

++ *

2161

++ *        . Q is inserted in the burst list, as Q may be the first queue

2162

++ *          in a possible new burst (then the burst list contains just Q

2163

++ *          after this step).

2164

++ */

2165

++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,

2166

++			     bool idle_for_long_time)

2167

++{

2168

++	/*

2169

++	 * If bfqq happened to be activated in a burst, but has been idle

2170

++	 * for at least as long as an interactive queue, then we assume

2171

++	 * that, in the overall I/O initiated in the burst, the I/O

2172

++	 * associated to bfqq is finished. So bfqq does not need to be

2173

++	 * treated as a queue belonging to a burst anymore. Accordingly,

2174

++	 * we reset bfqq's in_large_burst flag if set, and remove bfqq

2175

++	 * from the burst list if it's there. We do not, however, decrement

2176

++	 * burst_size, because the fact that bfqq does not need to belong

2177

++	 * to the burst list any more does not invalidate the fact that

2178

++	 * bfqq may have been activated during the current burst.

2179

++	 */

2180

++	if (idle_for_long_time) {

2181

++		hlist_del_init(&bfqq->burst_list_node);

2182

++		bfq_clear_bfqq_in_large_burst(bfqq);

2183

++	}

2184

++

2185

++	/*

2186

++	 * If bfqq is already in the burst list or is part of a large

2187

++	 * burst, then there is nothing else to do.

2188

++	 */

2189

++	if (!hlist_unhashed(&bfqq->burst_list_node) ||

2190

++	    bfq_bfqq_in_large_burst(bfqq))

2191

++		return;

2192

++

2193

++	/*

2194

++	 * If bfqq's activation happens late enough, then the current

2195

++	 * burst is finished, and related data structures must be reset.

2196

++	 *

2197

++	 * In this respect, consider the special case where bfqq is the very

2198

++	 * first queue being activated. In this case, last_ins_in_burst is

2199

++	 * not yet significant when we get here. But it is easy to verify

2200

++	 * that, whether or not the following condition is true, bfqq will

2201

++	 * end up being inserted into the burst list. In particular the

2202

++	 * list will happen to contain only bfqq. And this is exactly what

2203

++	 * has to happen, as bfqq may be the first queue in a possible

2204

++	 * burst.

2205

++	 */

2206

++	if (time_is_before_jiffies(bfqd->last_ins_in_burst +

2207

++	    bfqd->bfq_burst_interval)) {

2208

++		bfqd->large_burst = false;

2209

++		bfq_reset_burst_list(bfqd, bfqq);

2210

++		return;

2211

++	}

2212

++

2213

++	/*

2214

++	 * If we get here, then bfqq is being activated shortly after the

2215

++	 * last queue. So, if the current burst is also large, we can mark

2216

++	 * bfqq as belonging to this large burst immediately.

2217

++	 */

2218

++	if (bfqd->large_burst) {

2219

++		bfq_mark_bfqq_in_large_burst(bfqq);

2220

++		return;

2221

++	}

2222

++

2223

++	/*

2224

++	 * If we get here, then a large-burst state has not yet been

2225

++	 * reached, but bfqq is being activated shortly after the last

2226

++	 * queue. Then we add bfqq to the burst.

2227

++	 */

2228

++	bfq_add_to_burst(bfqd, bfqq);

2229

++}

2230

++

2231

++static void bfq_add_request(struct request *rq)

2232

++{

2233

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2234

++	struct bfq_entity *entity = &bfqq->entity;

2235

++	struct bfq_data *bfqd = bfqq->bfqd;

2236

++	struct request *next_rq, *prev;

2237

++	unsigned long old_wr_coeff = bfqq->wr_coeff;

2238

++	bool interactive = false;

2239

++

2240

++	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));

2241

++	bfqq->queued[rq_is_sync(rq)]++;

2242

++	bfqd->queued++;

2243

++

2244

++	elv_rb_add(&bfqq->sort_list, rq);

2245

++

2246

++	/*

2247

++	 * Check if this request is a better next-serve candidate.

2248

++	 */

2249

++	prev = bfqq->next_rq;

2250

++	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);

2251

++	BUG_ON(!next_rq);

2252

++	bfqq->next_rq = next_rq;

2253

++

2254

++	if (!bfq_bfqq_busy(bfqq)) {

2255

++		bool soft_rt, in_burst,

2256

++		     idle_for_long_time = time_is_before_jiffies(

2257

++						bfqq->budget_timeout +

2258

++						bfqd->bfq_wr_min_idle_time);

2259

++

2260

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2261

++		bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,

2262

++					 rq->cmd_flags);

2263

++#endif

2264

++		if (bfq_bfqq_sync(bfqq)) {

2265

++			bool already_in_burst =

2266

++			   !hlist_unhashed(&bfqq->burst_list_node) ||

2267

++			   bfq_bfqq_in_large_burst(bfqq);

2268

++			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);

2269

++			/*

2270

++			 * If bfqq was not already in the current burst,

2271

++			 * then, at this point, bfqq either has been

2272

++			 * added to the current burst or has caused the

2273

++			 * current burst to terminate. In particular, in

2274

++			 * the second case, bfqq has become the first

2275

++			 * queue in a possible new burst.

2276

++			 * In both cases last_ins_in_burst needs to be

2277

++			 * moved forward.

2278

++			 */

2279

++			if (!already_in_burst)

2280

++				bfqd->last_ins_in_burst = jiffies;

2281

++		}

2282

++

2283

++		in_burst = bfq_bfqq_in_large_burst(bfqq);

2284

++		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

2285

++			!in_burst &&

2286

++			time_is_before_jiffies(bfqq->soft_rt_next_start);

2287

++		interactive = !in_burst && idle_for_long_time;

2288

++		entity->budget = max_t(unsigned long, bfqq->max_budget,

2289

++				       bfq_serv_to_charge(next_rq, bfqq));

2290

++

2291

++		if (!bfq_bfqq_IO_bound(bfqq)) {

2292

++			if (time_before(jiffies,

2293

++					RQ_BIC(rq)->ttime.last_end_request +

2294

++					bfqd->bfq_slice_idle)) {

2295

++				bfqq->requests_within_timer++;

2296

++				if (bfqq->requests_within_timer >=

2297

++				    bfqd->bfq_requests_within_timer)

2298

++					bfq_mark_bfqq_IO_bound(bfqq);

2299

++			} else

2300

++				bfqq->requests_within_timer = 0;

2301

++		}

2302

++

2303

++		if (!bfqd->low_latency)

2304

++			goto add_bfqq_busy;

2305

++

2306

++		/*

2307

++		 * If the queue:

2308

++		 * - is not being boosted,

2309

++		 * - has been idle for enough time,

2310

++		 * - is not a sync queue or is linked to a bfq_io_cq (it is

2311

++		 *   shared "for its nature" or it is not shared and its

2312

++		 *   requests have not been redirected to a shared queue)

2313

++		 * start a weight-raising period.

2314

++		 */

2315

++		if (old_wr_coeff == 1 && (interactive || soft_rt) &&

2316

++		    (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {

2317

++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

2318

++			if (interactive)

2319

++				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2320

++			else

2321

++				bfqq->wr_cur_max_time =

2322

++					bfqd->bfq_wr_rt_max_time;

2323

++			bfq_log_bfqq(bfqd, bfqq,

2324

++				     "wrais starting at %lu, rais_max_time %u",

2325

++				     jiffies,

2326

++				     jiffies_to_msecs(bfqq->wr_cur_max_time));

2327

++		} else if (old_wr_coeff > 1) {

2328

++			if (interactive)

2329

++				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2330

++			else if (in_burst ||

2331

++				 (bfqq->wr_cur_max_time ==

2332

++				  bfqd->bfq_wr_rt_max_time &&

2333

++				  !soft_rt)) {

2334

++				bfqq->wr_coeff = 1;

2335

++				bfq_log_bfqq(bfqd, bfqq,

2336

++					"wrais ending at %lu, rais_max_time %u",

2337

++					jiffies,

2338

++					jiffies_to_msecs(bfqq->

2339

++						wr_cur_max_time));

2340

++			} else if (time_before(

2341

++					bfqq->last_wr_start_finish +

2342

++					bfqq->wr_cur_max_time,

2343

++					jiffies +

2344

++					bfqd->bfq_wr_rt_max_time) &&

2345

++				   soft_rt) {

2346

++				/*

2347

++				 *

2348

++				 * The remaining weight-raising time is lower

2349

++				 * than bfqd->bfq_wr_rt_max_time, which means

2350

++				 * that the application is enjoying weight

2351

++				 * raising either because deemed soft-rt in

2352

++				 * the near past, or because deemed interactive

2353

++				 * long ago.

2354

++				 * In both cases, resetting now the current

2355

++				 * remaining weight-raising time for the

2356

++				 * application to the weight-raising duration

2357

++				 * for soft rt applications would not cause any

2358

++				 * latency increase for the application (as the

2359

++				 * new duration would be higher than the

2360

++				 * remaining time).

2361

++				 *

2362

++				 * In addition, the application is now meeting

2363

++				 * the requirements for being deemed soft rt.

2364

++				 * In the end we can correctly and safely

2365

++				 * (re)charge the weight-raising duration for

2366

++				 * the application with the weight-raising

2367

++				 * duration for soft rt applications.

2368

++				 *

2369

++				 * In particular, doing this recharge now, i.e.,

2370

++				 * before the weight-raising period for the

2371

++				 * application finishes, reduces the probability

2372

++				 * of the following negative scenario:

2373

++				 * 1) the weight of a soft rt application is

2374

++				 *    raised at startup (as for any newly

2375

++				 *    created application),

2376

++				 * 2) since the application is not interactive,

2377

++				 *    at a certain time weight-raising is

2378

++				 *    stopped for the application,

2379

++				 * 3) at that time the application happens to

2380

++				 *    still have pending requests, and hence

2381

++				 *    is destined to not have a chance to be

2382

++				 *    deemed soft rt before these requests are

2383

++				 *    completed (see the comments to the

2384

++				 *    function bfq_bfqq_softrt_next_start()

2385

++				 *    for details on soft rt detection),

2386

++				 * 4) these pending requests experience a high

2387

++				 *    latency because the application is not

2388

++				 *    weight-raised while they are pending.

2389

++				 */

2390

++				bfqq->last_wr_start_finish = jiffies;

2391

++				bfqq->wr_cur_max_time =

2392

++					bfqd->bfq_wr_rt_max_time;

2393

++			}

2394

++		}

2395

++		if (old_wr_coeff != bfqq->wr_coeff)

2396

++			entity->prio_changed = 1;

2397

++add_bfqq_busy:

2398

++		bfqq->last_idle_bklogged = jiffies;

2399

++		bfqq->service_from_backlogged = 0;

2400

++		bfq_clear_bfqq_softrt_update(bfqq);

2401

++		bfq_add_bfqq_busy(bfqd, bfqq);

2402

++	} else {

2403

++		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&

2404

++		    time_is_before_jiffies(

2405

++				bfqq->last_wr_start_finish +

2406

++				bfqd->bfq_wr_min_inter_arr_async)) {

2407

++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

2408

++			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2409

++

2410

++			bfqd->wr_busy_queues++;

2411

++			entity->prio_changed = 1;

2412

++			bfq_log_bfqq(bfqd, bfqq,

2413

++			    "non-idle wrais starting at %lu, rais_max_time %u",

2414

++			    jiffies,

2415

++			    jiffies_to_msecs(bfqq->wr_cur_max_time));

2416

++		}

2417

++		if (prev != bfqq->next_rq)

2418

++			bfq_updated_next_req(bfqd, bfqq);

2419

++	}

2420

++

2421

++	if (bfqd->low_latency &&

2422

++		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))

2423

++		bfqq->last_wr_start_finish = jiffies;

2424

++}

2425

++

2426

++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,

2427

++					  struct bio *bio)

2428

++{

2429

++	struct task_struct *tsk = current;

2430

++	struct bfq_io_cq *bic;

2431

++	struct bfq_queue *bfqq;

2432

++

2433

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

2434

++	if (!bic)

2435

++		return NULL;

2436

++

2437

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

2438

++	if (bfqq)

2439

++		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

2440

++

2441

++	return NULL;

2442

++}

2443

++

2444

++static void bfq_activate_request(struct request_queue *q, struct request *rq)

2445

++{

2446

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2447

++

2448

++	bfqd->rq_in_driver++;

2449

++	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);

2450

++	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",

2451

++		(long long unsigned)bfqd->last_position);

2452

++}

2453

++

2454

++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)

2455

++{

2456

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2457

++

2458

++	BUG_ON(bfqd->rq_in_driver == 0);

2459

++	bfqd->rq_in_driver--;

2460

++}

2461

++

2462

++static void bfq_remove_request(struct request *rq)

2463

++{

2464

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2465

++	struct bfq_data *bfqd = bfqq->bfqd;

2466

++	const int sync = rq_is_sync(rq);

2467

++

2468

++	if (bfqq->next_rq == rq) {

2469

++		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);

2470

++		bfq_updated_next_req(bfqd, bfqq);

2471

++	}

2472

++

2473

++	if (rq->queuelist.prev != &rq->queuelist)

2474

++		list_del_init(&rq->queuelist);

2475

++	BUG_ON(bfqq->queued[sync] == 0);

2476

++	bfqq->queued[sync]--;

2477

++	bfqd->queued--;

2478

++	elv_rb_del(&bfqq->sort_list, rq);

2479

++

2480

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2481

++		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)

2482

++			bfq_del_bfqq_busy(bfqd, bfqq, 1);

2483

++		/*

2484

++		 * Remove queue from request-position tree as it is empty.

2485

++		 */

2486

++		if (bfqq->pos_root) {

2487

++			rb_erase(&bfqq->pos_node, bfqq->pos_root);

2488

++			bfqq->pos_root = NULL;

2489

++		}

2490

++	}

2491

++

2492

++	if (rq->cmd_flags & REQ_META) {

2493

++		BUG_ON(bfqq->meta_pending == 0);

2494

++		bfqq->meta_pending--;

2495

++	}

2496

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2497

++	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);

2498

++#endif

2499

++}

2500

++

2501

++static int bfq_merge(struct request_queue *q, struct request **req,

2502

++		     struct bio *bio)

2503

++{

2504

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2505

++	struct request *__rq;

2506

++

2507

++	__rq = bfq_find_rq_fmerge(bfqd, bio);

2508

++	if (__rq && elv_rq_merge_ok(__rq, bio)) {

2509

++		*req = __rq;

2510

++		return ELEVATOR_FRONT_MERGE;

2511

++	}

2512

++

2513

++	return ELEVATOR_NO_MERGE;

2514

++}

2515

++

2516

++static void bfq_merged_request(struct request_queue *q, struct request *req,

2517

++			       int type)

2518

++{

2519

++	if (type == ELEVATOR_FRONT_MERGE &&

2520

++	    rb_prev(&req->rb_node) &&

2521

++	    blk_rq_pos(req) <

2522

++	    blk_rq_pos(container_of(rb_prev(&req->rb_node),

2523

++				    struct request, rb_node))) {

2524

++		struct bfq_queue *bfqq = RQ_BFQQ(req);

2525

++		struct bfq_data *bfqd = bfqq->bfqd;

2526

++		struct request *prev, *next_rq;

2527

++

2528

++		/* Reposition request in its sort_list */

2529

++		elv_rb_del(&bfqq->sort_list, req);

2530

++		elv_rb_add(&bfqq->sort_list, req);

2531

++		/* Choose next request to be served for bfqq */

2532

++		prev = bfqq->next_rq;

2533

++		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,

2534

++					 bfqd->last_position);

2535

++		BUG_ON(!next_rq);

2536

++		bfqq->next_rq = next_rq;

2537

++	}

2538

++}

2539

++

2540

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2541

++static void bfq_bio_merged(struct request_queue *q, struct request *req,

2542

++			   struct bio *bio)

2543

++{

2544

++	bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw);

2545

++}

2546

++#endif

2547

++

2548

++static void bfq_merged_requests(struct request_queue *q, struct request *rq,

2549

++				struct request *next)

2550

++{

2551

++	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

2552

++

2553

++	/*

2554

++	 * If next and rq belong to the same bfq_queue and next is older

2555

++	 * than rq, then reposition rq in the fifo (by substituting next

2556

++	 * with rq). Otherwise, if next and rq belong to different

2557

++	 * bfq_queues, never reposition rq: in fact, we would have to

2558

++	 * reposition it with respect to next's position in its own fifo,

2559

++	 * which would most certainly be too expensive with respect to

2560

++	 * the benefits.

2561

++	 */

2562

++	if (bfqq == next_bfqq &&

2563

++	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&

2564

++	    time_before(next->fifo_time, rq->fifo_time)) {

2565

++		list_del_init(&rq->queuelist);

2566

++		list_replace_init(&next->queuelist, &rq->queuelist);

2567

++		rq->fifo_time = next->fifo_time;

2568

++	}

2569

++

2570

++	if (bfqq->next_rq == next)

2571

++		bfqq->next_rq = rq;

2572

++

2573

++	bfq_remove_request(next);

2574

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2575

++	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);

2576

++#endif

2577

++}

2578

++

2579

++/* Must be called with bfqq != NULL */

2580

++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)

2581

++{

2582

++	BUG_ON(!bfqq);

2583

++	if (bfq_bfqq_busy(bfqq))

2584

++		bfqq->bfqd->wr_busy_queues--;

2585

++	bfqq->wr_coeff = 1;

2586

++	bfqq->wr_cur_max_time = 0;

2587

++	/* Trigger a weight change on the next activation of the queue */

2588

++	bfqq->entity.prio_changed = 1;

2589

++}

2590

++

2591

++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

2592

++				    struct bfq_group *bfqg)

2593

++{

2594

++	int i, j;

2595

++

2596

++	for (i = 0; i < 2; i++)

2597

++		for (j = 0; j < IOPRIO_BE_NR; j++)

2598

++			if (bfqg->async_bfqq[i][j])

2599

++				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);

2600

++	if (bfqg->async_idle_bfqq)

2601

++		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);

2602

++}

2603

++

2604

++static void bfq_end_wr(struct bfq_data *bfqd)

2605

++{

2606

++	struct bfq_queue *bfqq;

2607

++

2608

++	spin_lock_irq(bfqd->queue->queue_lock);

2609

++

2610

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)

2611

++		bfq_bfqq_end_wr(bfqq);

2612

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)

2613

++		bfq_bfqq_end_wr(bfqq);

2614

++	bfq_end_wr_async(bfqd);

2615

++

2616

++	spin_unlock_irq(bfqd->queue->queue_lock);

2617

++}

2618

++

2619

++static int bfq_allow_merge(struct request_queue *q, struct request *rq,

2620

++			   struct bio *bio)

2621

++{

2622

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2623

++	struct bfq_io_cq *bic;

2624

++

2625

++	/*

2626

++	 * Disallow merge of a sync bio into an async request.

2627

++	 */

2628

++	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

2629

++		return 0;

2630

++

2631

++	/*

2632

++	 * Lookup the bfqq that this bio will be queued with. Allow

2633

++	 * merge only if rq is queued there.

2634

++	 * Queue lock is held here.

2635

++	 */

2636

++	bic = bfq_bic_lookup(bfqd, current->io_context);

2637

++	if (!bic)

2638

++		return 0;

2639

++

2640

++	return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq);

2641

++}

2642

++

2643

++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

2644

++				       struct bfq_queue *bfqq)

2645

++{

2646

++	if (bfqq) {

2647

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2648

++		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));

2649

++#endif

2650

++		bfq_mark_bfqq_must_alloc(bfqq);

2651

++		bfq_mark_bfqq_budget_new(bfqq);

2652

++		bfq_clear_bfqq_fifo_expire(bfqq);

2653

++

2654

++		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

2655

++

2656

++		bfq_log_bfqq(bfqd, bfqq,

2657

++			     "set_in_service_queue, cur-budget = %d",

2658

++			     bfqq->entity.budget);

2659

++	}

2660

++

2661

++	bfqd->in_service_queue = bfqq;

2662

++}

2663

++

2664

++/*

2665

++ * Get and set a new queue for service.

2666

++ */

2667

++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)

2668

++{

2669

++	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

2670

++

2671

++	__bfq_set_in_service_queue(bfqd, bfqq);

2672

++	return bfqq;

2673

++}

2674

++

2675

++/*

2676

++ * If enough samples have been computed, return the current max budget

2677

++ * stored in bfqd, which is dynamically updated according to the

2678

++ * estimated disk peak rate; otherwise return the default max budget

2679

++ */

2680

++static int bfq_max_budget(struct bfq_data *bfqd)

2681

++{

2682

++	if (bfqd->budgets_assigned < bfq_stats_min_budgets)

2683

++		return bfq_default_max_budget;

2684

++	else

2685

++		return bfqd->bfq_max_budget;

2686

++}

2687

++

2688

++/*

2689

++ * Return min budget, which is a fraction of the current or default

2690

++ * max budget (trying with 1/32)

2691

++ */

2692

++static int bfq_min_budget(struct bfq_data *bfqd)

2693

++{

2694

++	if (bfqd->budgets_assigned < bfq_stats_min_budgets)

2695

++		return bfq_default_max_budget / 32;

2696

++	else

2697

++		return bfqd->bfq_max_budget / 32;

2698

++}

2699

++

2700

++static void bfq_arm_slice_timer(struct bfq_data *bfqd)

2701

++{

2702

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2703

++	struct bfq_io_cq *bic;

2704

++	unsigned long sl;

2705

++

2706

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

2707

++

2708

++	/* Processes have exited, don't wait. */

2709

++	bic = bfqd->in_service_bic;

2710

++	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)

2711

++		return;

2712

++

2713

++	bfq_mark_bfqq_wait_request(bfqq);

2714

++

2715

++	/*

2716

++	 * We don't want to idle for seeks, but we do want to allow

2717

++	 * fair distribution of slice time for a process doing back-to-back

2718

++	 * seeks. So allow a little bit of time for it to submit a new rq.

2719

++	 *

2720

++	 * To prevent processes with (partly) seeky workloads from

2721

++	 * being too ill-treated, grant them a small fraction of the

2722

++	 * assigned budget before reducing the waiting time to

2723

++	 * BFQ_MIN_TT. This happened to help reduce latency.

2724

++	 */

2725

++	sl = bfqd->bfq_slice_idle;

2726

++	/*

2727

++	 * Unless the queue is being weight-raised or the scenario is

2728

++	 * asymmetric, grant only minimum idle time if the queue either

2729

++	 * has been seeky for long enough or has already proved to be

2730

++	 * constantly seeky.

2731

++	 */

2732

++	if (bfq_sample_valid(bfqq->seek_samples) &&

2733

++	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >

2734

++				  bfq_max_budget(bfqq->bfqd) / 8) ||

2735

++	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&

2736

++	    bfq_symmetric_scenario(bfqd))

2737

++		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));

2738

++	else if (bfqq->wr_coeff > 1)

2739

++		sl = sl * 3;

2740

++	bfqd->last_idling_start = ktime_get();

2741

++	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);

2742

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2743

++	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));

2744

++#endif

2745

++	bfq_log(bfqd, "arm idle: %u/%u ms",

2746

++		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));

2747

++}

2748

++

2749

++/*

2750

++ * Set the maximum time for the in-service queue to consume its

2751

++ * budget. This prevents seeky processes from lowering the disk

2752

++ * throughput (always guaranteed with a time slice scheme as in CFQ).

2753

++ */

2754

++static void bfq_set_budget_timeout(struct bfq_data *bfqd)

2755

++{

2756

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2757

++	unsigned int timeout_coeff;

2758

++	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)

2759

++		timeout_coeff = 1;

2760

++	else

2761

++		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

2762

++

2763

++	bfqd->last_budget_start = ktime_get();

2764

++

2765

++	bfq_clear_bfqq_budget_new(bfqq);

2766

++	bfqq->budget_timeout = jiffies +

2767

++		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;

2768

++

2769

++	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",

2770

++		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *

2771

++		timeout_coeff));

2772

++}

2773

++

2774

++/*

2775

++ * Move request from internal lists to the request queue dispatch list.

2776

++ */

2777

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)

2778

++{

2779

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2780

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2781

++

2782

++	/*

2783

++	 * For consistency, the next instruction should have been executed

2784

++	 * after removing the request from the queue and dispatching it.

2785

++	 * We execute instead this instruction before bfq_remove_request()

2786

++	 * (and hence introduce a temporary inconsistency), for efficiency.

2787

++	 * In fact, in a forced_dispatch, this prevents two counters related

2788

++	 * to bfqq->dispatched from being uselessly decremented if bfqq

2789

++	 * is not in service, and then incremented again after

2790

++	 * incrementing bfqq->dispatched.

2791

++	 */

2792

++	bfqq->dispatched++;

2793

++	bfq_remove_request(rq);

2794

++	elv_dispatch_sort(q, rq);

2795

++

2796

++	if (bfq_bfqq_sync(bfqq))

2797

++		bfqd->sync_flight++;

2798

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2799

++	bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq),

2800

++				   rq->cmd_flags);

2801

++#endif

2802

++}

2803

++

2804

++/*

2805

++ * Return expired entry, or NULL to just start from scratch in rbtree.

2806

++ */

2807

++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

2808

++{

2809

++	struct request *rq = NULL;

2810

++

2811

++	if (bfq_bfqq_fifo_expire(bfqq))

2812

++		return NULL;

2813

++

2814

++	bfq_mark_bfqq_fifo_expire(bfqq);

2815

++

2816

++	if (list_empty(&bfqq->fifo))

2817

++		return NULL;

2818

++

2819

++	rq = rq_entry_fifo(bfqq->fifo.next);

2820

++

2821

++	if (time_before(jiffies, rq->fifo_time))

2822

++		return NULL;

2823

++

2824

++	return rq;

2825

++}

2826

++

2827

++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)

2828

++{

2829

++	struct bfq_entity *entity = &bfqq->entity;

2830

++	return entity->budget - entity->service;

2831

++}

2832

++

2833

++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2834

++{

2835

++	BUG_ON(bfqq != bfqd->in_service_queue);

2836

++

2837

++	__bfq_bfqd_reset_in_service(bfqd);

2838

++

2839

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2840

++		/*

2841

++		 * Overloading budget_timeout field to store the time

2842

++		 * at which the queue remains with no backlog; used by

2843

++		 * the weight-raising mechanism.

2844

++		 */

2845

++		bfqq->budget_timeout = jiffies;

2846

++		bfq_del_bfqq_busy(bfqd, bfqq, 1);

2847

++	} else

2848

++		bfq_activate_bfqq(bfqd, bfqq);

2849

++}

2850

++

2851

++/**

2852

++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.

2853

++ * @bfqd: device data.

2854

++ * @bfqq: queue to update.

2855

++ * @reason: reason for expiration.

2856

++ *

2857

++ * Handle the feedback on @bfqq budget at queue expiration.

2858

++ * See the body for detailed comments.

2859

++ */

2860

++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

2861

++				     struct bfq_queue *bfqq,

2862

++				     enum bfqq_expiration reason)

2863

++{

2864

++	struct request *next_rq;

2865

++	int budget, min_budget;

2866

++

2867

++	budget = bfqq->max_budget;

2868

++	min_budget = bfq_min_budget(bfqd);

2869

++

2870

++	BUG_ON(bfqq != bfqd->in_service_queue);

2871

++

2872

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",

2873

++		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));

2874

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",

2875

++		budget, bfq_min_budget(bfqd));

2876

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",

2877

++		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

2878

++

2879

++	if (bfq_bfqq_sync(bfqq)) {

2880

++		switch (reason) {

2881

++		/*

2882

++		 * Caveat: in all the following cases we trade latency

2883

++		 * for throughput.

2884

++		 */

2885

++		case BFQ_BFQQ_TOO_IDLE:

2886

++			/*

2887

++			 * This is the only case where we may reduce

2888

++			 * the budget: if there is no request of the

2889

++			 * process still waiting for completion, then

2890

++			 * we assume (tentatively) that the timer has

2891

++			 * expired because the batch of requests of

2892

++			 * the process could have been served with a

2893

++			 * smaller budget.  Hence, betting that

2894

++			 * process will behave in the same way when it

2895

++			 * becomes backlogged again, we reduce its

2896

++			 * next budget.  As long as we guess right,

2897

++			 * this budget cut reduces the latency

2898

++			 * experienced by the process.

2899

++			 *

2900

++			 * However, if there are still outstanding

2901

++			 * requests, then the process may have not yet

2902

++			 * issued its next request just because it is

2903

++			 * still waiting for the completion of some of

2904

++			 * the still outstanding ones.  So in this

2905

++			 * subcase we do not reduce its budget, on the

2906

++			 * contrary we increase it to possibly boost

2907

++			 * the throughput, as discussed in the

2908

++			 * comments to the BUDGET_TIMEOUT case.

2909

++			 */

2910

++			if (bfqq->dispatched > 0) /* still outstanding reqs */

2911

++				budget = min(budget * 2, bfqd->bfq_max_budget);

2912

++			else {

2913

++				if (budget > 5 * min_budget)

2914

++					budget -= 4 * min_budget;

2915

++				else

2916

++					budget = min_budget;

2917

++			}

2918

++			break;

2919

++		case BFQ_BFQQ_BUDGET_TIMEOUT:

2920

++			/*

2921

++			 * We double the budget here because: 1) it

2922

++			 * gives the chance to boost the throughput if

2923

++			 * this is not a seeky process (which may have

2924

++			 * bumped into this timeout because of, e.g.,

2925

++			 * ZBR), 2) together with charge_full_budget

2926

++			 * it helps give seeky processes higher

2927

++			 * timestamps, and hence be served less

2928

++			 * frequently.

2929

++			 */

2930

++			budget = min(budget * 2, bfqd->bfq_max_budget);

2931

++			break;

2932

++		case BFQ_BFQQ_BUDGET_EXHAUSTED:

2933

++			/*

2934

++			 * The process still has backlog, and did not

2935

++			 * let either the budget timeout or the disk

2936

++			 * idling timeout expire. Hence it is not

2937

++			 * seeky, has a short thinktime and may be

2938

++			 * happy with a higher budget too. So

2939

++			 * definitely increase the budget of this good

2940

++			 * candidate to boost the disk throughput.

2941

++			 */

2942

++			budget = min(budget * 4, bfqd->bfq_max_budget);

2943

++			break;

2944

++		case BFQ_BFQQ_NO_MORE_REQUESTS:

2945

++		       /*

2946

++			* Leave the budget unchanged.

2947

++			*/

2948

++		default:

2949

++			return;

2950

++		}

2951

++	} else

2952

++		/*

2953

++		 * Async queues get always the maximum possible budget

2954

++		 * (their ability to dispatch is limited by

2955

++		 * @bfqd->bfq_max_budget_async_rq).

2956

++		 */

2957

++		budget = bfqd->bfq_max_budget;

2958

++

2959

++	bfqq->max_budget = budget;

2960

++

2961

++	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&

2962

++	    !bfqd->bfq_user_max_budget)

2963

++		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

2964

++

2965

++	/*

2966

++	 * Make sure that we have enough budget for the next request.

2967

++	 * Since the finish time of the bfqq must be kept in sync with

2968

++	 * the budget, be sure to call __bfq_bfqq_expire() after the

2969

++	 * update.

2970

++	 */

2971

++	next_rq = bfqq->next_rq;

2972

++	if (next_rq)

2973

++		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,

2974

++					    bfq_serv_to_charge(next_rq, bfqq));

2975

++	else

2976

++		bfqq->entity.budget = bfqq->max_budget;

2977

++

2978

++	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",

2979

++			next_rq ? blk_rq_sectors(next_rq) : 0,

2980

++			bfqq->entity.budget);

2981

++}

2982

++

2983

++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)

2984

++{

2985

++	unsigned long max_budget;

2986

++

2987

++	/*

2988

++	 * The max_budget calculated when autotuning is equal to the

2989

++	 * amount of sectors transferred in timeout_sync at the

2990

++	 * estimated peak rate.

2991

++	 */

2992

++	max_budget = (unsigned long)(peak_rate * 1000 *

2993

++				     timeout >> BFQ_RATE_SHIFT);

2994

++

2995

++	return max_budget;

2996

++}

2997

++

2998

++/*

2999

++ * In addition to updating the peak rate, checks whether the process

3000

++ * is "slow", and returns 1 if so. This slow flag is used, in addition

3001

++ * to the budget timeout, to reduce the amount of service provided to

3002

++ * seeky processes, and hence reduce their chances to lower the

3003

++ * throughput. See the code for more details.

3004

++ */

3005

++static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

3006

++				 bool compensate, enum bfqq_expiration reason)

3007

++{

3008

++	u64 bw, usecs, expected, timeout;

3009

++	ktime_t delta;

3010

++	int update = 0;

3011

++

3012

++	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))

3013

++		return false;

3014

++

3015

++	if (compensate)

3016

++		delta = bfqd->last_idling_start;

3017

++	else

3018

++		delta = ktime_get();

3019

++	delta = ktime_sub(delta, bfqd->last_budget_start);

3020

++	usecs = ktime_to_us(delta);

3021

++

3022

++	/* Don't trust short/unrealistic values. */

3023

++	if (usecs < 100 || usecs >= LONG_MAX)

3024

++		return false;

3025

++

3026

++	/*

3027

++	 * Calculate the bandwidth for the last slice.  We use a 64 bit

3028

++	 * value to store the peak rate, in sectors per usec in fixed

3029

++	 * point math.  We do so to have enough precision in the estimate

3030

++	 * and to avoid overflows.

3031

++	 */

3032

++	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;

3033

++	do_div(bw, (unsigned long)usecs);

3034

++

3035

++	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

3036

++

3037

++	/*

3038

++	 * Use only long (> 20ms) intervals to filter out spikes for

3039

++	 * the peak rate estimation.

3040

++	 */

3041

++	if (usecs > 20000) {

3042

++		if (bw > bfqd->peak_rate ||

3043

++		   (!BFQQ_SEEKY(bfqq) &&

3044

++		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {

3045

++			bfq_log(bfqd, "measured bw =%llu", bw);

3046

++			/*

3047

++			 * To smooth oscillations use a low-pass filter with

3048

++			 * alpha=7/8, i.e.,

3049

++			 * new_rate = (7/8) * old_rate + (1/8) * bw

3050

++			 */

3051

++			do_div(bw, 8);

3052

++			if (bw == 0)

3053

++				return 0;

3054

++			bfqd->peak_rate *= 7;

3055

++			do_div(bfqd->peak_rate, 8);

3056

++			bfqd->peak_rate += bw;

3057

++			update = 1;

3058

++			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);

3059

++		}

3060

++

3061

++		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;

3062

++

3063

++		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)

3064

++			bfqd->peak_rate_samples++;

3065

++

3066

++		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&

3067

++		    update) {

3068

++			int dev_type = blk_queue_nonrot(bfqd->queue);

3069

++			if (bfqd->bfq_user_max_budget == 0) {

3070

++				bfqd->bfq_max_budget =

3071

++					bfq_calc_max_budget(bfqd->peak_rate,

3072

++							    timeout);

3073

++				bfq_log(bfqd, "new max_budget=%d",

3074

++					bfqd->bfq_max_budget);

3075

++			}

3076

++			if (bfqd->device_speed == BFQ_BFQD_FAST &&

3077

++			    bfqd->peak_rate < device_speed_thresh[dev_type]) {

3078

++				bfqd->device_speed = BFQ_BFQD_SLOW;

3079

++				bfqd->RT_prod = R_slow[dev_type] *

3080

++						T_slow[dev_type];

3081

++			} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&

3082

++			    bfqd->peak_rate > device_speed_thresh[dev_type]) {

3083

++				bfqd->device_speed = BFQ_BFQD_FAST;

3084

++				bfqd->RT_prod = R_fast[dev_type] *

3085

++						T_fast[dev_type];

3086

++			}

3087

++		}

3088

++	}

3089

++

3090

++	/*

3091

++	 * If the process has been served for a too short time

3092

++	 * interval to let its possible sequential accesses prevail on

3093

++	 * the initial seek time needed to move the disk head on the

3094

++	 * first sector it requested, then give the process a chance

3095

++	 * and for the moment return false.

3096

++	 */

3097

++	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)

3098

++		return false;

3099

++

3100

++	/*

3101

++	 * A process is considered ``slow'' (i.e., seeky, so that we

3102

++	 * cannot treat it fairly in the service domain, as it would

3103

++	 * slow down too much the other processes) if, when a slice

3104

++	 * ends for whatever reason, it has received service at a

3105

++	 * rate that would not be high enough to complete the budget

3106

++	 * before the budget timeout expiration.

3107

++	 */

3108

++	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;

3109

++

3110

++	/*

3111

++	 * Caveat: processes doing IO in the slower disk zones will

3112

++	 * tend to be slow(er) even if not seeky. And the estimated

3113

++	 * peak rate will actually be an average over the disk

3114

++	 * surface. Hence, to not be too harsh with unlucky processes,

3115

++	 * we keep a budget/3 margin of safety before declaring a

3116

++	 * process slow.

3117

++	 */

3118

++	return expected > (4 * bfqq->entity.budget) / 3;

3119

++}

3120

++

3121

++/*

3122

++ * To be deemed as soft real-time, an application must meet two

3123

++ * requirements. First, the application must not require an average

3124

++ * bandwidth higher than the approximate bandwidth required to playback or

3125

++ * record a compressed high-definition video.

3126

++ * The next function is invoked on the completion of the last request of a

3127

++ * batch, to compute the next-start time instant, soft_rt_next_start, such

3128

++ * that, if the next request of the application does not arrive before

3129

++ * soft_rt_next_start, then the above requirement on the bandwidth is met.

3130

++ *

3131

++ * The second requirement is that the request pattern of the application is

3132

++ * isochronous, i.e., that, after issuing a request or a batch of requests,

3133

++ * the application stops issuing new requests until all its pending requests

3134

++ * have been completed. After that, the application may issue a new batch,

3135

++ * and so on.

3136

++ * For this reason the next function is invoked to compute

3137

++ * soft_rt_next_start only for applications that meet this requirement,

3138

++ * whereas soft_rt_next_start is set to infinity for applications that do

3139

++ * not.

3140

++ *

3141

++ * Unfortunately, even a greedy application may happen to behave in an

3142

++ * isochronous way if the CPU load is high. In fact, the application may

3143

++ * stop issuing requests while the CPUs are busy serving other processes,

3144

++ * then restart, then stop again for a while, and so on. In addition, if

3145

++ * the disk achieves a low enough throughput with the request pattern

3146

++ * issued by the application (e.g., because the request pattern is random

3147

++ * and/or the device is slow), then the application may meet the above

3148

++ * bandwidth requirement too. To prevent such a greedy application from being

3149

++ * deemed as soft real-time, a further rule is used in the computation of

3150

++ * soft_rt_next_start: soft_rt_next_start must be higher than the current

3151

++ * time plus the maximum time for which the arrival of a request is waited

3152

++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.

3153

++ * This filters out greedy applications, as the latter issue instead their

3154

++ * next request as soon as possible after the last one has been completed

3155

++ * (in contrast, when a batch of requests is completed, a soft real-time

3156

++ * application spends some time processing data).

3157

++ *

3158

++ * Unfortunately, the last filter may easily generate false positives if

3159

++ * only bfqd->bfq_slice_idle is used as a reference time interval and one

3160

++ * or both the following cases occur:

3161

++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher

3162

++ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with

3163

++ *    HZ=100.

3164

++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing

3165

++ *    for a while, then suddenly 'jump' by several units to recover the lost

3166

++ *    increments. This seems to happen, e.g., inside virtual machines.

3167

++ * To address this issue, we do not use as a reference time interval just

3168

++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In

3169

++ * particular we add the minimum number of jiffies for which the filter

3170

++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual

3171

++ * machines.

3172

++ */

3173

++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,

3174

++						struct bfq_queue *bfqq)

3175

++{

3176

++	return max(bfqq->last_idle_bklogged +

3177

++		   HZ * bfqq->service_from_backlogged /

3178

++		   bfqd->bfq_wr_max_softrt_rate,

3179

++		   jiffies + bfqq->bfqd->bfq_slice_idle + 4);

3180

++}

3181

++

3182

++/*

3183

++ * Return the largest-possible time instant such that, for as long as possible,

3184

++ * the current time will be lower than this time instant according to the macro

3185

++ * time_is_before_jiffies().

3186

++ */

3187

++static unsigned long bfq_infinity_from_now(unsigned long now)

3188

++{

3189

++	return now + ULONG_MAX / 2;

3190

++}

3191

++

3192

++/**

3193

++ * bfq_bfqq_expire - expire a queue.

3194

++ * @bfqd: device owning the queue.

3195

++ * @bfqq: the queue to expire.

3196

++ * @compensate: if true, compensate for the time spent idling.

3197

++ * @reason: the reason causing the expiration.

3198

++ *

3199

++ *

3200

++ * If the process associated to the queue is slow (i.e., seeky), or in

3201

++ * case of budget timeout, or, finally, if it is async, we

3202

++ * artificially charge it an entire budget (independently of the

3203

++ * actual service it received). As a consequence, the queue will get

3204

++ * higher timestamps than the correct ones upon reactivation, and

3205

++ * hence it will be rescheduled as if it had received more service

3206

++ * than what it actually received. In the end, this class of processes

3207

++ * will receive less service in proportion to how slowly they consume

3208

++ * their budgets (and hence how seriously they tend to lower the

3209

++ * throughput).

3210

++ *

3211

++ * In contrast, when a queue expires because it has been idling for

3212

++ * too much or because it exhausted its budget, we do not touch the

3213

++ * amount of service it has received. Hence when the queue will be

3214

++ * reactivated and its timestamps updated, the latter will be in sync

3215

++ * with the actual service received by the queue until expiration.

3216

++ *

3217

++ * Charging a full budget to the first type of queues and the exact

3218

++ * service to the others has the effect of using the WF2Q+ policy to

3219

++ * schedule the former on a timeslice basis, without violating the

3220

++ * service domain guarantees of the latter.

3221

++ */

3222

++static void bfq_bfqq_expire(struct bfq_data *bfqd,

3223

++			    struct bfq_queue *bfqq,

3224

++			    bool compensate,

3225

++			    enum bfqq_expiration reason)

3226

++{

3227

++	bool slow;

3228

++	BUG_ON(bfqq != bfqd->in_service_queue);

3229

++

3230

++	/*

3231

++	 * Update disk peak rate for autotuning and check whether the

3232

++	 * process is slow (see bfq_update_peak_rate).

3233

++	 */

3234

++	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);

3235

++

3236

++	/*

3237

++	 * As explained above, 'punish' slow (i.e., seeky), timed-out

3238

++	 * and async queues, to favor sequential sync workloads.

3239

++	 *

3240

++	 * Processes doing I/O in the slower disk zones will tend to be

3241

++	 * slow(er) even if not seeky. Hence, since the estimated peak

3242

++	 * rate is actually an average over the disk surface, these

3243

++	 * processes may timeout just for bad luck. To avoid punishing

3244

++	 * them we do not charge a full budget to a process that

3245

++	 * succeeded in consuming at least 2/3 of its budget.

3246

++	 */

3247

++	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

3248

++		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))

3249

++		bfq_bfqq_charge_full_budget(bfqq);

3250

++

3251

++	bfqq->service_from_backlogged += bfqq->entity.service;

3252

++

3253

++	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

3254

++	    !bfq_bfqq_constantly_seeky(bfqq)) {

3255

++		bfq_mark_bfqq_constantly_seeky(bfqq);

3256

++		if (!blk_queue_nonrot(bfqd->queue))

3257

++			bfqd->const_seeky_busy_in_flight_queues++;

3258

++	}

3259

++

3260

++	if (reason == BFQ_BFQQ_TOO_IDLE &&

3261

++	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10 )

3262

++		bfq_clear_bfqq_IO_bound(bfqq);

3263

++

3264

++	if (bfqd->low_latency && bfqq->wr_coeff == 1)

3265

++		bfqq->last_wr_start_finish = jiffies;

3266

++

3267

++	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&

3268

++	    RB_EMPTY_ROOT(&bfqq->sort_list)) {

3269

++		/*

3270

++		 * If we get here, and there are no outstanding requests,

3271

++		 * then the request pattern is isochronous (see the comments

3272

++		 * to the function bfq_bfqq_softrt_next_start()). Hence we

3273

++		 * can compute soft_rt_next_start. If, instead, the queue

3274

++		 * still has outstanding requests, then we have to wait

3275

++		 * for the completion of all the outstanding requests to

3276

++		 * discover whether the request pattern is actually

3277

++		 * isochronous.

3278

++		 */

3279

++		if (bfqq->dispatched == 0)

3280

++			bfqq->soft_rt_next_start =

3281

++				bfq_bfqq_softrt_next_start(bfqd, bfqq);

3282

++		else {

3283

++			/*

3284

++			 * The application is still waiting for the

3285

++			 * completion of one or more requests:

3286

++			 * prevent it from possibly being incorrectly

3287

++			 * deemed as soft real-time by setting its

3288

++			 * soft_rt_next_start to infinity. In fact,

3289

++			 * without this assignment, the application

3290

++			 * would be incorrectly deemed as soft

3291

++			 * real-time if:

3292

++			 * 1) it issued a new request before the

3293

++			 *    completion of all its in-flight

3294

++			 *    requests, and

3295

++			 * 2) at that time, its soft_rt_next_start

3296

++			 *    happened to be in the past.

3297

++			 */

3298

++			bfqq->soft_rt_next_start =

3299

++				bfq_infinity_from_now(jiffies);

3300

++			/*

3301

++			 * Schedule an update of soft_rt_next_start to when

3302

++			 * the task may be discovered to be isochronous.

3303

++			 */

3304

++			bfq_mark_bfqq_softrt_update(bfqq);

3305

++		}

3306

++	}

3307

++

3308

++	bfq_log_bfqq(bfqd, bfqq,

3309

++		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,

3310

++		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

3311

++

3312

++	/*

3313

++	 * Increase, decrease or leave budget unchanged according to

3314

++	 * reason.

3315

++	 */

3316

++	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);

3317

++	__bfq_bfqq_expire(bfqd, bfqq);

3318

++}

3319

++

3320

++/*

3321

++ * Budget timeout is not implemented through a dedicated timer, but

3322

++ * just checked on request arrivals and completions, as well as on

3323

++ * idle timer expirations.

3324

++ */

3325

++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)

3326

++{

3327

++	if (bfq_bfqq_budget_new(bfqq) ||

3328

++	    time_before(jiffies, bfqq->budget_timeout))

3329

++		return false;

3330

++	return true;

3331

++}

3332

++

3333

++/*

3334

++ * If we expire a queue that is waiting for the arrival of a new

3335

++ * request, we may prevent the fictitious timestamp back-shifting that

3336

++ * allows the guarantees of the queue to be preserved (see [1] for

3337

++ * this tricky aspect). Hence we return true only if this condition

3338

++ * does not hold, or if the queue is slow enough to deserve only to be

3339

++ * kicked off for preserving a high throughput.

3340

++*/

3341

++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)

3342

++{

3343

++	bfq_log_bfqq(bfqq->bfqd, bfqq,

3344

++		"may_budget_timeout: wait_request %d left %d timeout %d",

3345

++		bfq_bfqq_wait_request(bfqq),

3346

++			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,

3347

++		bfq_bfqq_budget_timeout(bfqq));

3348

++

3349

++	return (!bfq_bfqq_wait_request(bfqq) ||

3350

++		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)

3351

++		&&

3352

++		bfq_bfqq_budget_timeout(bfqq);

3353

++}

3354

++

3355

++/*

3356

++ * For a queue that becomes empty, device idling is allowed only if

3357

++ * this function returns true for that queue. As a consequence, since

3358

++ * device idling plays a critical role for both throughput boosting

3359

++ * and service guarantees, the return value of this function plays a

3360

++ * critical role as well.

3361

++ *

3362

++ * In a nutshell, this function returns true only if idling is

3363

++ * beneficial for throughput or, even if detrimental for throughput,

3364

++ * idling is however necessary to preserve service guarantees (low

3365

++ * latency, desired throughput distribution, ...). In particular, on

3366

++ * NCQ-capable devices, this function tries to return false, so as to

3367

++ * help keep the drives' internal queues full, whenever this helps the

3368

++ * device boost the throughput without causing any service-guarantee

3369

++ * issue.

3370

++ *

3371

++ * In more detail, the return value of this function is obtained by,

3372

++ * first, computing a number of boolean variables that take into

3373

++ * account throughput and service-guarantee issues, and, then,

3374

++ * combining these variables in a logical expression. Most of the

3375

++ * issues taken into account are not trivial. We discuss these issues

3376

++ * while introducing the variables.

3377

++ */

3378

++static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

3379

++{

3380

++	struct bfq_data *bfqd = bfqq->bfqd;

3381

++	bool idling_boosts_thr, idling_boosts_thr_without_issues,

3382

++		all_queues_seeky, on_hdd_and_not_all_queues_seeky,

3383

++		idling_needed_for_service_guarantees,

3384

++		asymmetric_scenario;

3385

++

3386

++	/*

3387

++	 * The next variable takes into account the cases where idling

3388

++	 * boosts the throughput.

3389

++	 *

3390

++	 * The value of the variable is computed considering, first, that

3391

++	 * idling is virtually always beneficial for the throughput if:

3392

++	 * (a) the device is not NCQ-capable, or

3393

++	 * (b) regardless of the presence of NCQ, the device is rotational

3394

++	 *     and the request pattern for bfqq is I/O-bound and sequential.

3395

++	 *

3396

++	 * Secondly, and in contrast to the above item (b), idling an

3397

++	 * NCQ-capable flash-based device would not boost the

3398

++	 * throughput even with sequential I/O; rather it would lower

3399

++	 * the throughput in proportion to how fast the device

3400

++	 * is. Accordingly, the next variable is true if any of the

3401

++	 * above conditions (a) and (b) is true, and, in particular,

3402

++	 * happens to be false if bfqd is an NCQ-capable flash-based

3403

++	 * device.

3404

++	 */

3405

++	idling_boosts_thr = !bfqd->hw_tag ||

3406

++		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&

3407

++		 bfq_bfqq_idle_window(bfqq)) ;

3408

++

3409

++	/*

3410

++	 * The value of the next variable,

3411

++	 * idling_boosts_thr_without_issues, is equal to that of

3412

++	 * idling_boosts_thr, unless a special case holds. In this

3413

++	 * special case, described below, idling may cause problems to

3414

++	 * weight-raised queues.

3415

++	 *

3416

++	 * When the request pool is saturated (e.g., in the presence

3417

++	 * of write hogs), if the processes associated with

3418

++	 * non-weight-raised queues ask for requests at a lower rate,

3419

++	 * then processes associated with weight-raised queues have a

3420

++	 * higher probability to get a request from the pool

3421

++	 * immediately (or at least soon) when they need one. Thus

3422

++	 * they have a higher probability to actually get a fraction

3423

++	 * of the device throughput proportional to their high

3424

++	 * weight. This is especially true with NCQ-capable drives,

3425

++	 * which enqueue several requests in advance, and further

3426

++	 * reorder internally-queued requests.

3427

++	 *

3428

++	 * For this reason, we force to false the value of

3429

++	 * idling_boosts_thr_without_issues if there are weight-raised

3430

++	 * busy queues. In this case, and if bfqq is not weight-raised,

3431

++	 * this guarantees that the device is not idled for bfqq (if,

3432

++	 * instead, bfqq is weight-raised, then idling will be

3433

++	 * guaranteed by another variable, see below). Combined with

3434

++	 * the timestamping rules of BFQ (see [1] for details), this

3435

++	 * behavior causes bfqq, and hence any sync non-weight-raised

3436

++	 * queue, to get a lower number of requests served, and thus

3437

++	 * to ask for a lower number of requests from the request

3438

++	 * pool, before the busy weight-raised queues get served

3439

++	 * again. This often mitigates starvation problems in the

3440

++	 * presence of heavy write workloads and NCQ, thereby

3441

++	 * guaranteeing a higher application and system responsiveness

3442

++	 * in these hostile scenarios.

3443

++	 */

3444

++	idling_boosts_thr_without_issues = idling_boosts_thr &&

3445

++		bfqd->wr_busy_queues == 0;

3446

++

3447

++	/*

3448

++	 * There are then two cases where idling must be performed not

3449

++	 * for throughput concerns, but to preserve service

3450

++	 * guarantees. In the description of these cases, we say, for

3451

++	 * short, that a queue is sequential/random if the process

3452

++	 * associated to the queue issues sequential/random requests

3453

++	 * (in the second case the queue may be tagged as seeky or

3454

++	 * even constantly_seeky).

3455

++	 *

3456

++	 * To introduce the first case, we note that, since

3457

++	 * bfq_bfqq_idle_window(bfqq) is false if the device is

3458

++	 * NCQ-capable and bfqq is random (see

3459

++	 * bfq_update_idle_window()), then, from the above two

3460

++	 * assignments it follows that

3461

++	 * idling_boosts_thr_without_issues is false if the device is

3462

++	 * NCQ-capable and bfqq is random. Therefore, for this case,

3463

++	 * device idling would never be allowed if we used just

3464

++	 * idling_boosts_thr_without_issues to decide whether to allow

3465

++	 * it. And, beneficially, this would imply that throughput

3466

++	 * would always be boosted also with random I/O on NCQ-capable

3467

++	 * HDDs.

3468

++	 *

3469

++	 * But we must be careful on this point, to avoid an unfair

3470

++	 * treatment for bfqq. In fact, because of the same above

3471

++	 * assignments, idling_boosts_thr_without_issues is, on the

3472

++	 * other hand, true if 1) the device is an HDD and bfqq is

3473

++	 * sequential, and 2) there are no busy weight-raised

3474

++	 * queues. As a consequence, if we used just

3475

++	 * idling_boosts_thr_without_issues to decide whether to idle

3476

++	 * the device, then with an HDD we might easily bump into a

3477

++	 * scenario where queues that are sequential and I/O-bound

3478

++	 * would enjoy idling, whereas random queues would not. The

3479

++	 * latter might then get a low share of the device throughput,

3480

++	 * simply because the former would get many requests served

3481

++	 * after being set as in service, while the latter would not.

3482

++	 *

3483

++	 * To address this issue, we start by setting to true a

3484

++	 * sentinel variable, on_hdd_and_not_all_queues_seeky, if the

3485

++	 * device is rotational and not all queues with pending or

3486

++	 * in-flight requests are constantly seeky (i.e., there are

3487

++	 * active sequential queues, and bfqq might then be mistreated

3488

++	 * if it does not enjoy idling because it is random).

3489

++	 */

3490

++	all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&

3491

++			   bfqd->busy_in_flight_queues ==

3492

++			   bfqd->const_seeky_busy_in_flight_queues;

3493

++

3494

++	on_hdd_and_not_all_queues_seeky =

3495

++		!blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;

3496

++

3497

++	/*

3498

++	 * To introduce the second case where idling needs to be

3499

++	 * performed to preserve service guarantees, we can note that

3500

++	 * allowing the drive to enqueue more than one request at a

3501

++	 * time, and hence delegating de facto final scheduling

3502

++	 * decisions to the drive's internal scheduler, causes loss of

3503

++	 * control on the actual request service order. In particular,

3504

++	 * the critical situation is when requests from different

3505

++	 * processes happen to be present, at the same time, in the

3506

++	 * internal queue(s) of the drive. In such a situation, the

3507

++	 * drive, by deciding the service order of the

3508

++	 * internally-queued requests, does determine also the actual

3509

++	 * throughput distribution among these processes. But the

3510

++	 * drive typically has no notion or concern about per-process

3511

++	 * throughput distribution, and makes its decisions only on a

3512

++	 * per-request basis. Therefore, the service distribution

3513

++	 * enforced by the drive's internal scheduler is likely to

3514

++	 * coincide with the desired device-throughput distribution

3515

++	 * only in a completely symmetric scenario where:

3516

++	 * (i)  each of these processes must get the same throughput as

3517

++	 *      the others;

3518

++	 * (ii) all these processes have the same I/O pattern

3519

++	 *      (either sequential or random).

3520

++	 * In fact, in such a scenario, the drive will tend to treat

3521

++	 * the requests of each of these processes in about the same

3522

++	 * way as the requests of the others, and thus to provide

3523

++	 * each of these processes with about the same throughput

3524

++	 * (which is exactly the desired throughput distribution). In

3525

++	 * contrast, in any asymmetric scenario, device idling is

3526

++	 * certainly needed to guarantee that bfqq receives its

3527

++	 * assigned fraction of the device throughput (see [1] for

3528

++	 * details).

3529

++	 *

3530

++	 * We address this issue by controlling, actually, only the

3531

++	 * symmetry sub-condition (i), i.e., provided that

3532

++	 * sub-condition (i) holds, idling is not performed,

3533

++	 * regardless of whether sub-condition (ii) holds. In other

3534

++	 * words, only if sub-condition (i) does not hold, then idling is

3535

++	 * allowed, and the device tends to be prevented from queueing

3536

++	 * many requests, possibly of several processes. The reason

3537

++	 * for not controlling also sub-condition (ii) is that, first,

3538

++	 * in the case of an HDD, the asymmetry in terms of types of

3539

++	 * I/O patterns is already taken into account in the above

3540

++	 * sentinel variable

3541

++	 * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a

3542

++	 * flash-based device, we prefer however to privilege

3543

++	 * throughput (and idling lowers throughput for this type of

3544

++	 * devices), for the following reasons:

3545

++	 * 1) differently from HDDs, the service time of random

3546

++	 *    requests is not orders of magnitude higher than the service

3547

++	 *    time of sequential requests; thus, even if processes doing

3548

++	 *    sequential I/O get a preferential treatment with respect to

3549

++	 *    others doing random I/O, the consequences are not as

3550

++	 *    dramatic as with HDDs;

3551

++	 * 2) if a process doing random I/O does need strong

3552

++	 *    throughput guarantees, it is hopefully already being

3553

++	 *    weight-raised, or the user is likely to have assigned it a

3554

++	 *    higher weight than the other processes (and thus

3555

++	 *    sub-condition (i) is likely to be false, which triggers

3556

++	 *    idling).

3557

++	 *

3558

++	 * According to the above considerations, the next variable is

3559

++	 * true (only) if sub-condition (i) does not hold. To compute the

3560

++	 * value of this variable, we not only use the return value of

3561

++	 * the function bfq_symmetric_scenario(), but also check

3562

++	 * whether bfqq is being weight-raised, because

3563

++	 * bfq_symmetric_scenario() does not take into account also

3564

++	 * weight-raised queues (see comments to

3565

++	 * bfq_weights_tree_add()).

3566

++	 *

3567

++	 * As a side note, it is worth considering that the above

3568

++	 * device-idling countermeasures may however fail in the

3569

++	 * following unlucky scenario: if idling is (correctly)

3570

++	 * disabled in a time period during which all symmetry

3571

++	 * sub-conditions hold, and hence the device is allowed to

3572

++	 * enqueue many requests, but at some later point in time some

3573

++	 * sub-condition ceases to hold, then it may become impossible

3574

++	 * to let requests be served in the desired order until all

3575

++	 * the requests already queued in the device have been served.

3576

++	 */

3577

++	asymmetric_scenario = bfqq->wr_coeff > 1 ||

3578

++		!bfq_symmetric_scenario(bfqd);

3579

++

3580

++	/*

3581

++	 * Finally, there is a case where maximizing throughput is the

3582

++	 * best choice even if it may cause unfairness toward

3583

++	 * bfqq. Such a case is when bfqq became active in a burst of

3584

++	 * queue activations. Queues that became active during a large

3585

++	 * burst benefit only from throughput, as discussed in the

3586

++	 * comments to bfq_handle_burst. Thus, if bfqq became active

3587

++	 * in a burst and not idling the device maximizes throughput,

3588

++	 * then the device must not be idled, because not idling the

3589

++	 * device provides bfqq and all other queues in the burst with

3590

++	 * maximum benefit. Combining this and the two cases above, we

3591

++	 * can now establish when idling is actually needed to

3592

++	 * preserve service guarantees.

3593

++	 */

3594

++	idling_needed_for_service_guarantees =

3595

++		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&

3596

++		!bfq_bfqq_in_large_burst(bfqq);

3597

++

3598

++	/*

3599

++	 * We have now all the components we need to compute the return

3600

++	 * value of the function, which is true only if both the following

3601

++	 * conditions hold:

3602

++	 * 1) bfqq is sync, because idling makes sense only for sync queues;

3603

++	 * 2) idling either boosts the throughput (without issues), or

3604

++	 *    is necessary to preserve service guarantees.

3605

++	 */

3606

++	return bfq_bfqq_sync(bfqq) &&

3607

++		(idling_boosts_thr_without_issues ||

3608

++		 idling_needed_for_service_guarantees);

3609

++}

3610

++

3611

++/*

3612

++ * If the in-service queue is empty but the function bfq_bfqq_may_idle

3613

++ * returns true, then:

3614

++ * 1) the queue must remain in service and cannot be expired, and

3615

++ * 2) the device must be idled to wait for the possible arrival of a new

3616

++ *    request for the queue.

3617

++ * See the comments to the function bfq_bfqq_may_idle for the reasons

3618

++ * why performing device idling is the best choice to boost the throughput

3619

++ * and preserve service guarantees when bfq_bfqq_may_idle itself

3620

++ * returns true.

3621

++ */

3622

++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)

3623

++{

3624

++	struct bfq_data *bfqd = bfqq->bfqd;

3625

++

3626

++	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&

3627

++	       bfq_bfqq_may_idle(bfqq);

3628

++}

3629

++

3630

++/*

3631

++ * Select a queue for service.  If we have a current queue in service,

3632

++ * check whether to continue servicing it, or retrieve and set a new one.

3633

++ */

3634

++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

3635

++{

3636

++	struct bfq_queue *bfqq;

3637

++	struct request *next_rq;

3638

++	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

3639

++

3640

++	bfqq = bfqd->in_service_queue;

3641

++	if (!bfqq)

3642

++		goto new_queue;

3643

++

3644

++	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

3645

++

3646

++	if (bfq_may_expire_for_budg_timeout(bfqq) &&

3647

++	    !timer_pending(&bfqd->idle_slice_timer) &&

3648

++	    !bfq_bfqq_must_idle(bfqq))

3649

++		goto expire;

3650

++

3651

++	next_rq = bfqq->next_rq;

3652

++	/*

3653

++	 * If bfqq has requests queued and it has enough budget left to

3654

++	 * serve them, keep the queue, otherwise expire it.

3655

++	 */

3656

++	if (next_rq) {

3657

++		if (bfq_serv_to_charge(next_rq, bfqq) >

3658

++			bfq_bfqq_budget_left(bfqq)) {

3659

++			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;

3660

++			goto expire;

3661

++		} else {

3662

++			/*

3663

++			 * The idle timer may be pending because we may

3664

++			 * not disable disk idling even when a new request

3665

++			 * arrives.

3666

++			 */

3667

++			if (timer_pending(&bfqd->idle_slice_timer)) {

3668

++				/*

3669

++				 * If we get here: 1) at least a new request

3670

++				 * has arrived but we have not disabled the

3671

++				 * timer because the request was too small,

3672

++				 * 2) then the block layer has unplugged

3673

++				 * the device, causing the dispatch to be

3674

++				 * invoked.

3675

++				 *

3676

++				 * Since the device is unplugged, now the

3677

++				 * requests are probably large enough to

3678

++				 * provide a reasonable throughput.

3679

++				 * So we disable idling.

3680

++				 */

3681

++				bfq_clear_bfqq_wait_request(bfqq);

3682

++				del_timer(&bfqd->idle_slice_timer);

3683

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

3684

++				bfqg_stats_update_idle_time(bfqq_group(bfqq));

3685

++#endif

3686

++			}

3687

++			goto keep_queue;

3688

++		}

3689

++	}

3690

++

3691

++	/*

3692

++	 * No requests pending. However, if the in-service queue is idling

3693

++	 * for a new request, or has requests waiting for a completion and

3694

++	 * may idle after their completion, then keep it anyway.

3695

++	 */

3696

++	if (timer_pending(&bfqd->idle_slice_timer) ||

3697

++	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {

3698

++		bfqq = NULL;

3699

++		goto keep_queue;

3700

++	}

3701

++

3702

++	reason = BFQ_BFQQ_NO_MORE_REQUESTS;

3703

++expire:

3704

++	bfq_bfqq_expire(bfqd, bfqq, false, reason);

3705

++new_queue:

3706

++	bfqq = bfq_set_in_service_queue(bfqd);

3707

++	bfq_log(bfqd, "select_queue: new queue %d returned",

3708

++		bfqq ? bfqq->pid : 0);

3709

++keep_queue:

3710

++	return bfqq;

3711

++}

3712

++

3713

++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)

3714

++{

3715

++	struct bfq_entity *entity = &bfqq->entity;

3716

++	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */

3717

++		bfq_log_bfqq(bfqd, bfqq,

3718

++			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",

3719

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

3720

++			jiffies_to_msecs(bfqq->wr_cur_max_time),

3721

++			bfqq->wr_coeff,

3722

++			bfqq->entity.weight, bfqq->entity.orig_weight);

3723

++

3724

++		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=

3725

++		       entity->orig_weight * bfqq->wr_coeff);

3726

++		if (entity->prio_changed)

3727

++			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

3728

++

3729

++		/*

3730

++		 * If the queue was activated in a burst, or

3731

++		 * too much time has elapsed from the beginning

3732

++		 * of this weight-raising period, then end weight

3733

++		 * raising.

3734

++		 */

3735

++		if (bfq_bfqq_in_large_burst(bfqq) ||

3736

++		    time_is_before_jiffies(bfqq->last_wr_start_finish +

3737

++					   bfqq->wr_cur_max_time)) {

3738

++			bfqq->last_wr_start_finish = jiffies;

3739

++			bfq_log_bfqq(bfqd, bfqq,

3740

++				     "wrais ending at %lu, rais_max_time %u",

3741

++				     bfqq->last_wr_start_finish,

3742

++				     jiffies_to_msecs(bfqq->wr_cur_max_time));

3743

++			bfq_bfqq_end_wr(bfqq);

3744

++		}

3745

++	}

3746

++	/* Update weight both if it must be raised and if it must be lowered */

3747

++	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))

3748

++		__bfq_entity_update_weight_prio(

3749

++			bfq_entity_service_tree(entity),

3750

++			entity);

3751

++}

3752

++

3753

++/*

3754

++ * Dispatch one request from bfqq, moving it to the request queue

3755

++ * dispatch list.

3756

++ */

3757

++static int bfq_dispatch_request(struct bfq_data *bfqd,

3758

++				struct bfq_queue *bfqq)

3759

++{

3760

++	int dispatched = 0;

3761

++	struct request *rq;

3762

++	unsigned long service_to_charge;

3763

++

3764

++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));

3765

++

3766

++	/* Follow expired path, else get first next available. */

3767

++	rq = bfq_check_fifo(bfqq);

3768

++	if (!rq)

3769

++		rq = bfqq->next_rq;

3770

++	service_to_charge = bfq_serv_to_charge(rq, bfqq);

3771

++

3772

++	if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {

3773

++		/*

3774

++		 * This may happen if the next rq is chosen in fifo order

3775

++		 * instead of sector order. The budget is properly

3776

++		 * dimensioned to be always sufficient to serve the next

3777

++		 * request only if it is chosen in sector order. The reason

3778

++		 * is that it would be quite inefficient and of little use

3779

++		 * to always make sure that the budget is large enough to

3780

++		 * serve even the possible next rq in fifo order.

3781

++		 * In fact, requests are seldom served in fifo order.

3782

++		 *

3783

++		 * Expire the queue for budget exhaustion, and make sure

3784

++		 * that the next act_budget is enough to serve the next

3785

++		 * request, even if it comes from the fifo expired path.

3786

++		 */

3787

++		bfqq->next_rq = rq;

3788

++		/*

3789

++		 * Since this dispatch has failed, make sure that

3790

++		 * a new one will be performed

3791

++		 */

3792

++		if (!bfqd->rq_in_driver)

3793

++			bfq_schedule_dispatch(bfqd);

3794

++		goto expire;

3795

++	}

3796

++

3797

++	/* Finally, insert request into driver dispatch list. */

3798

++	bfq_bfqq_served(bfqq, service_to_charge);

3799

++	bfq_dispatch_insert(bfqd->queue, rq);

3800

++

3801

++	bfq_update_wr_data(bfqd, bfqq);

3802

++

3803

++	bfq_log_bfqq(bfqd, bfqq,

3804

++			"dispatched %u sec req (%llu), budg left %d",

3805

++			blk_rq_sectors(rq),

3806

++			(long long unsigned)blk_rq_pos(rq),

3807

++			bfq_bfqq_budget_left(bfqq));

3808

++

3809

++	dispatched++;

3810

++

3811

++	if (!bfqd->in_service_bic) {

3812

++		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);

3813

++		bfqd->in_service_bic = RQ_BIC(rq);

3814

++	}

3815

++

3816

++	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&

3817

++	    dispatched >= bfqd->bfq_max_budget_async_rq) ||

3818

++	    bfq_class_idle(bfqq)))

3819

++		goto expire;

3820

++

3821

++	return dispatched;

3822

++

3823

++expire:

3824

++	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);

3825

++	return dispatched;

3826

++}

3827

++

3828

++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)

3829

++{

3830

++	int dispatched = 0;

3831

++

3832

++	while (bfqq->next_rq) {

3833

++		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);

3834

++		dispatched++;

3835

++	}

3836

++

3837

++	BUG_ON(!list_empty(&bfqq->fifo));

3838

++	return dispatched;

3839

++}

3840

++

3841

++/*

3842

++ * Drain our current requests.

3843

++ * Used for barriers and when switching io schedulers on-the-fly.

3844

++ */

3845

++static int bfq_forced_dispatch(struct bfq_data *bfqd)

3846

++{

3847

++	struct bfq_queue *bfqq, *n;

3848

++	struct bfq_service_tree *st;

3849

++	int dispatched = 0;

3850

++

3851

++	bfqq = bfqd->in_service_queue;

3852

++	if (bfqq)

3853

++		__bfq_bfqq_expire(bfqd, bfqq);

3854

++

3855

++	/*

3856

++	 * Loop through classes, and be careful to leave the scheduler

3857

++	 * in a consistent state, as feedback mechanisms and vtime

3858

++	 * updates cannot be disabled during the process.

3859

++	 */

3860

++	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {

3861

++		st = bfq_entity_service_tree(&bfqq->entity);

3862

++

3863

++		dispatched += __bfq_forced_dispatch_bfqq(bfqq);

3864

++		bfqq->max_budget = bfq_max_budget(bfqd);

3865

++

3866

++		bfq_forget_idle(st);

3867

++	}

3868

++

3869

++	BUG_ON(bfqd->busy_queues != 0);

3870

++

3871

++	return dispatched;

3872

++}

3873

++

3874

++static int bfq_dispatch_requests(struct request_queue *q, int force)

3875

++{

3876

++	struct bfq_data *bfqd = q->elevator->elevator_data;

3877

++	struct bfq_queue *bfqq;

3878

++	int max_dispatch;

3879

++

3880

++	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

3881

++	if (bfqd->busy_queues == 0)

3882

++		return 0;

3883

++

3884

++	if (unlikely(force))

3885

++		return bfq_forced_dispatch(bfqd);

3886

++

3887

++	bfqq = bfq_select_queue(bfqd);

3888

++	if (!bfqq)

3889

++		return 0;

3890

++

3891

++	if (bfq_class_idle(bfqq))

3892

++		max_dispatch = 1;

3893

++

3894

++	if (!bfq_bfqq_sync(bfqq))

3895

++		max_dispatch = bfqd->bfq_max_budget_async_rq;

3896

++

3897

++	if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {

3898

++		if (bfqd->busy_queues > 1)

3899

++			return 0;

3900

++		if (bfqq->dispatched >= 4 * max_dispatch)

3901

++			return 0;

3902

++	}

3903

++

3904

++	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))

3905

++		return 0;

3906

++

3907

++	bfq_clear_bfqq_wait_request(bfqq);

3908

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

3909

++

3910

++	if (!bfq_dispatch_request(bfqd, bfqq))

3911

++		return 0;

3912

++

3913

++	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",

3914

++			bfq_bfqq_sync(bfqq) ? "sync" : "async");

3915

++

3916

++	return 1;

3917

++}

3918

++

3919

++/*

3920

++ * Task holds one reference to the queue, dropped when task exits.  Each rq

3921

++ * in-flight on this queue also holds a reference, dropped when rq is freed.

3922

++ *

3923

++ * Queue lock must be held here.

3924

++ */

3925

++static void bfq_put_queue(struct bfq_queue *bfqq)

3926

++{

3927

++	struct bfq_data *bfqd = bfqq->bfqd;

3928

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

3929

++	struct bfq_group *bfqg = bfqq_group(bfqq);

3930

++#endif

3931

++

3932

++	BUG_ON(atomic_read(&bfqq->ref) <= 0);

3933

++

3934

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,

3935

++		     atomic_read(&bfqq->ref));

3936

++	if (!atomic_dec_and_test(&bfqq->ref))

3937

++		return;

3938

++

3939

++	BUG_ON(rb_first(&bfqq->sort_list));

3940

++	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);

3941

++	BUG_ON(bfqq->entity.tree);

3942

++	BUG_ON(bfq_bfqq_busy(bfqq));

3943

++	BUG_ON(bfqd->in_service_queue == bfqq);

3944

++

3945

++	if (bfq_bfqq_sync(bfqq))

3946

++		/*

3947

++		 * The fact that this queue is being destroyed does not

3948

++		 * invalidate the fact that this queue may have been

3949

++		 * activated during the current burst. As a consequence,

3950

++		 * although the queue does not exist anymore, and hence

3951

++		 * needs to be removed from the burst list if there,

3952

++		 * the burst size has not to be decremented.

3953

++		 */

3954

++		hlist_del_init(&bfqq->burst_list_node);

3955

++

3956

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);

3957

++

3958

++	kmem_cache_free(bfq_pool, bfqq);

3959

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

3960

++	bfqg_put(bfqg);

3961

++#endif

3962

++}

3963

++

3964

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

3965

++{

3966

++	if (bfqq == bfqd->in_service_queue) {

3967

++		__bfq_bfqq_expire(bfqd, bfqq);

3968

++		bfq_schedule_dispatch(bfqd);

3969

++	}

3970

++

3971

++	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,

3972

++		     atomic_read(&bfqq->ref));

3973

++

3974

++	bfq_put_queue(bfqq);

3975

++}

3976

++

3977

++static void bfq_init_icq(struct io_cq *icq)

3978

++{

3979

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3980

++

3981

++	bic->ttime.last_end_request = jiffies;

3982

++}

3983

++

3984

++static void bfq_exit_icq(struct io_cq *icq)

3985

++{

3986

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3987

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

3988

++

3989

++	if (bic->bfqq[BLK_RW_ASYNC]) {

3990

++		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);

3991

++		bic->bfqq[BLK_RW_ASYNC] = NULL;

3992

++	}

3993

++

3994

++	if (bic->bfqq[BLK_RW_SYNC]) {

3995

++		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

3996

++		bic->bfqq[BLK_RW_SYNC] = NULL;

3997

++	}

3998

++}

3999

++

4000

++/*

4001

++ * Update the entity prio values; note that the new values will not

4002

++ * be used until the next (re)activation.

4003

++ */

4004

++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

4005

++{

4006

++	struct task_struct *tsk = current;

4007

++	int ioprio_class;

4008

++

4009

++	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

4010

++	switch (ioprio_class) {

4011

++	default:

4012

++		dev_err(bfqq->bfqd->queue->backing_dev_info.dev,

4013

++			"bfq: bad prio class %d\n", ioprio_class);

4014

++	case IOPRIO_CLASS_NONE:

4015

++		/*

4016

++		 * No prio set, inherit CPU scheduling settings.

4017

++		 */

4018

++		bfqq->new_ioprio = task_nice_ioprio(tsk);

4019

++		bfqq->new_ioprio_class = task_nice_ioclass(tsk);

4020

++		break;

4021

++	case IOPRIO_CLASS_RT:

4022

++		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

4023

++		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;

4024

++		break;

4025

++	case IOPRIO_CLASS_BE:

4026

++		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

4027

++		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;

4028

++		break;

4029

++	case IOPRIO_CLASS_IDLE:

4030

++		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;

4031

++		bfqq->new_ioprio = 7;

4032

++		bfq_clear_bfqq_idle_window(bfqq);

4033

++		break;

4034

++	}

4035

++

4036

++	if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) {

4037

++		printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",

4038

++				 bfqq->new_ioprio);

4039

++		BUG();

4040

++	}

4041

++

4042

++	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);

4043

++	bfqq->entity.prio_changed = 1;

4044

++}

4045

++

4046

++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)

4047

++{

4048

++	struct bfq_data *bfqd;

4049

++	struct bfq_queue *bfqq, *new_bfqq;

4050

++	unsigned long uninitialized_var(flags);

4051

++	int ioprio = bic->icq.ioc->ioprio;

4052

++

4053

++	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

4054

++				   &flags);

4055

++	/*

4056

++	 * This condition may trigger on a newly created bic, be sure to

4057

++	 * drop the lock before returning.

4058

++	 */

4059

++	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))

4060

++		goto out;

4061

++

4062

++	bic->ioprio = ioprio;

4063

++

4064

++	bfqq = bic->bfqq[BLK_RW_ASYNC];

4065

++	if (bfqq) {

4066

++		new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic,

4067

++					 GFP_ATOMIC);

4068

++		if (new_bfqq) {

4069

++			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;

4070

++			bfq_log_bfqq(bfqd, bfqq,

4071

++				     "check_ioprio_change: bfqq %p %d",

4072

++				     bfqq, atomic_read(&bfqq->ref));

4073

++			bfq_put_queue(bfqq);

4074

++		}

4075

++	}

4076

++

4077

++	bfqq = bic->bfqq[BLK_RW_SYNC];

4078

++	if (bfqq)

4079

++		bfq_set_next_ioprio_data(bfqq, bic);

4080

++

4081

++out:

4082

++	bfq_put_bfqd_unlock(bfqd, &flags);

4083

++}

4084

++

4085

++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

4086

++			  struct bfq_io_cq *bic, pid_t pid, int is_sync)

4087

++{

4088

++	RB_CLEAR_NODE(&bfqq->entity.rb_node);

4089

++	INIT_LIST_HEAD(&bfqq->fifo);

4090

++	INIT_HLIST_NODE(&bfqq->burst_list_node);

4091

++

4092

++	atomic_set(&bfqq->ref, 0);

4093

++	bfqq->bfqd = bfqd;

4094

++

4095

++	if (bic)

4096

++		bfq_set_next_ioprio_data(bfqq, bic);

4097

++

4098

++	if (is_sync) {

4099

++		if (!bfq_class_idle(bfqq))

4100

++			bfq_mark_bfqq_idle_window(bfqq);

4101

++		bfq_mark_bfqq_sync(bfqq);

4102

++	} else

4103

++		bfq_clear_bfqq_sync(bfqq);

4104

++	bfq_mark_bfqq_IO_bound(bfqq);

4105

++

4106

++	/* Tentative initial value to trade off between thr and lat */

4107

++	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;

4108

++	bfqq->pid = pid;

4109

++

4110

++	bfqq->wr_coeff = 1;

4111

++	bfqq->last_wr_start_finish = 0;

4112

++	/*

4113

++	 * Set to the value for which bfqq will not be deemed as

4114

++	 * soft rt when it becomes backlogged.

4115

++	 */

4116

++	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);

4117

++}

4118

++

4119

++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,

4120

++					      struct bio *bio, int is_sync,

4121

++					      struct bfq_io_cq *bic,

4122

++					      gfp_t gfp_mask)

4123

++{

4124

++	struct bfq_group *bfqg;

4125

++	struct bfq_queue *bfqq, *new_bfqq = NULL;

4126

++	struct blkcg *blkcg;

4127

++

4128

++retry:

4129

++	rcu_read_lock();

4130

++

4131

++	blkcg = bio_blkcg(bio);

4132

++	bfqg = bfq_find_alloc_group(bfqd, blkcg);

4133

++	/* bic always exists here */

4134

++	bfqq = bic_to_bfqq(bic, is_sync);

4135

++

4136

++	/*

4137

++	 * Always try a new alloc if we fall back to the OOM bfqq

4138

++	 * originally, since it should just be a temporary situation.

4139

++	 */

4140

++	if (!bfqq || bfqq == &bfqd->oom_bfqq) {

4141

++		bfqq = NULL;

4142

++		if (new_bfqq) {

4143

++			bfqq = new_bfqq;

4144

++			new_bfqq = NULL;

4145

++		} else if (gfpflags_allow_blocking(gfp_mask)) {

4146

++			rcu_read_unlock();

4147

++			spin_unlock_irq(bfqd->queue->queue_lock);

4148

++			new_bfqq = kmem_cache_alloc_node(bfq_pool,

4149

++					gfp_mask | __GFP_ZERO,

4150

++					bfqd->queue->node);

4151

++			spin_lock_irq(bfqd->queue->queue_lock);

4152

++			if (new_bfqq)

4153

++				goto retry;

4154

++		} else {

4155

++			bfqq = kmem_cache_alloc_node(bfq_pool,

4156

++					gfp_mask | __GFP_ZERO,

4157

++					bfqd->queue->node);

4158

++		}

4159

++

4160

++		if (bfqq) {

4161

++			bfq_init_bfqq(bfqd, bfqq, bic, current->pid,

4162

++                                      is_sync);

4163

++			bfq_init_entity(&bfqq->entity, bfqg);

4164

++			bfq_log_bfqq(bfqd, bfqq, "allocated");

4165

++		} else {

4166

++			bfqq = &bfqd->oom_bfqq;

4167

++			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");

4168

++		}

4169

++	}

4170

++

4171

++	if (new_bfqq)

4172

++		kmem_cache_free(bfq_pool, new_bfqq);

4173

++

4174

++	rcu_read_unlock();

4175

++

4176

++	return bfqq;

4177

++}

4178

++

4179

++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,

4180

++					       struct bfq_group *bfqg,

4181

++					       int ioprio_class, int ioprio)

4182

++{

4183

++	switch (ioprio_class) {

4184

++	case IOPRIO_CLASS_RT:

4185

++		return &bfqg->async_bfqq[0][ioprio];

4186

++	case IOPRIO_CLASS_NONE:

4187

++		ioprio = IOPRIO_NORM;

4188

++		/* fall through */

4189

++	case IOPRIO_CLASS_BE:

4190

++		return &bfqg->async_bfqq[1][ioprio];

4191

++	case IOPRIO_CLASS_IDLE:

4192

++		return &bfqg->async_idle_bfqq;

4193

++	default:

4194

++		BUG();

4195

++	}

4196

++}

4197

++

4198

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

4199

++				       struct bio *bio, int is_sync,

4200

++				       struct bfq_io_cq *bic, gfp_t gfp_mask)

4201

++{

4202

++	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

4203

++	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

4204

++	struct bfq_queue **async_bfqq = NULL;

4205

++	struct bfq_queue *bfqq = NULL;

4206

++

4207

++	if (!is_sync) {

4208

++		struct blkcg *blkcg;

4209

++		struct bfq_group *bfqg;

4210

++

4211

++		rcu_read_lock();

4212

++		blkcg = bio_blkcg(bio);

4213

++		rcu_read_unlock();

4214

++		bfqg = bfq_find_alloc_group(bfqd, blkcg);

4215

++		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,

4216

++						  ioprio);

4217

++		bfqq = *async_bfqq;

4218

++	}

4219

++

4220

++	if (!bfqq)

4221

++		bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask);

4222

++

4223

++	/*

4224

++	 * Pin the queue now that it's allocated, scheduler exit will

4225

++	 * prune it.

4226

++	 */

4227

++	if (!is_sync && !(*async_bfqq)) {

4228

++		atomic_inc(&bfqq->ref);

4229

++		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",

4230

++			     bfqq, atomic_read(&bfqq->ref));

4231

++		*async_bfqq = bfqq;

4232

++	}

4233

++

4234

++	atomic_inc(&bfqq->ref);

4235

++	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,

4236

++		     atomic_read(&bfqq->ref));

4237

++	return bfqq;

4238

++}

4239

++

4240

++static void bfq_update_io_thinktime(struct bfq_data *bfqd,

4241

++				    struct bfq_io_cq *bic)

4242

++{

4243

++	unsigned long elapsed = jiffies - bic->ttime.last_end_request;

4244

++	unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);

4245

++

4246

++	bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;

4247

++	bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;

4248

++	bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /

4249

++				bic->ttime.ttime_samples;

4250

++}

4251

++

4252

++static void bfq_update_io_seektime(struct bfq_data *bfqd,

4253

++				   struct bfq_queue *bfqq,

4254

++				   struct request *rq)

4255

++{

4256

++	sector_t sdist;

4257

++	u64 total;

4258

++

4259

++	if (bfqq->last_request_pos < blk_rq_pos(rq))

4260

++		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;

4261

++	else

4262

++		sdist = bfqq->last_request_pos - blk_rq_pos(rq);

4263

++

4264

++	/*

4265

++	 * Don't allow the seek distance to get too large from the

4266

++	 * odd fragment, pagein, etc.

4267

++	 */

4268

++	if (bfqq->seek_samples == 0) /* first request, not really a seek */

4269

++		sdist = 0;

4270

++	else if (bfqq->seek_samples <= 60) /* second & third seek */

4271

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);

4272

++	else

4273

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);

4274

++

4275

++	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;

4276

++	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;

4277

++	total = bfqq->seek_total + (bfqq->seek_samples/2);

4278

++	do_div(total, bfqq->seek_samples);

4279

++	bfqq->seek_mean = (sector_t)total;

4280

++

4281

++	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,

4282

++			(u64)bfqq->seek_mean);

4283

++}

4284

++

4285

++/*

4286

++ * Disable idle window if the process thinks too long or seeks so much that

4287

++ * it doesn't matter.

4288

++ */

4289

++static void bfq_update_idle_window(struct bfq_data *bfqd,

4290

++				   struct bfq_queue *bfqq,

4291

++				   struct bfq_io_cq *bic)

4292

++{

4293

++	int enable_idle;

4294

++

4295

++	/* Don't idle for async or idle io prio class. */

4296

++	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

4297

++		return;

4298

++

4299

++	enable_idle = bfq_bfqq_idle_window(bfqq);

4300

++

4301

++	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

4302

++	    bfqd->bfq_slice_idle == 0 ||

4303

++		(bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&

4304

++			bfqq->wr_coeff == 1))

4305

++		enable_idle = 0;

4306

++	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {

4307

++		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&

4308

++			bfqq->wr_coeff == 1)

4309

++			enable_idle = 0;

4310

++		else

4311

++			enable_idle = 1;

4312

++	}

4313

++	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",

4314

++		enable_idle);

4315

++

4316

++	if (enable_idle)

4317

++		bfq_mark_bfqq_idle_window(bfqq);

4318

++	else

4319

++		bfq_clear_bfqq_idle_window(bfqq);

4320

++}

4321

++

4322

++/*

4323

++ * Called when a new fs request (rq) is added to bfqq.  Check if there's

4324

++ * something we should do about it.

4325

++ */

4326

++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

4327

++			    struct request *rq)

4328

++{

4329

++	struct bfq_io_cq *bic = RQ_BIC(rq);

4330

++

4331

++	if (rq->cmd_flags & REQ_META)

4332

++		bfqq->meta_pending++;

4333

++

4334

++	bfq_update_io_thinktime(bfqd, bic);

4335

++	bfq_update_io_seektime(bfqd, bfqq, rq);

4336

++	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {

4337

++		bfq_clear_bfqq_constantly_seeky(bfqq);

4338

++		if (!blk_queue_nonrot(bfqd->queue)) {

4339

++			BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);

4340

++			bfqd->const_seeky_busy_in_flight_queues--;

4341

++		}

4342

++	}

4343

++	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

4344

++	    !BFQQ_SEEKY(bfqq))

4345

++		bfq_update_idle_window(bfqd, bfqq, bic);

4346

++

4347

++	bfq_log_bfqq(bfqd, bfqq,

4348

++		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

4349

++		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),

4350

++		     (long long unsigned)bfqq->seek_mean);

4351

++

4352

++	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

4353

++

4354

++	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {

4355

++		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&

4356

++				 blk_rq_sectors(rq) < 32;

4357

++		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

4358

++

4359

++		/*

4360

++		 * There is just this request queued: if the request

4361

++		 * is small and the queue is not to be expired, then

4362

++		 * just exit.

4363

++		 *

4364

++		 * In this way, if the disk is being idled to wait for

4365

++		 * a new request from the in-service queue, we avoid

4366

++		 * unplugging the device and committing the disk to serve

4367

++		 * just a small request. On the contrary, we wait for

4368

++		 * the block layer to decide when to unplug the device:

4369

++		 * hopefully, new requests will be merged to this one

4370

++		 * quickly, then the device will be unplugged and

4371

++		 * larger requests will be dispatched.

4372

++		 */

4373

++		if (small_req && !budget_timeout)

4374

++			return;

4375

++

4376

++		/*

4377

++		 * A large enough request arrived, or the queue is to

4378

++		 * be expired: in both cases disk idling is to be

4379

++		 * stopped, so clear wait_request flag and reset

4380

++		 * timer.

4381

++		 */

4382

++		bfq_clear_bfqq_wait_request(bfqq);

4383

++		del_timer(&bfqd->idle_slice_timer);

4384

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4385

++		bfqg_stats_update_idle_time(bfqq_group(bfqq));

4386

++#endif

4387

++

4388

++		/*

4389

++		 * The queue is not empty, because a new request just

4390

++		 * arrived. Hence we can safely expire the queue, in

4391

++		 * case of budget timeout, without risking that the

4392

++		 * timestamps of the queue are not updated correctly.

4393

++		 * See [1] for more details.

4394

++		 */

4395

++		if (budget_timeout)

4396

++			bfq_bfqq_expire(bfqd, bfqq, false,

4397

++					BFQ_BFQQ_BUDGET_TIMEOUT);

4398

++

4399

++		/*

4400

++		 * Let the request rip immediately, or let a new queue be

4401

++		 * selected if bfqq has just been expired.

4402

++		 */

4403

++		__blk_run_queue(bfqd->queue);

4404

++	}

4405

++}

4406

++

4407

++static void bfq_insert_request(struct request_queue *q, struct request *rq)

4408

++{

4409

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4410

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4411

++

4412

++	assert_spin_locked(bfqd->queue->queue_lock);

4413

++

4414

++	bfq_add_request(rq);

4415

++

4416

++	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

4417

++	list_add_tail(&rq->queuelist, &bfqq->fifo);

4418

++

4419

++	bfq_rq_enqueued(bfqd, bfqq, rq);

4420

++}

4421

++

4422

++static void bfq_update_hw_tag(struct bfq_data *bfqd)

4423

++{

4424

++	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,

4425

++				     bfqd->rq_in_driver);

4426

++

4427

++	if (bfqd->hw_tag == 1)

4428

++		return;

4429

++

4430

++	/*

4431

++	 * This sample is valid if the number of outstanding requests

4432

++	 * is large enough to allow a queueing behavior.  Note that the

4433

++	 * sum is not exact, as it's not taking into account deactivated

4434

++	 * requests.

4435

++	 */

4436

++	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)

4437

++		return;

4438

++

4439

++	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)

4440

++		return;

4441

++

4442

++	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;

4443

++	bfqd->max_rq_in_driver = 0;

4444

++	bfqd->hw_tag_samples = 0;

4445

++}

4446

++

4447

++static void bfq_completed_request(struct request_queue *q, struct request *rq)

4448

++{

4449

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4450

++	struct bfq_data *bfqd = bfqq->bfqd;

4451

++	bool sync = bfq_bfqq_sync(bfqq);

4452

++

4453

++	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",

4454

++		     blk_rq_sectors(rq), sync);

4455

++

4456

++	bfq_update_hw_tag(bfqd);

4457

++

4458

++	BUG_ON(!bfqd->rq_in_driver);

4459

++	BUG_ON(!bfqq->dispatched);

4460

++	bfqd->rq_in_driver--;

4461

++	bfqq->dispatched--;

4462

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4463

++	bfqg_stats_update_completion(bfqq_group(bfqq),

4464

++				     rq_start_time_ns(rq),

4465

++				     rq_io_start_time_ns(rq), rq->cmd_flags);

4466

++#endif

4467

++

4468

++	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {

4469

++		bfq_weights_tree_remove(bfqd, &bfqq->entity,

4470

++					&bfqd->queue_weights_tree);

4471

++		if (!blk_queue_nonrot(bfqd->queue)) {

4472

++			BUG_ON(!bfqd->busy_in_flight_queues);

4473

++			bfqd->busy_in_flight_queues--;

4474

++			if (bfq_bfqq_constantly_seeky(bfqq)) {

4475

++				BUG_ON(!bfqd->

4476

++					const_seeky_busy_in_flight_queues);

4477

++				bfqd->const_seeky_busy_in_flight_queues--;

4478

++			}

4479

++		}

4480

++	}

4481

++

4482

++	if (sync) {

4483

++		bfqd->sync_flight--;

4484

++		RQ_BIC(rq)->ttime.last_end_request = jiffies;

4485

++	}

4486

++

4487

++	/*

4488

++	 * If we are waiting to discover whether the request pattern of the

4489

++	 * task associated with the queue is actually isochronous, and

4490

++	 * both requisites for this condition to hold are satisfied, then

4491

++	 * compute soft_rt_next_start (see the comments to the function

4492

++	 * bfq_bfqq_softrt_next_start()).

4493

++	 */

4494

++	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&

4495

++	    RB_EMPTY_ROOT(&bfqq->sort_list))

4496

++		bfqq->soft_rt_next_start =

4497

++			bfq_bfqq_softrt_next_start(bfqd, bfqq);

4498

++

4499

++	/*

4500

++	 * If this is the in-service queue, check if it needs to be expired,

4501

++	 * or if we want to idle in case it has no pending requests.

4502

++	 */

4503

++	if (bfqd->in_service_queue == bfqq) {

4504

++		if (bfq_bfqq_budget_new(bfqq))

4505

++			bfq_set_budget_timeout(bfqd);

4506

++

4507

++		if (bfq_bfqq_must_idle(bfqq)) {

4508

++			bfq_arm_slice_timer(bfqd);

4509

++			goto out;

4510

++		} else if (bfq_may_expire_for_budg_timeout(bfqq))

4511

++			bfq_bfqq_expire(bfqd, bfqq, false,

4512

++					BFQ_BFQQ_BUDGET_TIMEOUT);

4513

++		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&

4514

++			 (bfqq->dispatched == 0 ||

4515

++			  !bfq_bfqq_may_idle(bfqq)))

4516

++			bfq_bfqq_expire(bfqd, bfqq, false,

4517

++					BFQ_BFQQ_NO_MORE_REQUESTS);

4518

++	}

4519

++

4520

++	if (!bfqd->rq_in_driver)

4521

++		bfq_schedule_dispatch(bfqd);

4522

++

4523

++out:

4524

++	return;

4525

++}

4526

++

4527

++static int __bfq_may_queue(struct bfq_queue *bfqq)

4528

++{

4529

++	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {

4530

++		bfq_clear_bfqq_must_alloc(bfqq);

4531

++		return ELV_MQUEUE_MUST;

4532

++	}

4533

++

4534

++	return ELV_MQUEUE_MAY;

4535

++}

4536

++

4537

++static int bfq_may_queue(struct request_queue *q, int rw)

4538

++{

4539

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4540

++	struct task_struct *tsk = current;

4541

++	struct bfq_io_cq *bic;

4542

++	struct bfq_queue *bfqq;

4543

++

4544

++	/*

4545

++	 * Don't force setup of a queue from here, as a call to may_queue

4546

++	 * does not necessarily imply that a request actually will be

4547

++	 * queued. So just lookup a possibly existing queue, or return

4548

++	 * 'may queue' if that fails.

4549

++	 */

4550

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

4551

++	if (!bic)

4552

++		return ELV_MQUEUE_MAY;

4553

++

4554

++	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));

4555

++	if (bfqq)

4556

++		return __bfq_may_queue(bfqq);

4557

++

4558

++	return ELV_MQUEUE_MAY;

4559

++}

4560

++

4561

++/*

4562

++ * Queue lock held here.

4563

++ */

4564

++static void bfq_put_request(struct request *rq)

4565

++{

4566

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4567

++

4568

++	if (bfqq) {

4569

++		const int rw = rq_data_dir(rq);

4570

++

4571

++		BUG_ON(!bfqq->allocated[rw]);

4572

++		bfqq->allocated[rw]--;

4573

++

4574

++		rq->elv.priv[0] = NULL;

4575

++		rq->elv.priv[1] = NULL;

4576

++

4577

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",

4578

++			     bfqq, atomic_read(&bfqq->ref));

4579

++		bfq_put_queue(bfqq);

4580

++	}

4581

++}

4582

++

4583

++/*

4584

++ * Allocate bfq data structures associated with this request.

4585

++ */

4586

++static int bfq_set_request(struct request_queue *q, struct request *rq,

4587

++			   struct bio *bio, gfp_t gfp_mask)

4588

++{

4589

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4590

++	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);

4591

++	const int rw = rq_data_dir(rq);

4592

++	const int is_sync = rq_is_sync(rq);

4593

++	struct bfq_queue *bfqq;

4594

++	unsigned long flags;

4595

++

4596

++	might_sleep_if(gfpflags_allow_blocking(gfp_mask));

4597

++

4598

++	bfq_check_ioprio_change(bic, bio);

4599

++

4600

++	spin_lock_irqsave(q->queue_lock, flags);

4601

++

4602

++	if (!bic)

4603

++		goto queue_fail;

4604

++

4605

++	bfq_bic_update_cgroup(bic, bio);

4606

++

4607

++	bfqq = bic_to_bfqq(bic, is_sync);

4608

++	if (!bfqq || bfqq == &bfqd->oom_bfqq) {

4609

++		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);

4610

++		bic_set_bfqq(bic, bfqq, is_sync);

4611

++		if (is_sync) {

4612

++			if (bfqd->large_burst)

4613

++				bfq_mark_bfqq_in_large_burst(bfqq);

4614

++			else

4615

++				bfq_clear_bfqq_in_large_burst(bfqq);

4616

++		}

4617

++	}

4618

++

4619

++	bfqq->allocated[rw]++;

4620

++	atomic_inc(&bfqq->ref);

4621

++	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,

4622

++		     atomic_read(&bfqq->ref));

4623

++

4624

++	rq->elv.priv[0] = bic;

4625

++	rq->elv.priv[1] = bfqq;

4626

++

4627

++	spin_unlock_irqrestore(q->queue_lock, flags);

4628

++

4629

++	return 0;

4630

++

4631

++queue_fail:

4632

++	bfq_schedule_dispatch(bfqd);

4633

++	spin_unlock_irqrestore(q->queue_lock, flags);

4634

++

4635

++	return 1;

4636

++}

4637

++

4638

++static void bfq_kick_queue(struct work_struct *work)

4639

++{

4640

++	struct bfq_data *bfqd =

4641

++		container_of(work, struct bfq_data, unplug_work);

4642

++	struct request_queue *q = bfqd->queue;

4643

++

4644

++	spin_lock_irq(q->queue_lock);

4645

++	__blk_run_queue(q);

4646

++	spin_unlock_irq(q->queue_lock);

4647

++}

4648

++

4649

++/*

4650

++ * Handler of the expiration of the timer running if the in-service queue

4651

++ * is idling inside its time slice.

4652

++ */

4653

++static void bfq_idle_slice_timer(unsigned long data)

4654

++{

4655

++	struct bfq_data *bfqd = (struct bfq_data *)data;

4656

++	struct bfq_queue *bfqq;

4657

++	unsigned long flags;

4658

++	enum bfqq_expiration reason;

4659

++

4660

++	spin_lock_irqsave(bfqd->queue->queue_lock, flags);

4661

++

4662

++	bfqq = bfqd->in_service_queue;

4663

++	/*

4664

++	 * Theoretical race here: the in-service queue can be NULL or

4665

++	 * different from the queue that was idling if the timer handler

4666

++	 * spins on the queue_lock and a new request arrives for the

4667

++	 * current queue and there is a full dispatch cycle that changes

4668

++	 * the in-service queue.  This can hardly happen, but in the worst

4669

++	 * case we just expire a queue too early.

4670

++	 */

4671

++	if (bfqq) {

4672

++		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");

4673

++		if (bfq_bfqq_budget_timeout(bfqq))

4674

++			/*

4675

++			 * Also here the queue can be safely expired

4676

++			 * for budget timeout without wasting

4677

++			 * guarantees

4678

++			 */

4679

++			reason = BFQ_BFQQ_BUDGET_TIMEOUT;

4680

++		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)

4681

++			/*

4682

++			 * The queue may not be empty upon timer expiration,

4683

++			 * because we may not disable the timer when the

4684

++			 * first request of the in-service queue arrives

4685

++			 * during disk idling.

4686

++			 */

4687

++			reason = BFQ_BFQQ_TOO_IDLE;

4688

++		else

4689

++			goto schedule_dispatch;

4690

++

4691

++		bfq_bfqq_expire(bfqd, bfqq, true, reason);

4692

++	}

4693

++

4694

++schedule_dispatch:

4695

++	bfq_schedule_dispatch(bfqd);

4696

++

4697

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);

4698

++}

4699

++

4700

++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)

4701

++{

4702

++	del_timer_sync(&bfqd->idle_slice_timer);

4703

++	cancel_work_sync(&bfqd->unplug_work);

4704

++}

4705

++

4706

++static void __bfq_put_async_bfqq(struct bfq_data *bfqd,

4707

++					struct bfq_queue **bfqq_ptr)

4708

++{

4709

++	struct bfq_group *root_group = bfqd->root_group;

4710

++	struct bfq_queue *bfqq = *bfqq_ptr;

4711

++

4712

++	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);

4713

++	if (bfqq) {

4714

++		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);

4715

++		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",

4716

++			     bfqq, atomic_read(&bfqq->ref));

4717

++		bfq_put_queue(bfqq);

4718

++		*bfqq_ptr = NULL;

4719

++	}

4720

++}

4721

++

4722

++/*

4723

++ * Release all the bfqg references to its async queues.  If we are

4724

++ * deallocating the group these queues may still contain requests, so

4725

++ * we reparent them to the root cgroup (i.e., the only one that will

4726

++ * exist for sure until all the requests on a device are gone).

4727

++ */

4728

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)

4729

++{

4730

++	int i, j;

4731

++

4732

++	for (i = 0; i < 2; i++)

4733

++		for (j = 0; j < IOPRIO_BE_NR; j++)

4734

++			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

4735

++

4736

++	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);

4737

++}

4738

++

4739

++static void bfq_exit_queue(struct elevator_queue *e)

4740

++{

4741

++	struct bfq_data *bfqd = e->elevator_data;

4742

++	struct request_queue *q = bfqd->queue;

4743

++	struct bfq_queue *bfqq, *n;

4744

++

4745

++	bfq_shutdown_timer_wq(bfqd);

4746

++

4747

++	spin_lock_irq(q->queue_lock);

4748

++

4749

++	BUG_ON(bfqd->in_service_queue);

4750

++	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)

4751

++		bfq_deactivate_bfqq(bfqd, bfqq, 0);

4752

++

4753

++	spin_unlock_irq(q->queue_lock);

4754

++

4755

++	bfq_shutdown_timer_wq(bfqd);

4756

++

4757

++	synchronize_rcu();

4758

++

4759

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

4760

++

4761

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4762

++	blkcg_deactivate_policy(q, &blkcg_policy_bfq);

4763

++#else

4764

++	kfree(bfqd->root_group);

4765

++#endif

4766

++

4767

++	kfree(bfqd);

4768

++}

4769

++

4770

++static void bfq_init_root_group(struct bfq_group *root_group,

4771

++				struct bfq_data *bfqd)

4772

++{

4773

++	int i;

4774

++

4775

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4776

++	root_group->entity.parent = NULL;

4777

++	root_group->my_entity = NULL;

4778

++	root_group->bfqd = bfqd;

4779

++#endif

4780

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

4781

++		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

4782

++}

4783

++

4784

++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

4785

++{

4786

++	struct bfq_data *bfqd;

4787

++	struct elevator_queue *eq;

4788

++

4789

++	eq = elevator_alloc(q, e);

4790

++	if (!eq)

4791

++		return -ENOMEM;

4792

++

4793

++	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);

4794

++	if (!bfqd) {

4795

++		kobject_put(&eq->kobj);

4796

++		return -ENOMEM;

4797

++	}

4798

++	eq->elevator_data = bfqd;

4799

++

4800

++	/*

4801

++	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.

4802

++	 * Grab a permanent reference to it, so that the normal code flow

4803

++	 * will not attempt to free it.

4804

++	 */

4805

++	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);

4806

++	atomic_inc(&bfqd->oom_bfqq.ref);

4807

++	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;

4808

++	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;

4809

++	bfqd->oom_bfqq.entity.new_weight =

4810

++		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);

4811

++	/*

4812

++	 * Trigger weight initialization, according to ioprio, at the

4813

++	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio

4814

++	 * class won't be changed any more.

4815

++	 */

4816

++	bfqd->oom_bfqq.entity.prio_changed = 1;

4817

++

4818

++	bfqd->queue = q;

4819

++

4820

++	spin_lock_irq(q->queue_lock);

4821

++	q->elevator = eq;

4822

++	spin_unlock_irq(q->queue_lock);

4823

++

4824

++	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);

4825

++	if (!bfqd->root_group)

4826

++		goto out_free;

4827

++	bfq_init_root_group(bfqd->root_group, bfqd);

4828

++	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

4829

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4830

++	bfqd->active_numerous_groups = 0;

4831

++#endif

4832

++

4833

++	init_timer(&bfqd->idle_slice_timer);

4834

++	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

4835

++	bfqd->idle_slice_timer.data = (unsigned long)bfqd;

4836

++

4837

++	bfqd->queue_weights_tree = RB_ROOT;

4838

++	bfqd->group_weights_tree = RB_ROOT;

4839

++

4840

++	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);

4841

++

4842

++	INIT_LIST_HEAD(&bfqd->active_list);

4843

++	INIT_LIST_HEAD(&bfqd->idle_list);

4844

++	INIT_HLIST_HEAD(&bfqd->burst_list);

4845

++

4846

++	bfqd->hw_tag = -1;

4847

++

4848

++	bfqd->bfq_max_budget = bfq_default_max_budget;

4849

++

4850

++	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];

4851

++	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];

4852

++	bfqd->bfq_back_max = bfq_back_max;

4853

++	bfqd->bfq_back_penalty = bfq_back_penalty;

4854

++	bfqd->bfq_slice_idle = bfq_slice_idle;

4855

++	bfqd->bfq_class_idle_last_service = 0;

4856

++	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;

4857

++	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;

4858

++	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

4859

++

4860

++	bfqd->bfq_requests_within_timer = 120;

4861

++

4862

++	bfqd->bfq_large_burst_thresh = 11;

4863

++	bfqd->bfq_burst_interval = msecs_to_jiffies(500);

4864

++

4865

++	bfqd->low_latency = true;

4866

++

4867

++	bfqd->bfq_wr_coeff = 20;

4868

++	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);

4869

++	bfqd->bfq_wr_max_time = 0;

4870

++	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);

4871

++	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);

4872

++	bfqd->bfq_wr_max_softrt_rate = 7000; /*

4873

++					      * Approximate rate required

4874

++					      * to playback or record a

4875

++					      * high-definition compressed

4876

++					      * video.

4877

++					      */

4878

++	bfqd->wr_busy_queues = 0;

4879

++	bfqd->busy_in_flight_queues = 0;

4880

++	bfqd->const_seeky_busy_in_flight_queues = 0;

4881

++

4882

++	/*

4883

++	 * Begin by assuming, optimistically, that the device peak rate is

4884

++	 * equal to the highest reference rate.

4885

++	 */

4886

++	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *

4887

++			T_fast[blk_queue_nonrot(bfqd->queue)];

4888

++	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];

4889

++	bfqd->device_speed = BFQ_BFQD_FAST;

4890

++

4891

++	return 0;

4892

++

4893

++out_free:

4894

++	kfree(bfqd);

4895

++	kobject_put(&eq->kobj);

4896

++	return -ENOMEM;

4897

++}

4898

++

4899

++static void bfq_slab_kill(void)

4900

++{

4901

++	if (bfq_pool)

4902

++		kmem_cache_destroy(bfq_pool);

4903

++}

4904

++

4905

++static int __init bfq_slab_setup(void)

4906

++{

4907

++	bfq_pool = KMEM_CACHE(bfq_queue, 0);

4908

++	if (!bfq_pool)

4909

++		return -ENOMEM;

4910

++	return 0;

4911

++}

4912

++

4913

++static ssize_t bfq_var_show(unsigned int var, char *page)

4914

++{

4915

++	return sprintf(page, "%d\n", var);

4916

++}

4917

++

4918

++static ssize_t bfq_var_store(unsigned long *var, const char *page,

4919

++			     size_t count)

4920

++{

4921

++	unsigned long new_val;

4922

++	int ret = kstrtoul(page, 10, &new_val);

4923

++

4924

++	if (ret == 0)

4925

++		*var = new_val;

4926

++

4927

++	return count;

4928

++}

4929

++

4930

++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)

4931

++{

4932

++	struct bfq_data *bfqd = e->elevator_data;

4933

++	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?

4934

++		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :

4935

++		       jiffies_to_msecs(bfq_wr_duration(bfqd)));

4936

++}

4937

++

4938

++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)

4939

++{

4940

++	struct bfq_queue *bfqq;

4941

++	struct bfq_data *bfqd = e->elevator_data;

4942

++	ssize_t num_char = 0;

4943

++

4944

++	num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",

4945

++			    bfqd->queued);

4946

++

4947

++	spin_lock_irq(bfqd->queue->queue_lock);

4948

++

4949

++	num_char += sprintf(page + num_char, "Active:\n");

4950

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {

4951

++	  num_char += sprintf(page + num_char,

4952

++			      "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",

4953

++			      bfqq->pid,

4954

++			      bfqq->entity.weight,

4955

++			      bfqq->queued[0],

4956

++			      bfqq->queued[1],

4957

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

4958

++			jiffies_to_msecs(bfqq->wr_cur_max_time));

4959

++	}

4960

++

4961

++	num_char += sprintf(page + num_char, "Idle:\n");

4962

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {

4963

++			num_char += sprintf(page + num_char,

4964

++				"pid%d: weight %hu, dur %d/%u\n",

4965

++				bfqq->pid,

4966

++				bfqq->entity.weight,

4967

++				jiffies_to_msecs(jiffies -

4968

++					bfqq->last_wr_start_finish),

4969

++				jiffies_to_msecs(bfqq->wr_cur_max_time));

4970

++	}

4971

++

4972

++	spin_unlock_irq(bfqd->queue->queue_lock);

4973

++

4974

++	return num_char;

4975

++}

4976

++

4977

++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\

4978

++static ssize_t __FUNC(struct elevator_queue *e, char *page)		\

4979

++{									\

4980

++	struct bfq_data *bfqd = e->elevator_data;			\

4981

++	unsigned int __data = __VAR;					\

4982

++	if (__CONV)							\

4983

++		__data = jiffies_to_msecs(__data);			\

4984

++	return bfq_var_show(__data, (page));				\

4985

++}

4986

++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);

4987

++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);

4988

++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);

4989

++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);

4990

++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);

4991

++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);

4992

++SHOW_FUNCTION(bfq_max_budget_async_rq_show,

4993

++	      bfqd->bfq_max_budget_async_rq, 0);

4994

++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);

4995

++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);

4996

++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);

4997

++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);

4998

++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);

4999

++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);

5000

++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,

5001

++	1);

5002

++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);

5003

++#undef SHOW_FUNCTION

5004

++

5005

++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\

5006

++static ssize_t								\

5007

++__FUNC(struct elevator_queue *e, const char *page, size_t count)	\

5008

++{									\

5009

++	struct bfq_data *bfqd = e->elevator_data;			\

5010

++	unsigned long uninitialized_var(__data);			\

5011

++	int ret = bfq_var_store(&__data, (page), count);		\

5012

++	if (__data < (MIN))						\

5013

++		__data = (MIN);						\

5014

++	else if (__data > (MAX))					\

5015

++		__data = (MAX);						\

5016

++	if (__CONV)							\

5017

++		*(__PTR) = msecs_to_jiffies(__data);			\

5018

++	else								\

5019

++		*(__PTR) = __data;					\

5020

++	return ret;							\

5021

++}

5022

++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,

5023

++		INT_MAX, 1);

5024

++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,

5025

++		INT_MAX, 1);

5026

++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);

5027

++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,

5028

++		INT_MAX, 0);

5029

++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);

5030

++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,

5031

++		1, INT_MAX, 0);

5032

++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,

5033

++		INT_MAX, 1);

5034

++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);

5035

++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);

5036

++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,

5037

++		1);

5038

++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,

5039

++		INT_MAX, 1);

5040

++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,

5041

++		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);

5042

++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,

5043

++		INT_MAX, 0);

5044

++#undef STORE_FUNCTION

5045

++

5046

++/* do nothing for the moment */

5047

++static ssize_t bfq_weights_store(struct elevator_queue *e,

5048

++				    const char *page, size_t count)

5049

++{

5050

++	return count;

5051

++}

5052

++

5053

++static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)

5054

++{

5055

++	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

5056

++

5057

++	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)

5058

++		return bfq_calc_max_budget(bfqd->peak_rate, timeout);

5059

++	else

5060

++		return bfq_default_max_budget;

5061

++}

5062

++

5063

++static ssize_t bfq_max_budget_store(struct elevator_queue *e,

5064

++				    const char *page, size_t count)

5065

++{

5066

++	struct bfq_data *bfqd = e->elevator_data;

5067

++	unsigned long uninitialized_var(__data);

5068

++	int ret = bfq_var_store(&__data, (page), count);

5069

++

5070

++	if (__data == 0)

5071

++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

5072

++	else {

5073

++		if (__data > INT_MAX)

5074

++			__data = INT_MAX;

5075

++		bfqd->bfq_max_budget = __data;

5076

++	}

5077

++

5078

++	bfqd->bfq_user_max_budget = __data;

5079

++

5080

++	return ret;

5081

++}

5082

++

5083

++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,

5084

++				      const char *page, size_t count)

5085

++{

5086

++	struct bfq_data *bfqd = e->elevator_data;

5087

++	unsigned long uninitialized_var(__data);

5088

++	int ret = bfq_var_store(&__data, (page), count);

5089

++

5090

++	if (__data < 1)

5091

++		__data = 1;

5092

++	else if (__data > INT_MAX)

5093

++		__data = INT_MAX;

5094

++

5095

++	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);

5096

++	if (bfqd->bfq_user_max_budget == 0)

5097

++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

5098

++

5099

++	return ret;

5100

++}

5101

++

5102

++static ssize_t bfq_low_latency_store(struct elevator_queue *e,

5103

++				     const char *page, size_t count)

5104

++{

5105

++	struct bfq_data *bfqd = e->elevator_data;

5106

++	unsigned long uninitialized_var(__data);

5107

++	int ret = bfq_var_store(&__data, (page), count);

5108

++

5109

++	if (__data > 1)

5110

++		__data = 1;

5111

++	if (__data == 0 && bfqd->low_latency != 0)

5112

++		bfq_end_wr(bfqd);

5113

++	bfqd->low_latency = __data;

5114

++

5115

++	return ret;

5116

++}

5117

++

5118

++#define BFQ_ATTR(name) \

5119

++	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)

5120

++

5121

++static struct elv_fs_entry bfq_attrs[] = {

5122

++	BFQ_ATTR(fifo_expire_sync),

5123

++	BFQ_ATTR(fifo_expire_async),

5124

++	BFQ_ATTR(back_seek_max),

5125

++	BFQ_ATTR(back_seek_penalty),

5126

++	BFQ_ATTR(slice_idle),

5127

++	BFQ_ATTR(max_budget),

5128

++	BFQ_ATTR(max_budget_async_rq),

5129

++	BFQ_ATTR(timeout_sync),

5130

++	BFQ_ATTR(timeout_async),

5131

++	BFQ_ATTR(low_latency),

5132

++	BFQ_ATTR(wr_coeff),

5133

++	BFQ_ATTR(wr_max_time),

5134

++	BFQ_ATTR(wr_rt_max_time),

5135

++	BFQ_ATTR(wr_min_idle_time),

5136

++	BFQ_ATTR(wr_min_inter_arr_async),

5137

++	BFQ_ATTR(wr_max_softrt_rate),

5138

++	BFQ_ATTR(weights),

5139

++	__ATTR_NULL

5140

++};

5141

++

5142

++static struct elevator_type iosched_bfq = {

5143

++	.ops = {

5144

++		.elevator_merge_fn =		bfq_merge,

5145

++		.elevator_merged_fn =		bfq_merged_request,

5146

++		.elevator_merge_req_fn =	bfq_merged_requests,

5147

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5148

++		.elevator_bio_merged_fn =	bfq_bio_merged,

5149

++#endif

5150

++		.elevator_allow_merge_fn =	bfq_allow_merge,

5151

++		.elevator_dispatch_fn =		bfq_dispatch_requests,

5152

++		.elevator_add_req_fn =		bfq_insert_request,

5153

++		.elevator_activate_req_fn =	bfq_activate_request,

5154

++		.elevator_deactivate_req_fn =	bfq_deactivate_request,

5155

++		.elevator_completed_req_fn =	bfq_completed_request,

5156

++		.elevator_former_req_fn =	elv_rb_former_request,

5157

++		.elevator_latter_req_fn =	elv_rb_latter_request,

5158

++		.elevator_init_icq_fn =		bfq_init_icq,

5159

++		.elevator_exit_icq_fn =		bfq_exit_icq,

5160

++		.elevator_set_req_fn =		bfq_set_request,

5161

++		.elevator_put_req_fn =		bfq_put_request,

5162

++		.elevator_may_queue_fn =	bfq_may_queue,

5163

++		.elevator_init_fn =		bfq_init_queue,

5164

++		.elevator_exit_fn =		bfq_exit_queue,

5165

++	},

5166

++	.icq_size =		sizeof(struct bfq_io_cq),

5167

++	.icq_align =		__alignof__(struct bfq_io_cq),

5168

++	.elevator_attrs =	bfq_attrs,

5169

++	.elevator_name =	"bfq",

5170

++	.elevator_owner =	THIS_MODULE,

5171

++};

5172

++

5173

++static int __init bfq_init(void)

5174

++{

5175

++	int ret;

5176

++

5177

++	/*

5178

++	 * Can be 0 on HZ < 1000 setups.

5179

++	 */

5180

++	if (bfq_slice_idle == 0)

5181

++		bfq_slice_idle = 1;

5182

++

5183

++	if (bfq_timeout_async == 0)

5184

++		bfq_timeout_async = 1;

5185

++

5186

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5187

++	ret = blkcg_policy_register(&blkcg_policy_bfq);

5188

++	if (ret)

5189

++		return ret;

5190

++#endif

5191

++

5192

++	ret = -ENOMEM;

5193

++	if (bfq_slab_setup())

5194

++		goto err_pol_unreg;

5195

++

5196

++	/*

5197

++	 * Times to load large popular applications for the typical systems

5198

++	 * installed on the reference devices (see the comments before the

5199

++	 * definitions of the two arrays).

5200

++	 */

5201

++	T_slow[0] = msecs_to_jiffies(2600);

5202

++	T_slow[1] = msecs_to_jiffies(1000);

5203

++	T_fast[0] = msecs_to_jiffies(5500);

5204

++	T_fast[1] = msecs_to_jiffies(2000);

5205

++

5206

++	/*

5207

++	 * Thresholds that determine the switch between speed classes (see

5208

++	 * the comments before the definition of the array).

5209

++	 */

5210

++	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;

5211

++	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;

5212

++

5213

++	ret = elv_register(&iosched_bfq);

5214

++	if (ret)

5215

++		goto err_pol_unreg;

5216

++

5217

++	pr_info("BFQ I/O-scheduler: v7r11");

5218

++

5219

++	return 0;

5220

++

5221

++err_pol_unreg:

5222

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5223

++	blkcg_policy_unregister(&blkcg_policy_bfq);

5224

++#endif

5225

++	return ret;

5226

++}

5227

++

5228

++static void __exit bfq_exit(void)

5229

++{

5230

++	elv_unregister(&iosched_bfq);

5231

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5232

++	blkcg_policy_unregister(&blkcg_policy_bfq);

5233

++#endif

5234

++	bfq_slab_kill();

5235

++}

5236

++

5237

++module_init(bfq_init);

5238

++module_exit(bfq_exit);

5239

++

5240

++MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");

5241

++MODULE_LICENSE("GPL");

5242

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

5243

+new file mode 100644

5244

+index 0000000..a64fec1

5245

+--- /dev/null

5246

++++ b/block/bfq-sched.c

5247

+@@ -0,0 +1,1200 @@

5248

++/*

5249

++ * BFQ: Hierarchical B-WF2Q+ scheduler.

5250

++ *

5251

++ * Based on ideas and code from CFQ:

5252

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

5253

++ *

5254

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

5255

++ *		      Paolo Valente <paolo.valente@×××××××.it>

5256

++ *

5257

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

5258

++ */

5259

++

5260

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5261

++#define for_each_entity(entity)	\

5262

++	for (; entity ; entity = entity->parent)

5263

++

5264

++#define for_each_entity_safe(entity, parent) \

5265

++	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

5266

++

5267

++

5268

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

5269

++						 int extract,

5270

++						 struct bfq_data *bfqd);

5271

++

5272

++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

5273

++

5274

++static void bfq_update_budget(struct bfq_entity *next_in_service)

5275

++{

5276

++	struct bfq_entity *bfqg_entity;

5277

++	struct bfq_group *bfqg;

5278

++	struct bfq_sched_data *group_sd;

5279

++

5280

++	BUG_ON(!next_in_service);

5281

++

5282

++	group_sd = next_in_service->sched_data;

5283

++

5284

++	bfqg = container_of(group_sd, struct bfq_group, sched_data);

5285

++	/*

5286

++	 * bfq_group's my_entity field is not NULL only if the group

5287

++	 * is not the root group. We must not touch the root entity

5288

++	 * as it must never become an in-service entity.

5289

++	 */

5290

++	bfqg_entity = bfqg->my_entity;

5291

++	if (bfqg_entity)

5292

++		bfqg_entity->budget = next_in_service->budget;

5293

++}

5294

++

5295

++static int bfq_update_next_in_service(struct bfq_sched_data *sd)

5296

++{

5297

++	struct bfq_entity *next_in_service;

5298

++

5299

++	if (sd->in_service_entity)

5300

++		/* will update/requeue at the end of service */

5301

++		return 0;

5302

++

5303

++	/*

5304

++	 * NOTE: this can be improved in many ways, such as returning

5305

++	 * 1 (and thus propagating upwards the update) only when the

5306

++	 * budget changes, or caching the bfqq that will be scheduled

5307

++	 * next from this subtree.  For now we worry more about

5308

++	 * correctness than about performance...

5309

++	 */

5310

++	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);

5311

++	sd->next_in_service = next_in_service;

5312

++

5313

++	if (next_in_service)

5314

++		bfq_update_budget(next_in_service);

5315

++

5316

++	return 1;

5317

++}

5318

++

5319

++static void bfq_check_next_in_service(struct bfq_sched_data *sd,

5320

++				      struct bfq_entity *entity)

5321

++{

5322

++	BUG_ON(sd->next_in_service != entity);

5323

++}

5324

++#else

5325

++#define for_each_entity(entity)	\

5326

++	for (; entity ; entity = NULL)

5327

++

5328

++#define for_each_entity_safe(entity, parent) \

5329

++	for (parent = NULL; entity ; entity = parent)

5330

++

5331

++static int bfq_update_next_in_service(struct bfq_sched_data *sd)

5332

++{

5333

++	return 0;

5334

++}

5335

++

5336

++static void bfq_check_next_in_service(struct bfq_sched_data *sd,

5337

++				      struct bfq_entity *entity)

5338

++{

5339

++}

5340

++

5341

++static void bfq_update_budget(struct bfq_entity *next_in_service)

5342

++{

5343

++}

5344

++#endif

5345

++

5346

++/*

5347

++ * Shift for timestamp calculations.  This actually limits the maximum

5348

++ * service allowed in one timestamp delta (small shift values increase it),

5349

++ * the maximum total weight that can be used for the queues in the system

5350

++ * (big shift values increase it), and the period of virtual time

5351

++ * wraparounds.

5352

++ */

5353

++#define WFQ_SERVICE_SHIFT	22

5354

++

5355

++/**

5356

++ * bfq_gt - compare two timestamps.

5357

++ * @a: first ts.

5358

++ * @b: second ts.

5359

++ *

5360

++ * Return @a > @b, dealing with wrapping correctly.

5361

++ */

5362

++static int bfq_gt(u64 a, u64 b)

5363

++{

5364

++	return (s64)(a - b) > 0;

5365

++}

5366

++

5367

++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)

5368

++{

5369

++	struct bfq_queue *bfqq = NULL;

5370

++

5371

++	BUG_ON(!entity);

5372

++

5373

++	if (!entity->my_sched_data)

5374

++		bfqq = container_of(entity, struct bfq_queue, entity);

5375

++

5376

++	return bfqq;

5377

++}

5378

++

5379

++

5380

++/**

5381

++ * bfq_delta - map service into the virtual time domain.

5382

++ * @service: amount of service.

5383

++ * @weight: scale factor (weight of an entity or weight sum).

5384

++ */

5385

++static u64 bfq_delta(unsigned long service, unsigned long weight)

5386

++{

5387

++	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

5388

++

5389

++	do_div(d, weight);

5390

++	return d;

5391

++}

5392

++

5393

++/**

5394

++ * bfq_calc_finish - assign the finish time to an entity.

5395

++ * @entity: the entity to act upon.

5396

++ * @service: the service to be charged to the entity.

5397

++ */

5398

++static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)

5399

++{

5400

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5401

++

5402

++	BUG_ON(entity->weight == 0);

5403

++

5404

++	entity->finish = entity->start +

5405

++		bfq_delta(service, entity->weight);

5406

++

5407

++	if (bfqq) {

5408

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

5409

++			"calc_finish: serv %lu, w %d",

5410

++			service, entity->weight);

5411

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

5412

++			"calc_finish: start %llu, finish %llu, delta %llu",

5413

++			entity->start, entity->finish,

5414

++			bfq_delta(service, entity->weight));

5415

++	}

5416

++}

5417

++

5418

++/**

5419

++ * bfq_entity_of - get an entity from a node.

5420

++ * @node: the node field of the entity.

5421

++ *

5422

++ * Convert a node pointer to the corresponding entity.  This is used only

5423

++ * to simplify the logic of some functions and not as the generic

5424

++ * conversion mechanism because, e.g., in the tree walking functions,

5425

++ * the check for a %NULL value would be redundant.

5426

++ */

5427

++static struct bfq_entity *bfq_entity_of(struct rb_node *node)

5428

++{

5429

++	struct bfq_entity *entity = NULL;

5430

++

5431

++	if (node)

5432

++		entity = rb_entry(node, struct bfq_entity, rb_node);

5433

++

5434

++	return entity;

5435

++}

5436

++

5437

++/**

5438

++ * bfq_extract - remove an entity from a tree.

5439

++ * @root: the tree root.

5440

++ * @entity: the entity to remove.

5441

++ */

5442

++static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)

5443

++{

5444

++	BUG_ON(entity->tree != root);

5445

++

5446

++	entity->tree = NULL;

5447

++	rb_erase(&entity->rb_node, root);

5448

++}

5449

++

5450

++/**

5451

++ * bfq_idle_extract - extract an entity from the idle tree.

5452

++ * @st: the service tree of the owning @entity.

5453

++ * @entity: the entity being removed.

5454

++ */

5455

++static void bfq_idle_extract(struct bfq_service_tree *st,

5456

++			     struct bfq_entity *entity)

5457

++{

5458

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5459

++	struct rb_node *next;

5460

++

5461

++	BUG_ON(entity->tree != &st->idle);

5462

++

5463

++	if (entity == st->first_idle) {

5464

++		next = rb_next(&entity->rb_node);

5465

++		st->first_idle = bfq_entity_of(next);

5466

++	}

5467

++

5468

++	if (entity == st->last_idle) {

5469

++		next = rb_prev(&entity->rb_node);

5470

++		st->last_idle = bfq_entity_of(next);

5471

++	}

5472

++

5473

++	bfq_extract(&st->idle, entity);

5474

++

5475

++	if (bfqq)

5476

++		list_del(&bfqq->bfqq_list);

5477

++}

5478

++

5479

++/**

5480

++ * bfq_insert - generic tree insertion.

5481

++ * @root: tree root.

5482

++ * @entity: entity to insert.

5483

++ *

5484

++ * This is used for the idle and the active tree, since they are both

5485

++ * ordered by finish time.

5486

++ */

5487

++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)

5488

++{

5489

++	struct bfq_entity *entry;

5490

++	struct rb_node **node = &root->rb_node;

5491

++	struct rb_node *parent = NULL;

5492

++

5493

++	BUG_ON(entity->tree);

5494

++

5495

++	while (*node) {

5496

++		parent = *node;

5497

++		entry = rb_entry(parent, struct bfq_entity, rb_node);

5498

++

5499

++		if (bfq_gt(entry->finish, entity->finish))

5500

++			node = &parent->rb_left;

5501

++		else

5502

++			node = &parent->rb_right;

5503

++	}

5504

++

5505

++	rb_link_node(&entity->rb_node, parent, node);

5506

++	rb_insert_color(&entity->rb_node, root);

5507

++

5508

++	entity->tree = root;

5509

++}

5510

++

5511

++/**

5512

++ * bfq_update_min - update the min_start field of an entity.

5513

++ * @entity: the entity to update.

5514

++ * @node: one of its children.

5515

++ *

5516

++ * This function is called when @entity may store an invalid value for

5517

++ * min_start due to updates to the active tree.  The function assumes

5518

++ * that the subtree rooted at @node (which may be its left or its right

5519

++ * child) has a valid min_start value.

5520

++ */

5521

++static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)

5522

++{

5523

++	struct bfq_entity *child;

5524

++

5525

++	if (node) {

5526

++		child = rb_entry(node, struct bfq_entity, rb_node);

5527

++		if (bfq_gt(entity->min_start, child->min_start))

5528

++			entity->min_start = child->min_start;

5529

++	}

5530

++}

5531

++

5532

++/**

5533

++ * bfq_update_active_node - recalculate min_start.

5534

++ * @node: the node to update.

5535

++ *

5536

++ * @node may have changed position or one of its children may have moved;

5537

++ * this function updates its min_start value.  The left and right subtrees

5538

++ * are assumed to hold a correct min_start value.

5539

++ */

5540

++static void bfq_update_active_node(struct rb_node *node)

5541

++{

5542

++	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

5543

++

5544

++	entity->min_start = entity->start;

5545

++	bfq_update_min(entity, node->rb_right);

5546

++	bfq_update_min(entity, node->rb_left);

5547

++}

5548

++

5549

++/**

5550

++ * bfq_update_active_tree - update min_start for the whole active tree.

5551

++ * @node: the starting node.

5552

++ *

5553

++ * @node must be the deepest modified node after an update.  This function

5554

++ * updates its min_start using the values held by its children, assuming

5555

++ * that they did not change, and then updates all the nodes that may have

5556

++ * changed in the path to the root.  The only nodes that may have changed

5557

++ * are the ones in the path or their siblings.

5558

++ */

5559

++static void bfq_update_active_tree(struct rb_node *node)

5560

++{

5561

++	struct rb_node *parent;

5562

++

5563

++up:

5564

++	bfq_update_active_node(node);

5565

++

5566

++	parent = rb_parent(node);

5567

++	if (!parent)

5568

++		return;

5569

++

5570

++	if (node == parent->rb_left && parent->rb_right)

5571

++		bfq_update_active_node(parent->rb_right);

5572

++	else if (parent->rb_left)

5573

++		bfq_update_active_node(parent->rb_left);

5574

++

5575

++	node = parent;

5576

++	goto up;

5577

++}

5578

++

5579

++static void bfq_weights_tree_add(struct bfq_data *bfqd,

5580

++				 struct bfq_entity *entity,

5581

++				 struct rb_root *root);

5582

++

5583

++static void bfq_weights_tree_remove(struct bfq_data *bfqd,

5584

++				    struct bfq_entity *entity,

5585

++				    struct rb_root *root);

5586

++

5587

++

5588

++/**

5589

++ * bfq_active_insert - insert an entity in the active tree of its

5590

++ *                     group/device.

5591

++ * @st: the service tree of the entity.

5592

++ * @entity: the entity being inserted.

5593

++ *

5594

++ * The active tree is ordered by finish time, but an extra key is kept

5595

++ * for each node, containing the minimum value for the start times of

5596

++ * its children (and the node itself), so it's possible to search for

5597

++ * the eligible node with the lowest finish time in logarithmic time.

5598

++ */

5599

++static void bfq_active_insert(struct bfq_service_tree *st,

5600

++			      struct bfq_entity *entity)

5601

++{

5602

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5603

++	struct rb_node *node = &entity->rb_node;

5604

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5605

++	struct bfq_sched_data *sd = NULL;

5606

++	struct bfq_group *bfqg = NULL;

5607

++	struct bfq_data *bfqd = NULL;

5608

++#endif

5609

++

5610

++	bfq_insert(&st->active, entity);

5611

++

5612

++	if (node->rb_left)

5613

++		node = node->rb_left;

5614

++	else if (node->rb_right)

5615

++		node = node->rb_right;

5616

++

5617

++	bfq_update_active_tree(node);

5618

++

5619

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5620

++	sd = entity->sched_data;

5621

++	bfqg = container_of(sd, struct bfq_group, sched_data);

5622

++	BUG_ON(!bfqg);

5623

++	bfqd = (struct bfq_data *)bfqg->bfqd;

5624

++#endif

5625

++	if (bfqq)

5626

++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);

5627

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5628

++	else { /* bfq_group */

5629

++		BUG_ON(!bfqd);

5630

++		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);

5631

++	}

5632

++	if (bfqg != bfqd->root_group) {

5633

++		BUG_ON(!bfqg);

5634

++		BUG_ON(!bfqd);

5635

++		bfqg->active_entities++;

5636

++		if (bfqg->active_entities == 2)

5637

++			bfqd->active_numerous_groups++;

5638

++	}

5639

++#endif

5640

++}

5641

++

5642

++/**

5643

++ * bfq_ioprio_to_weight - calc a weight from an ioprio.

5644

++ * @ioprio: the ioprio value to convert.

5645

++ */

5646

++static unsigned short bfq_ioprio_to_weight(int ioprio)

5647

++{

5648

++	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);

5649

++	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio;

5650

++}

5651

++

5652

++/**

5653

++ * bfq_weight_to_ioprio - calc an ioprio from a weight.

5654

++ * @weight: the weight value to convert.

5655

++ *

5656

++ * To preserve as much as possible the old only-ioprio user interface,

5657

++ * 0 is used as an escape ioprio value for weights (numerically) equal or

5658

++ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.

5659

++ */

5660

++static unsigned short bfq_weight_to_ioprio(int weight)

5661

++{

5662

++	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);

5663

++	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?

5664

++		0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;

5665

++}

5666

++

5667

++static void bfq_get_entity(struct bfq_entity *entity)

5668

++{

5669

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5670

++

5671

++	if (bfqq) {

5672

++		atomic_inc(&bfqq->ref);

5673

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",

5674

++			     bfqq, atomic_read(&bfqq->ref));

5675

++	}

5676

++}

5677

++

5678

++/**

5679

++ * bfq_find_deepest - find the deepest node that an extraction can modify.

5680

++ * @node: the node being removed.

5681

++ *

5682

++ * Do the first step of an extraction in an rb tree, looking for the

5683

++ * node that will replace @node, and returning the deepest node that

5684

++ * the following modifications to the tree can touch.  If @node is the

5685

++ * last node in the tree return %NULL.

5686

++ */

5687

++static struct rb_node *bfq_find_deepest(struct rb_node *node)

5688

++{

5689

++	struct rb_node *deepest;

5690

++

5691

++	if (!node->rb_right && !node->rb_left)

5692

++		deepest = rb_parent(node);

5693

++	else if (!node->rb_right)

5694

++		deepest = node->rb_left;

5695

++	else if (!node->rb_left)

5696

++		deepest = node->rb_right;

5697

++	else {

5698

++		deepest = rb_next(node);

5699

++		if (deepest->rb_right)

5700

++			deepest = deepest->rb_right;

5701

++		else if (rb_parent(deepest) != node)

5702

++			deepest = rb_parent(deepest);

5703

++	}

5704

++

5705

++	return deepest;

5706

++}

5707

++

5708

++/**

5709

++ * bfq_active_extract - remove an entity from the active tree.

5710

++ * @st: the service_tree containing the tree.

5711

++ * @entity: the entity being removed.

5712

++ */

5713

++static void bfq_active_extract(struct bfq_service_tree *st,

5714

++			       struct bfq_entity *entity)

5715

++{

5716

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5717

++	struct rb_node *node;

5718

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5719

++	struct bfq_sched_data *sd = NULL;

5720

++	struct bfq_group *bfqg = NULL;

5721

++	struct bfq_data *bfqd = NULL;

5722

++#endif

5723

++

5724

++	node = bfq_find_deepest(&entity->rb_node);

5725

++	bfq_extract(&st->active, entity);

5726

++

5727

++	if (node)

5728

++		bfq_update_active_tree(node);

5729

++

5730

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5731

++	sd = entity->sched_data;

5732

++	bfqg = container_of(sd, struct bfq_group, sched_data);

5733

++	BUG_ON(!bfqg);

5734

++	bfqd = (struct bfq_data *)bfqg->bfqd;

5735

++#endif

5736

++	if (bfqq)

5737

++		list_del(&bfqq->bfqq_list);

5738

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5739

++	else { /* bfq_group */

5740

++		BUG_ON(!bfqd);

5741

++		bfq_weights_tree_remove(bfqd, entity,

5742

++					&bfqd->group_weights_tree);

5743

++	}

5744

++	if (bfqg != bfqd->root_group) {

5745

++		BUG_ON(!bfqg);

5746

++		BUG_ON(!bfqd);

5747

++		BUG_ON(!bfqg->active_entities);

5748

++		bfqg->active_entities--;

5749

++		if (bfqg->active_entities == 1) {

5750

++			BUG_ON(!bfqd->active_numerous_groups);

5751

++			bfqd->active_numerous_groups--;

5752

++		}

5753

++	}

5754

++#endif

5755

++}

5756

++

5757

++/**

5758

++ * bfq_idle_insert - insert an entity into the idle tree.

5759

++ * @st: the service tree containing the tree.

5760

++ * @entity: the entity to insert.

5761

++ */

5762

++static void bfq_idle_insert(struct bfq_service_tree *st,

5763

++			    struct bfq_entity *entity)

5764

++{

5765

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5766

++	struct bfq_entity *first_idle = st->first_idle;

5767

++	struct bfq_entity *last_idle = st->last_idle;

5768

++

5769

++	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))

5770

++		st->first_idle = entity;

5771

++	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))

5772

++		st->last_idle = entity;

5773

++

5774

++	bfq_insert(&st->idle, entity);

5775

++

5776

++	if (bfqq)

5777

++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);

5778

++}

5779

++

5780

++/**

5781

++ * bfq_forget_entity - remove an entity from the wfq trees.

5782

++ * @st: the service tree.

5783

++ * @entity: the entity being removed.

5784

++ *

5785

++ * Update the device status and forget everything about @entity, putting

5786

++ * the device reference to it, if it is a queue.  Entities belonging to

5787

++ * groups are not refcounted.

5788

++ */

5789

++static void bfq_forget_entity(struct bfq_service_tree *st,

5790

++			      struct bfq_entity *entity)

5791

++{

5792

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5793

++	struct bfq_sched_data *sd;

5794

++

5795

++	BUG_ON(!entity->on_st);

5796

++

5797

++	entity->on_st = 0;

5798

++	st->wsum -= entity->weight;

5799

++	if (bfqq) {

5800

++		sd = entity->sched_data;

5801

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",

5802

++			     bfqq, atomic_read(&bfqq->ref));

5803

++		bfq_put_queue(bfqq);

5804

++	}

5805

++}

5806

++

5807

++/**

5808

++ * bfq_put_idle_entity - release the idle tree ref of an entity.

5809

++ * @st: service tree for the entity.

5810

++ * @entity: the entity being released.

5811

++ */

5812

++static void bfq_put_idle_entity(struct bfq_service_tree *st,

5813

++				struct bfq_entity *entity)

5814

++{

5815

++	bfq_idle_extract(st, entity);

5816

++	bfq_forget_entity(st, entity);

5817

++}

5818

++

5819

++/**

5820

++ * bfq_forget_idle - update the idle tree if necessary.

5821

++ * @st: the service tree to act upon.

5822

++ *

5823

++ * To preserve the global O(log N) complexity we only remove one entry here;

5824

++ * as the idle tree will not grow indefinitely this can be done safely.

5825

++ */

5826

++static void bfq_forget_idle(struct bfq_service_tree *st)

5827

++{

5828

++	struct bfq_entity *first_idle = st->first_idle;

5829

++	struct bfq_entity *last_idle = st->last_idle;

5830

++

5831

++	if (RB_EMPTY_ROOT(&st->active) && last_idle &&

5832

++	    !bfq_gt(last_idle->finish, st->vtime)) {

5833

++		/*

5834

++		 * Forget the whole idle tree, increasing the vtime past

5835

++		 * the last finish time of idle entities.

5836

++		 */

5837

++		st->vtime = last_idle->finish;

5838

++	}

5839

++

5840

++	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))

5841

++		bfq_put_idle_entity(st, first_idle);

5842

++}

5843

++

5844

++static struct bfq_service_tree *

5845

++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,

5846

++			 struct bfq_entity *entity)

5847

++{

5848

++	struct bfq_service_tree *new_st = old_st;

5849

++

5850

++	if (entity->prio_changed) {

5851

++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5852

++		unsigned short prev_weight, new_weight;

5853

++		struct bfq_data *bfqd = NULL;

5854

++		struct rb_root *root;

5855

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5856

++		struct bfq_sched_data *sd;

5857

++		struct bfq_group *bfqg;

5858

++#endif

5859

++

5860

++		if (bfqq)

5861

++			bfqd = bfqq->bfqd;

5862

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5863

++		else {

5864

++			sd = entity->my_sched_data;

5865

++			bfqg = container_of(sd, struct bfq_group, sched_data);

5866

++			BUG_ON(!bfqg);

5867

++			bfqd = (struct bfq_data *)bfqg->bfqd;

5868

++			BUG_ON(!bfqd);

5869

++		}

5870

++#endif

5871

++

5872

++		BUG_ON(old_st->wsum < entity->weight);

5873

++		old_st->wsum -= entity->weight;

5874

++

5875

++		if (entity->new_weight != entity->orig_weight) {

5876

++			if (entity->new_weight < BFQ_MIN_WEIGHT ||

5877

++			    entity->new_weight > BFQ_MAX_WEIGHT) {

5878

++				printk(KERN_CRIT "update_weight_prio: "

5879

++						 "new_weight %d\n",

5880

++					entity->new_weight);

5881

++				BUG();

5882

++			}

5883

++			entity->orig_weight = entity->new_weight;

5884

++			if (bfqq)

5885

++				bfqq->ioprio =

5886

++				  bfq_weight_to_ioprio(entity->orig_weight);

5887

++		}

5888

++

5889

++		if (bfqq)

5890

++			bfqq->ioprio_class = bfqq->new_ioprio_class;

5891

++		entity->prio_changed = 0;

5892

++

5893

++		/*

5894

++		 * NOTE: here we may be changing the weight too early,

5895

++		 * this will cause unfairness.  The correct approach

5896

++		 * would have required additional complexity to defer

5897

++		 * weight changes to the proper time instants (i.e.,

5898

++		 * when entity->finish <= old_st->vtime).

5899

++		 */

5900

++		new_st = bfq_entity_service_tree(entity);

5901

++

5902

++		prev_weight = entity->weight;

5903

++		new_weight = entity->orig_weight *

5904

++			     (bfqq ? bfqq->wr_coeff : 1);

5905

++		/*

5906

++		 * If the weight of the entity changes, remove the entity

5907

++		 * from its old weight counter (if there is a counter

5908

++		 * associated with the entity), and add it to the counter

5909

++		 * associated with its new weight.

5910

++		 */

5911

++		if (prev_weight != new_weight) {

5912

++			root = bfqq ? &bfqd->queue_weights_tree :

5913

++				      &bfqd->group_weights_tree;

5914

++			bfq_weights_tree_remove(bfqd, entity, root);

5915

++		}

5916

++		entity->weight = new_weight;

5917

++		/*

5918

++		 * Add the entity to its weights tree only if it is

5919

++		 * not associated with a weight-raised queue.

5920

++		 */

5921

++		if (prev_weight != new_weight &&

5922

++		    (bfqq ? bfqq->wr_coeff == 1 : 1))

5923

++			/* If we get here, root has been initialized. */

5924

++			bfq_weights_tree_add(bfqd, entity, root);

5925

++

5926

++		new_st->wsum += entity->weight;

5927

++

5928

++		if (new_st != old_st)

5929

++			entity->start = new_st->vtime;

5930

++	}

5931

++

5932

++	return new_st;

5933

++}

5934

++

5935

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5936

++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);

5937

++#endif

5938

++

5939

++/**

5940

++ * bfq_bfqq_served - update the scheduler status after selection for

5941

++ *                   service.

5942

++ * @bfqq: the queue being served.

5943

++ * @served: bytes to transfer.

5944

++ *

5945

++ * NOTE: this can be optimized, as the timestamps of upper level entities

5946

++ * are synchronized every time a new bfqq is selected for service.  By now,

5947

++ * we keep it to better check consistency.

5948

++ */

5949

++static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)

5950

++{

5951

++	struct bfq_entity *entity = &bfqq->entity;

5952

++	struct bfq_service_tree *st;

5953

++

5954

++	for_each_entity(entity) {

5955

++		st = bfq_entity_service_tree(entity);

5956

++

5957

++		entity->service += served;

5958

++		BUG_ON(entity->service > entity->budget);

5959

++		BUG_ON(st->wsum == 0);

5960

++

5961

++		st->vtime += bfq_delta(served, st->wsum);

5962

++		bfq_forget_idle(st);

5963

++	}

5964

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5965

++	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));

5966

++#endif

5967

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);

5968

++}

5969

++

5970

++/**

5971

++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.

5972

++ * @bfqq: the queue that needs a service update.

5973

++ *

5974

++ * When it's not possible to be fair in the service domain, because

5975

++ * a queue is not consuming its budget fast enough (the meaning of

5976

++ * fast depends on the timeout parameter), we charge it a full

5977

++ * budget.  In this way we should obtain a sort of time-domain

5978

++ * fairness among all the seeky/slow queues.

5979

++ */

5980

++static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)

5981

++{

5982

++	struct bfq_entity *entity = &bfqq->entity;

5983

++

5984

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");

5985

++

5986

++	bfq_bfqq_served(bfqq, entity->budget - entity->service);

5987

++}

5988

++

5989

++/**

5990

++ * __bfq_activate_entity - activate an entity.

5991

++ * @entity: the entity being activated.

5992

++ *

5993

++ * Called whenever an entity is activated, i.e., it is not active and one

5994

++ * of its children receives a new request, or has to be reactivated due to

5995

++ * budget exhaustion.  It uses the current budget of the entity (and the

5996

++ * service received if @entity is active) of the queue to calculate its

5997

++ * timestamps.

5998

++ */

5999

++static void __bfq_activate_entity(struct bfq_entity *entity)

6000

++{

6001

++	struct bfq_sched_data *sd = entity->sched_data;

6002

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

6003

++

6004

++	if (entity == sd->in_service_entity) {

6005

++		BUG_ON(entity->tree);

6006

++		/*

6007

++		 * If we are requeueing the current entity we have

6008

++		 * to take care of not charging to it service it has

6009

++		 * not received.

6010

++		 */

6011

++		bfq_calc_finish(entity, entity->service);

6012

++		entity->start = entity->finish;

6013

++		sd->in_service_entity = NULL;

6014

++	} else if (entity->tree == &st->active) {

6015

++		/*

6016

++		 * Requeueing an entity due to a change of some

6017

++		 * next_in_service entity below it.  We reuse the

6018

++		 * old start time.

6019

++		 */

6020

++		bfq_active_extract(st, entity);

6021

++	} else if (entity->tree == &st->idle) {

6022

++		/*

6023

++		 * Must be on the idle tree, bfq_idle_extract() will

6024

++		 * check for that.

6025

++		 */

6026

++		bfq_idle_extract(st, entity);

6027

++		entity->start = bfq_gt(st->vtime, entity->finish) ?

6028

++				       st->vtime : entity->finish;

6029

++	} else {

6030

++		/*

6031

++		 * The finish time of the entity may be invalid, and

6032

++		 * it is in the past for sure, otherwise the queue

6033

++		 * would have been on the idle tree.

6034

++		 */

6035

++		entity->start = st->vtime;

6036

++		st->wsum += entity->weight;

6037

++		bfq_get_entity(entity);

6038

++

6039

++		BUG_ON(entity->on_st);

6040

++		entity->on_st = 1;

6041

++	}

6042

++

6043

++	st = __bfq_entity_update_weight_prio(st, entity);

6044

++	bfq_calc_finish(entity, entity->budget);

6045

++	bfq_active_insert(st, entity);

6046

++}

6047

++

6048

++/**

6049

++ * bfq_activate_entity - activate an entity and its ancestors if necessary.

6050

++ * @entity: the entity to activate.

6051

++ *

6052

++ * Activate @entity and all the entities on the path from it to the root.

6053

++ */

6054

++static void bfq_activate_entity(struct bfq_entity *entity)

6055

++{

6056

++	struct bfq_sched_data *sd;

6057

++

6058

++	for_each_entity(entity) {

6059

++		__bfq_activate_entity(entity);

6060

++

6061

++		sd = entity->sched_data;

6062

++		if (!bfq_update_next_in_service(sd))

6063

++			/*

6064

++			 * No need to propagate the activation to the

6065

++			 * upper entities, as they will be updated when

6066

++			 * the in-service entity is rescheduled.

6067

++			 */

6068

++			break;

6069

++	}

6070

++}

6071

++

6072

++/**

6073

++ * __bfq_deactivate_entity - deactivate an entity from its service tree.

6074

++ * @entity: the entity to deactivate.

6075

++ * @requeue: if false, the entity will not be put into the idle tree.

6076

++ *

6077

++ * Deactivate an entity, independently from its previous state.  If the

6078

++ * entity was not on a service tree just return, otherwise if it is on

6079

++ * any scheduler tree, extract it from that tree, and if necessary

6080

++ * and if the caller did specify @requeue, put it on the idle tree.

6081

++ *

6082

++ * Return %1 if the caller should update the entity hierarchy, i.e.,

6083

++ * if the entity was in service or if it was the next_in_service for

6084

++ * its sched_data; return %0 otherwise.

6085

++ */

6086

++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

6087

++{

6088

++	struct bfq_sched_data *sd = entity->sched_data;

6089

++	struct bfq_service_tree *st;

6090

++	int was_in_service;

6091

++	int ret = 0;

6092

++

6093

++	if (sd == NULL || !entity->on_st) /* never activated, or inactive */

6094

++		return 0;

6095

++

6096

++	st = bfq_entity_service_tree(entity);

6097

++	was_in_service = entity == sd->in_service_entity;

6098

++

6099

++	BUG_ON(was_in_service && entity->tree);

6100

++

6101

++	if (was_in_service) {

6102

++		bfq_calc_finish(entity, entity->service);

6103

++		sd->in_service_entity = NULL;

6104

++	} else if (entity->tree == &st->active)

6105

++		bfq_active_extract(st, entity);

6106

++	else if (entity->tree == &st->idle)

6107

++		bfq_idle_extract(st, entity);

6108

++	else if (entity->tree)

6109

++		BUG();

6110

++

6111

++	if (was_in_service || sd->next_in_service == entity)

6112

++		ret = bfq_update_next_in_service(sd);

6113

++

6114

++	if (!requeue || !bfq_gt(entity->finish, st->vtime))

6115

++		bfq_forget_entity(st, entity);

6116

++	else

6117

++		bfq_idle_insert(st, entity);

6118

++

6119

++	BUG_ON(sd->in_service_entity == entity);

6120

++	BUG_ON(sd->next_in_service == entity);

6121

++

6122

++	return ret;

6123

++}

6124

++

6125

++/**

6126

++ * bfq_deactivate_entity - deactivate an entity.

6127

++ * @entity: the entity to deactivate.

6128

++ * @requeue: true if the entity can be put on the idle tree

6129

++ */

6130

++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

6131

++{

6132

++	struct bfq_sched_data *sd;

6133

++	struct bfq_entity *parent;

6134

++

6135

++	for_each_entity_safe(entity, parent) {

6136

++		sd = entity->sched_data;

6137

++

6138

++		if (!__bfq_deactivate_entity(entity, requeue))

6139

++			/*

6140

++			 * The parent entity is still backlogged, and

6141

++			 * we don't need to update it as it is still

6142

++			 * in service.

6143

++			 */

6144

++			break;

6145

++

6146

++		if (sd->next_in_service)

6147

++			/*

6148

++			 * The parent entity is still backlogged and

6149

++			 * the budgets on the path towards the root

6150

++			 * need to be updated.

6151

++			 */

6152

++			goto update;

6153

++

6154

++		/*

6155

++		 * If we get here, the parent is no longer backlogged and

6156

++		 * we want to propagate the dequeue upwards.

6157

++		 */

6158

++		requeue = 1;

6159

++	}

6160

++

6161

++	return;

6162

++

6163

++update:

6164

++	entity = parent;

6165

++	for_each_entity(entity) {

6166

++		__bfq_activate_entity(entity);

6167

++

6168

++		sd = entity->sched_data;

6169

++		if (!bfq_update_next_in_service(sd))

6170

++			break;

6171

++	}

6172

++}

6173

++

6174

++/**

6175

++ * bfq_update_vtime - update vtime if necessary.

6176

++ * @st: the service tree to act upon.

6177

++ *

6178

++ * If necessary update the service tree vtime to have at least one

6179

++ * eligible entity, skipping to its start time.  Assumes that the

6180

++ * active tree of the device is not empty.

6181

++ *

6182

++ * NOTE: this hierarchical implementation updates vtimes quite often,

6183

++ * we may end up with reactivated processes getting timestamps after a

6184

++ * vtime skip done because we needed a ->first_active entity on some

6185

++ * intermediate node.

6186

++ */

6187

++static void bfq_update_vtime(struct bfq_service_tree *st)

6188

++{

6189

++	struct bfq_entity *entry;

6190

++	struct rb_node *node = st->active.rb_node;

6191

++

6192

++	entry = rb_entry(node, struct bfq_entity, rb_node);

6193

++	if (bfq_gt(entry->min_start, st->vtime)) {

6194

++		st->vtime = entry->min_start;

6195

++		bfq_forget_idle(st);

6196

++	}

6197

++}

6198

++

6199

++/**

6200

++ * bfq_first_active_entity - find the eligible entity with

6201

++ *                           the smallest finish time

6202

++ * @st: the service tree to select from.

6203

++ *

6204

++ * This function searches the first schedulable entity, starting from the

6205

++ * root of the tree and going on the left every time on this side there is

6206

++ * a subtree with at least one eligible (start <= vtime) entity. The path on

6207

++ * the right is followed only if a) the left subtree contains no eligible

6208

++ * entities and b) no eligible entity has been found yet.

6209

++ */

6210

++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)

6211

++{

6212

++	struct bfq_entity *entry, *first = NULL;

6213

++	struct rb_node *node = st->active.rb_node;

6214

++

6215

++	while (node) {

6216

++		entry = rb_entry(node, struct bfq_entity, rb_node);

6217

++left:

6218

++		if (!bfq_gt(entry->start, st->vtime))

6219

++			first = entry;

6220

++

6221

++		BUG_ON(bfq_gt(entry->min_start, st->vtime));

6222

++

6223

++		if (node->rb_left) {

6224

++			entry = rb_entry(node->rb_left,

6225

++					 struct bfq_entity, rb_node);

6226

++			if (!bfq_gt(entry->min_start, st->vtime)) {

6227

++				node = node->rb_left;

6228

++				goto left;

6229

++			}

6230

++		}

6231

++		if (first)

6232

++			break;

6233

++		node = node->rb_right;

6234

++	}

6235

++

6236

++	BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));

6237

++	return first;

6238

++}

6239

++

6240

++/**

6241

++ * __bfq_lookup_next_entity - return the first eligible entity in @st.

6242

++ * @st: the service tree.

6243

++ *

6244

++ * Update the virtual time in @st and return the first eligible entity

6245

++ * it contains.

6246

++ */

6247

++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,

6248

++						   bool force)

6249

++{

6250

++	struct bfq_entity *entity, *new_next_in_service = NULL;

6251

++

6252

++	if (RB_EMPTY_ROOT(&st->active))

6253

++		return NULL;

6254

++

6255

++	bfq_update_vtime(st);

6256

++	entity = bfq_first_active_entity(st);

6257

++	BUG_ON(bfq_gt(entity->start, st->vtime));

6258

++

6259

++	/*

6260

++	 * If the chosen entity does not match with the sched_data's

6261

++	 * next_in_service and we are forcedly serving the IDLE priority

6262

++	 * class tree, bubble up budget update.

6263

++	 */

6264

++	if (unlikely(force && entity != entity->sched_data->next_in_service)) {

6265

++		new_next_in_service = entity;

6266

++		for_each_entity(new_next_in_service)

6267

++			bfq_update_budget(new_next_in_service);

6268

++	}

6269

++

6270

++	return entity;

6271

++}

6272

++

6273

++/**

6274

++ * bfq_lookup_next_entity - return the first eligible entity in @sd.

6275

++ * @sd: the sched_data.

6276

++ * @extract: if true the returned entity will be also extracted from @sd.

6277

++ *

6278

++ * NOTE: since we cache the next_in_service entity at each level of the

6279

++ * hierarchy, the complexity of the lookup can be decreased with

6280

++ * absolutely no effort just returning the cached next_in_service value;

6281

++ * we prefer to do full lookups to test the consistency of the data

6282

++ * structures.

6283

++ */

6284

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

6285

++						 int extract,

6286

++						 struct bfq_data *bfqd)

6287

++{

6288

++	struct bfq_service_tree *st = sd->service_tree;

6289

++	struct bfq_entity *entity;

6290

++	int i = 0;

6291

++

6292

++	BUG_ON(sd->in_service_entity);

6293

++

6294

++	if (bfqd &&

6295

++	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {

6296

++		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,

6297

++						  true);

6298

++		if (entity) {

6299

++			i = BFQ_IOPRIO_CLASSES - 1;

6300

++			bfqd->bfq_class_idle_last_service = jiffies;

6301

++			sd->next_in_service = entity;

6302

++		}

6303

++	}

6304

++	for (; i < BFQ_IOPRIO_CLASSES; i++) {

6305

++		entity = __bfq_lookup_next_entity(st + i, false);

6306

++		if (entity) {

6307

++			if (extract) {

6308

++				bfq_check_next_in_service(sd, entity);

6309

++				bfq_active_extract(st + i, entity);

6310

++				sd->in_service_entity = entity;

6311

++				sd->next_in_service = NULL;

6312

++			}

6313

++			break;

6314

++		}

6315

++	}

6316

++

6317

++	return entity;

6318

++}

6319

++

6320

++/*

6321

++ * Get next queue for service.

6322

++ */

6323

++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

6324

++{

6325

++	struct bfq_entity *entity = NULL;

6326

++	struct bfq_sched_data *sd;

6327

++	struct bfq_queue *bfqq;

6328

++

6329

++	BUG_ON(bfqd->in_service_queue);

6330

++

6331

++	if (bfqd->busy_queues == 0)

6332

++		return NULL;

6333

++

6334

++	sd = &bfqd->root_group->sched_data;

6335

++	for (; sd ; sd = entity->my_sched_data) {

6336

++		entity = bfq_lookup_next_entity(sd, 1, bfqd);

6337

++		BUG_ON(!entity);

6338

++		entity->service = 0;

6339

++	}

6340

++

6341

++	bfqq = bfq_entity_to_bfqq(entity);

6342

++	BUG_ON(!bfqq);

6343

++

6344

++	return bfqq;

6345

++}

6346

++

6347

++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)

6348

++{

6349

++	if (bfqd->in_service_bic) {

6350

++		put_io_context(bfqd->in_service_bic->icq.ioc);

6351

++		bfqd->in_service_bic = NULL;

6352

++	}

6353

++

6354

++	bfqd->in_service_queue = NULL;

6355

++	del_timer(&bfqd->idle_slice_timer);

6356

++}

6357

++

6358

++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6359

++				int requeue)

6360

++{

6361

++	struct bfq_entity *entity = &bfqq->entity;

6362

++

6363

++	if (bfqq == bfqd->in_service_queue)

6364

++		__bfq_bfqd_reset_in_service(bfqd);

6365

++

6366

++	bfq_deactivate_entity(entity, requeue);

6367

++}

6368

++

6369

++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

6370

++{

6371

++	struct bfq_entity *entity = &bfqq->entity;

6372

++

6373

++	bfq_activate_entity(entity);

6374

++}

6375

++

6376

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

6377

++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

6378

++#endif

6379

++

6380

++/*

6381

++ * Called when the bfqq no longer has requests pending, remove it from

6382

++ * the service tree.

6383

++ */

6384

++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6385

++			      int requeue)

6386

++{

6387

++	BUG_ON(!bfq_bfqq_busy(bfqq));

6388

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

6389

++

6390

++	bfq_log_bfqq(bfqd, bfqq, "del from busy");

6391

++

6392

++	bfq_clear_bfqq_busy(bfqq);

6393

++

6394

++	BUG_ON(bfqd->busy_queues == 0);

6395

++	bfqd->busy_queues--;

6396

++

6397

++	if (!bfqq->dispatched) {

6398

++		bfq_weights_tree_remove(bfqd, &bfqq->entity,

6399

++					&bfqd->queue_weights_tree);

6400

++		if (!blk_queue_nonrot(bfqd->queue)) {

6401

++			BUG_ON(!bfqd->busy_in_flight_queues);

6402

++			bfqd->busy_in_flight_queues--;

6403

++			if (bfq_bfqq_constantly_seeky(bfqq)) {

6404

++				BUG_ON(!bfqd->

6405

++					const_seeky_busy_in_flight_queues);

6406

++				bfqd->const_seeky_busy_in_flight_queues--;

6407

++			}

6408

++		}

6409

++	}

6410

++	if (bfqq->wr_coeff > 1)

6411

++		bfqd->wr_busy_queues--;

6412

++

6413

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

6414

++	bfqg_stats_update_dequeue(bfqq_group(bfqq));

6415

++#endif

6416

++

6417

++	bfq_deactivate_bfqq(bfqd, bfqq, requeue);

6418

++}

6419

++

6420

++/*

6421

++ * Called when an inactive queue receives a new request.

6422

++ */

6423

++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)

6424

++{

6425

++	BUG_ON(bfq_bfqq_busy(bfqq));

6426

++	BUG_ON(bfqq == bfqd->in_service_queue);

6427

++

6428

++	bfq_log_bfqq(bfqd, bfqq, "add to busy");

6429

++

6430

++	bfq_activate_bfqq(bfqd, bfqq);

6431

++

6432

++	bfq_mark_bfqq_busy(bfqq);

6433

++	bfqd->busy_queues++;

6434

++

6435

++	if (!bfqq->dispatched) {

6436

++		if (bfqq->wr_coeff == 1)

6437

++			bfq_weights_tree_add(bfqd, &bfqq->entity,

6438

++					     &bfqd->queue_weights_tree);

6439

++		if (!blk_queue_nonrot(bfqd->queue)) {

6440

++			bfqd->busy_in_flight_queues++;

6441

++			if (bfq_bfqq_constantly_seeky(bfqq))

6442

++				bfqd->const_seeky_busy_in_flight_queues++;

6443

++		}

6444

++	}

6445

++	if (bfqq->wr_coeff > 1)

6446

++		bfqd->wr_busy_queues++;

6447

++}

6448

+diff --git a/block/bfq.h b/block/bfq.h

6449

+new file mode 100644

6450

+index 0000000..485d0c9

6451

+--- /dev/null

6452

++++ b/block/bfq.h

6453

+@@ -0,0 +1,801 @@

6454

++/*

6455

++ * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes.

6456

++ *

6457

++ * Based on ideas and code from CFQ:

6458

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

6459

++ *

6460

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

6461

++ *		      Paolo Valente <paolo.valente@×××××××.it>

6462

++ *

6463

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

6464

++ */

6465

++

6466

++#ifndef _BFQ_H

6467

++#define _BFQ_H

6468

++

6469

++#include <linux/blktrace_api.h>

6470

++#include <linux/hrtimer.h>

6471

++#include <linux/ioprio.h>

6472

++#include <linux/rbtree.h>

6473

++#include <linux/blk-cgroup.h>

6474

++

6475

++#define BFQ_IOPRIO_CLASSES	3

6476

++#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

6477

++

6478

++#define BFQ_MIN_WEIGHT			1

6479

++#define BFQ_MAX_WEIGHT			1000

6480

++#define BFQ_WEIGHT_CONVERSION_COEFF	10

6481

++

6482

++#define BFQ_DEFAULT_QUEUE_IOPRIO	4

6483

++

6484

++#define BFQ_DEFAULT_GRP_WEIGHT	10

6485

++#define BFQ_DEFAULT_GRP_IOPRIO	0

6486

++#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

6487

++

6488

++struct bfq_entity;

6489

++

6490

++/**

6491

++ * struct bfq_service_tree - per ioprio_class service tree.

6492

++ * @active: tree for active entities (i.e., those backlogged).

6493

++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).

6494

++ * @first_idle: idle entity with minimum F_i.

6495

++ * @last_idle: idle entity with maximum F_i.

6496

++ * @vtime: scheduler virtual time.

6497

++ * @wsum: scheduler weight sum; active and idle entities contribute to it.

6498

++ *

6499

++ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each

6500

++ * ioprio_class has its own independent scheduler, and so its own

6501

++ * bfq_service_tree.  All the fields are protected by the queue lock

6502

++ * of the containing bfqd.

6503

++ */

6504

++struct bfq_service_tree {

6505

++	struct rb_root active;

6506

++	struct rb_root idle;

6507

++

6508

++	struct bfq_entity *first_idle;

6509

++	struct bfq_entity *last_idle;

6510

++

6511

++	u64 vtime;

6512

++	unsigned long wsum;

6513

++};

6514

++

6515

++/**

6516

++ * struct bfq_sched_data - multi-class scheduler.

6517

++ * @in_service_entity: entity in service.

6518

++ * @next_in_service: head-of-the-line entity in the scheduler.

6519

++ * @service_tree: array of service trees, one per ioprio_class.

6520

++ *

6521

++ * bfq_sched_data is the basic scheduler queue.  It supports three

6522

++ * ioprio_classes, and can be used either as a toplevel queue or as

6523

++ * an intermediate queue on a hierarchical setup.

6524

++ * @next_in_service points to the active entity of the sched_data

6525

++ * service trees that will be scheduled next.

6526

++ *

6527

++ * The supported ioprio_classes are the same as in CFQ, in descending

6528

++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.

6529

++ * Requests from higher priority queues are served before all the

6530

++ * requests from lower priority queues; among requests of the same

6531

++ * queue requests are served according to B-WF2Q+.

6532

++ * All the fields are protected by the queue lock of the containing bfqd.

6533

++ */

6534

++struct bfq_sched_data {

6535

++	struct bfq_entity *in_service_entity;

6536

++	struct bfq_entity *next_in_service;

6537

++	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];

6538

++};

6539

++

6540

++/**

6541

++ * struct bfq_weight_counter - counter of the number of all active entities

6542

++ *                             with a given weight.

6543

++ * @weight: weight of the entities that this counter refers to.

6544

++ * @num_active: number of active entities with this weight.

6545

++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree

6546

++ *                and @group_weights_tree).

6547

++ */

6548

++struct bfq_weight_counter {

6549

++	short int weight;

6550

++	unsigned int num_active;

6551

++	struct rb_node weights_node;

6552

++};

6553

++

6554

++/**

6555

++ * struct bfq_entity - schedulable entity.

6556

++ * @rb_node: service_tree member.

6557

++ * @weight_counter: pointer to the weight counter associated with this entity.

6558

++ * @on_st: flag, true if the entity is on a tree (either the active or

6559

++ *         the idle one of its service_tree).

6560

++ * @finish: B-WF2Q+ finish timestamp (aka F_i).

6561

++ * @start: B-WF2Q+ start timestamp (aka S_i).

6562

++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.

6563

++ * @min_start: minimum start time of the (active) subtree rooted at

6564

++ *             this entity; used for O(log N) lookups into active trees.

6565

++ * @service: service received during the last round of service.

6566

++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.

6567

++ * @weight: weight of the queue

6568

++ * @parent: parent entity, for hierarchical scheduling.

6569

++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the

6570

++ *                 associated scheduler queue, %NULL on leaf nodes.

6571

++ * @sched_data: the scheduler queue this entity belongs to.

6572

++ * @ioprio: the ioprio in use.

6573

++ * @new_weight: when a weight change is requested, the new weight value.

6574

++ * @orig_weight: original weight, used to implement weight boosting

6575

++ * @prio_changed: flag, true when the user requested a weight, ioprio or

6576

++ *		  ioprio_class change.

6577

++ *

6578

++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the

6579

++ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each

6580

++ * entity belongs to the sched_data of the parent group in the cgroup

6581

++ * hierarchy.  Non-leaf entities have also their own sched_data, stored

6582

++ * in @my_sched_data.

6583

++ *

6584

++ * Each entity stores independently its priority values; this would

6585

++ * allow different weights on different devices, but this

6586

++ * functionality is not exported to userspace by now.  Priorities and

6587

++ * weights are updated lazily, first storing the new values into the

6588

++ * new_* fields, then setting the @prio_changed flag.  As soon as

6589

++ * there is a transition in the entity state that allows the priority

6590

++ * update to take place the effective and the requested priority

6591

++ * values are synchronized.

6592

++ *

6593

++ * Unless cgroups are used, the weight value is calculated from the

6594

++ * ioprio to export the same interface as CFQ.  When dealing with

6595

++ * ``well-behaved'' queues (i.e., queues that do not spend too much

6596

++ * time to consume their budget and have true sequential behavior, and

6597

++ * when there are no external factors breaking anticipation) the

6598

++ * relative weights at each level of the cgroups hierarchy should be

6599

++ * guaranteed.  All the fields are protected by the queue lock of the

6600

++ * containing bfqd.

6601

++ */

6602

++struct bfq_entity {

6603

++	struct rb_node rb_node;

6604

++	struct bfq_weight_counter *weight_counter;

6605

++

6606

++	int on_st;

6607

++

6608

++	u64 finish;

6609

++	u64 start;

6610

++

6611

++	struct rb_root *tree;

6612

++

6613

++	u64 min_start;

6614

++

6615

++	int service, budget;

6616

++	unsigned short weight, new_weight;

6617

++	unsigned short orig_weight;

6618

++

6619

++	struct bfq_entity *parent;

6620

++

6621

++	struct bfq_sched_data *my_sched_data;

6622

++	struct bfq_sched_data *sched_data;

6623

++

6624

++	int prio_changed;

6625

++};

6626

++

6627

++struct bfq_group;

6628

++

6629

++/**

6630

++ * struct bfq_queue - leaf schedulable entity.

6631

++ * @ref: reference counter.

6632

++ * @bfqd: parent bfq_data.

6633

++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.

6634

++ * @ioprio_class: the ioprio_class in use.

6635

++ * @new_ioprio_class: when an ioprio_class change is requested, the new

6636

++ *                    ioprio_class value.

6637

++ * @new_bfqq: shared bfq_queue if queue is cooperating with

6638

++ *           one or more other queues.

6639

++ * @sort_list: sorted list of pending requests.

6640

++ * @next_rq: if fifo isn't expired, next request to serve.

6641

++ * @queued: nr of requests queued in @sort_list.

6642

++ * @allocated: currently allocated requests.

6643

++ * @meta_pending: pending metadata requests.

6644

++ * @fifo: fifo list of requests in sort_list.

6645

++ * @entity: entity representing this queue in the scheduler.

6646

++ * @max_budget: maximum budget allowed from the feedback mechanism.

6647

++ * @budget_timeout: budget expiration (in jiffies).

6648

++ * @dispatched: number of requests on the dispatch list or inside driver.

6649

++ * @flags: status flags.

6650

++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.

6651

++ * @burst_list_node: node for the device's burst list.

6652

++ * @seek_samples: number of seeks sampled

6653

++ * @seek_total: sum of the distances of the seeks sampled

6654

++ * @seek_mean: mean seek distance

6655

++ * @last_request_pos: position of the last request enqueued

6656

++ * @requests_within_timer: number of consecutive pairs of request completion

6657

++ *                         and arrival, such that the queue becomes idle

6658

++ *                         after the completion, but the next request arrives

6659

++ *                         within an idle time slice; used only if the queue's

6660

++ *                         IO_bound has been cleared.

6661

++ * @pid: pid of the process owning the queue, used for logging purposes.

6662

++ * @last_wr_start_finish: start time of the current weight-raising period if

6663

++ *                        the @bfq_queue is being weight-raised, otherwise

6664

++ *                        finish time of the last weight-raising period

6665

++ * @wr_cur_max_time: current max raising time for this queue

6666

++ * @soft_rt_next_start: minimum time instant such that, only if a new

6667

++ *                      request is enqueued after this time instant in an

6668

++ *                      idle @bfq_queue with no outstanding requests, then

6669

++ *                      the task associated with the queue is deemed as

6670

++ *                      soft real-time (see the comments to the function

6671

++ *                      bfq_bfqq_softrt_next_start())

6672

++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from

6673

++ *                      idle to backlogged

6674

++ * @service_from_backlogged: cumulative service received from the @bfq_queue

6675

++ *                           since the last transition from idle to

6676

++ *                           backlogged

6677

++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the

6678

++ *	 queue is shared

6679

++ *

6680

++ * A bfq_queue is a leaf request queue; it can be associated with an

6681

++ * io_context or more, if it  is  async or shared  between  cooperating

6682

++ * processes. @cgroup holds a reference to the cgroup, to be sure that it

6683

++ * does not disappear while a bfqq still references it (mostly to avoid

6684

++ * races between request issuing and task migration followed by cgroup

6685

++ * destruction).

6686

++ * All the fields are protected by the queue lock of the containing bfqd.

6687

++ */

6688

++struct bfq_queue {

6689

++	atomic_t ref;

6690

++	struct bfq_data *bfqd;

6691

++

6692

++	unsigned short ioprio, new_ioprio;

6693

++	unsigned short ioprio_class, new_ioprio_class;

6694

++

6695

++	/* fields for cooperating queues handling */

6696

++	struct bfq_queue *new_bfqq;

6697

++	struct rb_node pos_node;

6698

++	struct rb_root *pos_root;

6699

++

6700

++	struct rb_root sort_list;

6701

++	struct request *next_rq;

6702

++	int queued[2];

6703

++	int allocated[2];

6704

++	int meta_pending;

6705

++	struct list_head fifo;

6706

++

6707

++	struct bfq_entity entity;

6708

++

6709

++	int max_budget;

6710

++	unsigned long budget_timeout;

6711

++

6712

++	int dispatched;

6713

++

6714

++	unsigned int flags;

6715

++

6716

++	struct list_head bfqq_list;

6717

++

6718

++	struct hlist_node burst_list_node;

6719

++

6720

++	unsigned int seek_samples;

6721

++	u64 seek_total;

6722

++	sector_t seek_mean;

6723

++	sector_t last_request_pos;

6724

++

6725

++	unsigned int requests_within_timer;

6726

++

6727

++	pid_t pid;

6728

++	struct bfq_io_cq *bic;

6729

++

6730

++	/* weight-raising fields */

6731

++	unsigned long wr_cur_max_time;

6732

++	unsigned long soft_rt_next_start;

6733

++	unsigned long last_wr_start_finish;

6734

++	unsigned int wr_coeff;

6735

++	unsigned long last_idle_bklogged;

6736

++	unsigned long service_from_backlogged;

6737

++};

6738

++

6739

++/**

6740

++ * struct bfq_ttime - per process thinktime stats.

6741

++ * @ttime_total: total process thinktime

6742

++ * @ttime_samples: number of thinktime samples

6743

++ * @ttime_mean: average process thinktime

6744

++ */

6745

++struct bfq_ttime {

6746

++	unsigned long last_end_request;

6747

++

6748

++	unsigned long ttime_total;

6749

++	unsigned long ttime_samples;

6750

++	unsigned long ttime_mean;

6751

++};

6752

++

6753

++/**

6754

++ * struct bfq_io_cq - per (request_queue, io_context) structure.

6755

++ * @icq: associated io_cq structure

6756

++ * @bfqq: array of two process queues, the sync and the async

6757

++ * @ttime: associated @bfq_ttime struct

6758

++ * @ioprio: per (request_queue, blkcg) ioprio.

6759

++ * @blkcg_id: id of the blkcg the related io_cq belongs to.

6760

++ */

6761

++struct bfq_io_cq {

6762

++	struct io_cq icq; /* must be the first member */

6763

++	struct bfq_queue *bfqq[2];

6764

++	struct bfq_ttime ttime;

6765

++	int ioprio;

6766

++

6767

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

6768

++	uint64_t blkcg_id; /* the current blkcg ID */

6769

++#endif

6770

++};

6771

++

6772

++enum bfq_device_speed {

6773

++	BFQ_BFQD_FAST,

6774

++	BFQ_BFQD_SLOW,

6775

++};

6776

++

6777

++/**

6778

++ * struct bfq_data - per device data structure.

6779

++ * @queue: request queue for the managed device.

6780

++ * @root_group: root bfq_group for the device.

6781

++ * @active_numerous_groups: number of bfq_groups containing more than one

6782

++ *                          active @bfq_entity.

6783

++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by

6784

++ *                      weight. Used to keep track of whether all @bfq_queues

6785

++ *                     have the same weight. The tree contains one counter

6786

++ *                     for each distinct weight associated to some active

6787

++ *                     and not weight-raised @bfq_queue (see the comments to

6788

++ *                      the functions bfq_weights_tree_[add|remove] for

6789

++ *                     further details).

6790

++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted

6791

++ *                      by weight. Used to keep track of whether all

6792

++ *                     @bfq_groups have the same weight. The tree contains

6793

++ *                     one counter for each distinct weight associated to

6794

++ *                     some active @bfq_group (see the comments to the

6795

++ *                     functions bfq_weights_tree_[add|remove] for further

6796

++ *                     details).

6797

++ * @busy_queues: number of bfq_queues containing requests (including the

6798

++ *		 queue in service, even if it is idling).

6799

++ * @busy_in_flight_queues: number of @bfq_queues containing pending or

6800

++ *                         in-flight requests, plus the @bfq_queue in

6801

++ *                         service, even if idle but waiting for the

6802

++ *                         possible arrival of its next sync request. This

6803

++ *                         field is updated only if the device is rotational,

6804

++ *                         but used only if the device is also NCQ-capable.

6805

++ *                         The reason why the field is updated also for non-

6806

++ *                         NCQ-capable rotational devices is related to the

6807

++ *                         fact that the value of @hw_tag may be set also

6808

++ *                         later than when busy_in_flight_queues may need to

6809

++ *                         be incremented for the first time(s). Taking also

6810

++ *                         this possibility into account, to avoid unbalanced

6811

++ *                         increments/decrements, would imply more overhead

6812

++ *                         than just updating busy_in_flight_queues

6813

++ *                         regardless of the value of @hw_tag.

6814

++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues

6815

++ *                                     (that is, seeky queues that expired

6816

++ *                                     for budget timeout at least once)

6817

++ *                                     containing pending or in-flight

6818

++ *                                     requests, including the in-service

6819

++ *                                     @bfq_queue if constantly seeky. This

6820

++ *                                     field is updated only if the device

6821

++ *                                     is rotational, but used only if the

6822

++ *                                     device is also NCQ-capable (see the

6823

++ *                                     comments to @busy_in_flight_queues).

6824

++ * @wr_busy_queues: number of weight-raised busy @bfq_queues.

6825

++ * @queued: number of queued requests.

6826

++ * @rq_in_driver: number of requests dispatched and waiting for completion.

6827

++ * @sync_flight: number of sync requests in the driver.

6828

++ * @max_rq_in_driver: max number of reqs in driver in the last

6829

++ *                    @hw_tag_samples completed requests.

6830

++ * @hw_tag_samples: nr of samples used to calculate hw_tag.

6831

++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.

6832

++ * @budgets_assigned: number of budgets assigned.

6833

++ * @idle_slice_timer: timer set when idling for the next sequential request

6834

++ *                    from the queue in service.

6835

++ * @unplug_work: delayed work to restart dispatching on the request queue.

6836

++ * @in_service_queue: bfq_queue in service.

6837

++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.

6838

++ * @last_position: on-disk position of the last served request.

6839

++ * @last_budget_start: beginning of the last budget.

6840

++ * @last_idling_start: beginning of the last idle slice.

6841

++ * @peak_rate: peak transfer rate observed for a budget.

6842

++ * @peak_rate_samples: number of samples used to calculate @peak_rate.

6843

++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before

6844

++ *                  rescheduling.

6845

++ * @active_list: list of all the bfq_queues active on the device.

6846

++ * @idle_list: list of all the bfq_queues idle on the device.

6847

++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires

6848

++ *                   requests are served in fifo order.

6849

++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.

6850

++ * @bfq_back_max: maximum allowed backward seek.

6851

++ * @bfq_slice_idle: maximum idling time.

6852

++ * @bfq_user_max_budget: user-configured max budget value

6853

++ *                       (0 for auto-tuning).

6854

++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to

6855

++ *                           async queues.

6856

++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to

6857

++ *               prevent seeky queues from imposing long latencies on well

6858

++ *               behaved ones (this also implies that seeky queues cannot

6859

++ *               receive guarantees in the service domain; after a timeout

6860

++ *               they are charged for the whole allocated budget, to try

6861

++ *               to preserve a behavior reasonably fair among them, but

6862

++ *               without service-domain guarantees).

6863

++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is

6864

++ *                   no more granted any weight-raising.

6865

++ * @bfq_failed_cooperations: number of consecutive failed cooperation

6866

++ *                           chances after which weight-raising is restored

6867

++ *                           to a queue subject to more than bfq_coop_thresh

6868

++ *                           queue merges.

6869

++ * @bfq_requests_within_timer: number of consecutive requests that must be

6870

++ *                             issued within the idle time slice to set

6871

++ *                             again idling to a queue which was marked as

6872

++ *                             non-I/O-bound (see the definition of the

6873

++ *                             IO_bound flag for further details).

6874

++ * @last_ins_in_burst: last time at which a queue entered the current

6875

++ *                     burst of queues being activated shortly after

6876

++ *                     each other; for more details about this and the

6877

++ *                     following parameters related to a burst of

6878

++ *                     activations, see the comments to the function

6879

++ *                     @bfq_handle_burst.

6880

++ * @bfq_burst_interval: reference time interval used to decide whether a

6881

++ *                      queue has been activated shortly after

6882

++ *                      @last_ins_in_burst.

6883

++ * @burst_size: number of queues in the current burst of queue activations.

6884

++ * @bfq_large_burst_thresh: maximum burst size above which the current

6885

++ * 			    queue-activation burst is deemed as 'large'.

6886

++ * @large_burst: true if a large queue-activation burst is in progress.

6887

++ * @burst_list: head of the burst list (as for the above fields, more details

6888

++ * 		in the comments to the function bfq_handle_burst).

6889

++ * @low_latency: if set to true, low-latency heuristics are enabled.

6890

++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised

6891

++ *                queue is multiplied.

6892

++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).

6893

++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.

6894

++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising

6895

++ *			  may be reactivated for a queue (in jiffies).

6896

++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals

6897

++ *				after which weight-raising may be

6898

++ *				reactivated for an already busy queue

6899

++ *				(in jiffies).

6900

++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,

6901

++ *			    sectors per second.

6902

++ * @RT_prod: cached value of the product R*T used for computing the maximum

6903

++ *	     duration of the weight raising automatically.

6904

++ * @device_speed: device-speed class for the low-latency heuristic.

6905

++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.

6906

++ *

6907

++ * All the fields are protected by the @queue lock.

6908

++ */

6909

++struct bfq_data {

6910

++	struct request_queue *queue;

6911

++

6912

++	struct bfq_group *root_group;

6913

++

6914

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

6915

++	int active_numerous_groups;

6916

++#endif

6917

++

6918

++	struct rb_root queue_weights_tree;

6919

++	struct rb_root group_weights_tree;

6920

++

6921

++	int busy_queues;

6922

++	int busy_in_flight_queues;

6923

++	int const_seeky_busy_in_flight_queues;

6924

++	int wr_busy_queues;

6925

++	int queued;

6926

++	int rq_in_driver;

6927

++	int sync_flight;

6928

++

6929

++	int max_rq_in_driver;

6930

++	int hw_tag_samples;

6931

++	int hw_tag;

6932

++

6933

++	int budgets_assigned;

6934

++

6935

++	struct timer_list idle_slice_timer;

6936

++	struct work_struct unplug_work;

6937

++

6938

++	struct bfq_queue *in_service_queue;

6939

++	struct bfq_io_cq *in_service_bic;

6940

++

6941

++	sector_t last_position;

6942

++

6943

++	ktime_t last_budget_start;

6944

++	ktime_t last_idling_start;

6945

++	int peak_rate_samples;

6946

++	u64 peak_rate;

6947

++	int bfq_max_budget;

6948

++

6949

++	struct list_head active_list;

6950

++	struct list_head idle_list;

6951

++

6952

++	unsigned int bfq_fifo_expire[2];

6953

++	unsigned int bfq_back_penalty;

6954

++	unsigned int bfq_back_max;

6955

++	unsigned int bfq_slice_idle;

6956

++	u64 bfq_class_idle_last_service;

6957

++

6958

++	int bfq_user_max_budget;

6959

++	int bfq_max_budget_async_rq;

6960

++	unsigned int bfq_timeout[2];

6961

++

6962

++	unsigned int bfq_coop_thresh;

6963

++	unsigned int bfq_failed_cooperations;

6964

++	unsigned int bfq_requests_within_timer;

6965

++

6966

++	unsigned long last_ins_in_burst;

6967

++	unsigned long bfq_burst_interval;

6968

++	int burst_size;

6969

++	unsigned long bfq_large_burst_thresh;

6970

++	bool large_burst;

6971

++	struct hlist_head burst_list;

6972

++

6973

++	bool low_latency;

6974

++

6975

++	/* parameters of the low_latency heuristics */

6976

++	unsigned int bfq_wr_coeff;

6977

++	unsigned int bfq_wr_max_time;

6978

++	unsigned int bfq_wr_rt_max_time;

6979

++	unsigned int bfq_wr_min_idle_time;

6980

++	unsigned long bfq_wr_min_inter_arr_async;

6981

++	unsigned int bfq_wr_max_softrt_rate;

6982

++	u64 RT_prod;

6983

++	enum bfq_device_speed device_speed;

6984

++

6985

++	struct bfq_queue oom_bfqq;

6986

++};

6987

++

6988

++enum bfqq_state_flags {

6989

++	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */

6990

++	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */

6991

++	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */

6992

++	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */

6993

++	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */

6994

++	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

6995

++	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

6996

++	BFQ_BFQQ_FLAG_IO_bound,		/*

6997

++					 * bfqq has timed-out at least once

6998

++					 * having consumed at most 2/10 of

6999

++					 * its budget

7000

++					 */

7001

++	BFQ_BFQQ_FLAG_in_large_burst,	/*

7002

++					 * bfqq activated in a large burst,

7003

++					 * see comments to bfq_handle_burst.

7004

++					 */

7005

++	BFQ_BFQQ_FLAG_constantly_seeky,	/*

7006

++					 * bfqq has proved to be slow and

7007

++					 * seeky until budget timeout

7008

++					 */

7009

++	BFQ_BFQQ_FLAG_softrt_update,	/*

7010

++					 * may need softrt-next-start

7011

++					 * update

7012

++					 */

7013

++};

7014

++

7015

++#define BFQ_BFQQ_FNS(name)						\

7016

++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\

7017

++{									\

7018

++	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\

7019

++}									\

7020

++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\

7021

++{									\

7022

++	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\

7023

++}									\

7024

++static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\

7025

++{									\

7026

++	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\

7027

++}

7028

++

7029

++BFQ_BFQQ_FNS(busy);

7030

++BFQ_BFQQ_FNS(wait_request);

7031

++BFQ_BFQQ_FNS(must_alloc);

7032

++BFQ_BFQQ_FNS(fifo_expire);

7033

++BFQ_BFQQ_FNS(idle_window);

7034

++BFQ_BFQQ_FNS(sync);

7035

++BFQ_BFQQ_FNS(budget_new);

7036

++BFQ_BFQQ_FNS(IO_bound);

7037

++BFQ_BFQQ_FNS(in_large_burst);

7038

++BFQ_BFQQ_FNS(constantly_seeky);

7039

++BFQ_BFQQ_FNS(softrt_update);

7040

++#undef BFQ_BFQQ_FNS

7041

++

7042

++/* Logging facilities. */

7043

++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \

7044

++	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)

7045

++

7046

++#define bfq_log(bfqd, fmt, args...) \

7047

++	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

7048

++

7049

++/* Expiration reasons. */

7050

++enum bfqq_expiration {

7051

++	BFQ_BFQQ_TOO_IDLE = 0,		/*

7052

++					 * queue has been idling for

7053

++					 * too long

7054

++					 */

7055

++	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */

7056

++	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */

7057

++	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */

7058

++};

7059

++

7060

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

7061

++

7062

++struct bfqg_stats {

7063

++	/* total bytes transferred */

7064

++	struct blkg_rwstat		service_bytes;

7065

++	/* total IOs serviced, post merge */

7066

++	struct blkg_rwstat		serviced;

7067

++	/* number of ios merged */

7068

++	struct blkg_rwstat		merged;

7069

++	/* total time spent on device in ns, may not be accurate w/ queueing */

7070

++	struct blkg_rwstat		service_time;

7071

++	/* total time spent waiting in scheduler queue in ns */

7072

++	struct blkg_rwstat		wait_time;

7073

++	/* number of IOs queued up */

7074

++	struct blkg_rwstat		queued;

7075

++	/* total sectors transferred */

7076

++	struct blkg_stat		sectors;

7077

++	/* total disk time and nr sectors dispatched by this group */

7078

++	struct blkg_stat		time;

7079

++	/* time not charged to this cgroup */

7080

++	struct blkg_stat		unaccounted_time;

7081

++	/* sum of number of ios queued across all samples */

7082

++	struct blkg_stat		avg_queue_size_sum;

7083

++	/* count of samples taken for average */

7084

++	struct blkg_stat		avg_queue_size_samples;

7085

++	/* how many times this group has been removed from service tree */

7086

++	struct blkg_stat		dequeue;

7087

++	/* total time spent waiting for it to be assigned a timeslice. */

7088

++	struct blkg_stat		group_wait_time;

7089

++	/* time spent idling for this blkcg_gq */

7090

++	struct blkg_stat		idle_time;

7091

++	/* total time with empty current active q with other requests queued */

7092

++	struct blkg_stat		empty_time;

7093

++	/* fields after this shouldn't be cleared on stat reset */

7094

++	uint64_t			start_group_wait_time;

7095

++	uint64_t			start_idle_time;

7096

++	uint64_t			start_empty_time;

7097

++	uint16_t			flags;

7098

++};

7099

++

7100

++/*

7101

++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.

7102

++ *

7103

++ * @pd: @blkcg_policy_data that this structure inherits

7104

++ * @weight: weight of the bfq_group

7105

++ */

7106

++struct bfq_group_data {

7107

++	/* must be the first member */

7108

++	struct blkcg_policy_data pd;

7109

++

7110

++	unsigned short weight;

7111

++};

7112

++

7113

++/**

7114

++ * struct bfq_group - per (device, cgroup) data structure.

7115

++ * @entity: schedulable entity to insert into the parent group sched_data.

7116

++ * @sched_data: own sched_data, to contain child entities (they may be

7117

++ *              both bfq_queues and bfq_groups).

7118

++ * @bfqd: the bfq_data for the device this group acts upon.

7119

++ * @async_bfqq: array of async queues for all the tasks belonging to

7120

++ *              the group, one queue per ioprio value per ioprio_class,

7121

++ *              except for the idle class that has only one queue.

7122

++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).

7123

++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used

7124

++ *             to avoid too many special cases during group creation/

7125

++ *             migration.

7126

++ * @active_entities: number of active entities belonging to the group;

7127

++ *                   unused for the root group. Used to know whether there

7128

++ *                   are groups with more than one active @bfq_entity

7129

++ *                   (see the comments to the function

7130

++ *                   bfq_bfqq_must_not_expire()).

7131

++ *

7132

++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup

7133

++ * there is a set of bfq_groups, each one collecting the lower-level

7134

++ * entities belonging to the group that are acting on the same device.

7135

++ *

7136

++ * Locking works as follows:

7137

++ *    o @bfqd is protected by the queue lock, RCU is used to access it

7138

++ *      from the readers.

7139

++ *    o All the other fields are protected by the @bfqd queue lock.

7140

++ */

7141

++struct bfq_group {

7142

++	/* must be the first member */

7143

++	struct blkg_policy_data pd;

7144

++

7145

++	struct bfq_entity entity;

7146

++	struct bfq_sched_data sched_data;

7147

++

7148

++	void *bfqd;

7149

++

7150

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

7151

++	struct bfq_queue *async_idle_bfqq;

7152

++

7153

++	struct bfq_entity *my_entity;

7154

++

7155

++	int active_entities;

7156

++

7157

++	struct bfqg_stats stats;

7158

++	struct bfqg_stats dead_stats;	/* stats pushed from dead children */

7159

++};

7160

++

7161

++#else

7162

++struct bfq_group {

7163

++	struct bfq_sched_data sched_data;

7164

++

7165

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

7166

++	struct bfq_queue *async_idle_bfqq;

7167

++};

7168

++#endif

7169

++

7170

++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

7171

++

7172

++static struct bfq_service_tree *

7173

++bfq_entity_service_tree(struct bfq_entity *entity)

7174

++{

7175

++	struct bfq_sched_data *sched_data = entity->sched_data;

7176

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

7177

++	unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :

7178

++				  BFQ_DEFAULT_GRP_CLASS;

7179

++

7180

++	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);

7181

++	BUG_ON(sched_data == NULL);

7182

++

7183

++	return sched_data->service_tree + idx;

7184

++}

7185

++

7186

++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)

7187

++{

7188

++	return bic->bfqq[is_sync];

7189

++}

7190

++

7191

++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,

7192

++			 bool is_sync)

7193

++{

7194

++	bic->bfqq[is_sync] = bfqq;

7195

++}

7196

++

7197

++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)

7198

++{

7199

++	return bic->icq.q->elevator->elevator_data;

7200

++}

7201

++

7202

++/**

7203

++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.

7204

++ * @ptr: a pointer to a bfqd.

7205

++ * @flags: storage for the flags to be saved.

7206

++ *

7207

++ * This function allows bfqg->bfqd to be protected by the

7208

++ * queue lock of the bfqd they reference; the pointer is dereferenced

7209

++ * under RCU, so the storage for bfqd is assured to be safe as long

7210

++ * as the RCU read side critical section does not end.  After the

7211

++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be

7212

++ * sure that no other writer accessed it.  If we raced with a writer,

7213

++ * the function returns NULL, with the queue unlocked, otherwise it

7214

++ * returns the dereferenced pointer, with the queue locked.

7215

++ */

7216

++static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags)

7217

++{

7218

++	struct bfq_data *bfqd;

7219

++

7220

++	rcu_read_lock();

7221

++	bfqd = rcu_dereference(*(struct bfq_data **)ptr);

7222

++

7223

++	if (bfqd != NULL) {

7224

++		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);

7225

++		if (ptr == NULL)

7226

++			printk(KERN_CRIT "get_bfqd_locked pointer NULL\n");

7227

++		else if (*ptr == bfqd)

7228

++			goto out;

7229

++		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

7230

++	}

7231

++

7232

++	bfqd = NULL;

7233

++out:

7234

++	rcu_read_unlock();

7235

++	return bfqd;

7236

++}

7237

++

7238

++static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)

7239

++{

7240

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

7241

++}

7242

++

7243

++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);

7244

++static void bfq_put_queue(struct bfq_queue *bfqq);

7245

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);

7246

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

7247

++				       struct bio *bio, int is_sync,

7248

++				       struct bfq_io_cq *bic, gfp_t gfp_mask);

7249

++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

7250

++				    struct bfq_group *bfqg);

7251

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);

7252

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);

7253

++

7254

++#endif /* _BFQ_H */

7255

+--

7256

+1.9.1

7257

+

7258

7259

diff --git a/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch

7260

new file mode 100644

7261

index 0000000..eb23acc

7262

--- /dev/null

7263

+++ b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch

7264

@@ -0,0 +1,1101 @@

7265

+From d93e55da4df8c5e7c33379780ad7d2fdb02e0568 Mon Sep 17 00:00:00 2001

7266

+From: Mauro Andreolini <mauro.andreolini@×××××××.it>

7267

+Date: Sun, 6 Sep 2015 16:09:05 +0200

7268

+Subject: [PATCH 3/4] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for

7269

+ 4.7.0

7270

+

7271

+A set of processes may happen  to  perform interleaved reads, i.e., requests

7272

+whose union would give rise to a  sequential read  pattern.  There are two

7273

+typical  cases: in the first  case,   processes  read  fixed-size chunks of

7274

+data at a fixed distance from each other, while in the second case processes

7275

+may read variable-size chunks at  variable distances. The latter case occurs

7276

+for  example with  QEMU, which  splits the  I/O generated  by the  guest into

7277

+multiple chunks,  and lets these chunks  be served by a  pool of cooperating

7278

+processes,  iteratively  assigning  the  next  chunk of  I/O  to  the first

7279

+available  process. CFQ  uses actual  queue merging  for the  first type of

7280

+processes, whereas it  uses preemption to get a sequential  read pattern out

7281

+of the read requests  performed by the second type of  processes. In the end

7282

+it uses  two different  mechanisms to  achieve the  same goal: boosting the

7283

+throughput with interleaved I/O.

7284

+

7285

+This patch introduces  Early Queue Merge (EQM), a unified mechanism to get a

7286

+sequential  read pattern  with both  types of  processes. The  main idea is

7287

+checking newly arrived requests against the next request of the active queue

7288

+both in case of actual request insert and in case of request merge. By doing

7289

+so, both the types of processes can be handled by just merging their queues.

7290

+EQM is  then simpler and  more compact than the  pair of mechanisms used in

7291

+CFQ.

7292

+

7293

+Finally, EQM  also preserves the  typical low-latency properties of BFQ, by

7294

+properly restoring the weight-raising state of a queue when it gets back to

7295

+a non-merged state.

7296

+

7297

+Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>

7298

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

7299

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

7300

+Signed-off-by: Linus Walleij <linus.walleij@××××××.org>

7301

+---

7302

+ block/bfq-cgroup.c  |   4 +

7303

+ block/bfq-iosched.c | 687 ++++++++++++++++++++++++++++++++++++++++++++++++++--

7304

+ block/bfq.h         |  66 +++++

7305

+ 3 files changed, 743 insertions(+), 14 deletions(-)

7306

+

7307

+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

7308

+index 8610cd6..5ee99ec 100644

7309

+--- a/block/bfq-cgroup.c

7310

++++ b/block/bfq-cgroup.c

7311

+@@ -437,6 +437,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)

7312

+ 				   */

7313

+ 	bfqg->bfqd = bfqd;

7314

+ 	bfqg->active_entities = 0;

7315

++	bfqg->rq_pos_tree = RB_ROOT;

7316

+ }

7317

+

7318

+ static void bfq_pd_free(struct blkg_policy_data *pd)

7319

+@@ -530,6 +531,8 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

7320

+ 	return bfqg;

7321

+ }

7322

+

7323

++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);

7324

++

7325

+ /**

7326

+  * bfq_bfqq_move - migrate @bfqq to @bfqg.

7327

+  * @bfqd: queue descriptor.

7328

+@@ -577,6 +580,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

7329

+ 	bfqg_get(bfqg);

7330

+

7331

+ 	if (busy) {

7332

++		bfq_pos_tree_add_move(bfqd, bfqq);

7333

+ 		if (resume)

7334

+ 			bfq_activate_bfqq(bfqd, bfqq);

7335

+ 	}

7336

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

7337

+index f9787a6..d1f648d 100644

7338

+--- a/block/bfq-iosched.c

7339

++++ b/block/bfq-iosched.c

7340

+@@ -296,6 +296,72 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,

7341

+ 	}

7342

+ }

7343

+

7344

++static struct bfq_queue *

7345

++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,

7346

++		     sector_t sector, struct rb_node **ret_parent,

7347

++		     struct rb_node ***rb_link)

7348

++{

7349

++	struct rb_node **p, *parent;

7350

++	struct bfq_queue *bfqq = NULL;

7351

++

7352

++	parent = NULL;

7353

++	p = &root->rb_node;

7354

++	while (*p) {

7355

++		struct rb_node **n;

7356

++

7357

++		parent = *p;

7358

++		bfqq = rb_entry(parent, struct bfq_queue, pos_node);

7359

++

7360

++		/*

7361

++		 * Sort strictly based on sector. Smallest to the left,

7362

++		 * largest to the right.

7363

++		 */

7364

++		if (sector > blk_rq_pos(bfqq->next_rq))

7365

++			n = &(*p)->rb_right;

7366

++		else if (sector < blk_rq_pos(bfqq->next_rq))

7367

++			n = &(*p)->rb_left;

7368

++		else

7369

++			break;

7370

++		p = n;

7371

++		bfqq = NULL;

7372

++	}

7373

++

7374

++	*ret_parent = parent;

7375

++	if (rb_link)

7376

++		*rb_link = p;

7377

++

7378

++	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",

7379

++		(long long unsigned)sector,

7380

++		bfqq ? bfqq->pid : 0);

7381

++

7382

++	return bfqq;

7383

++}

7384

++

7385

++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7386

++{

7387

++	struct rb_node **p, *parent;

7388

++	struct bfq_queue *__bfqq;

7389

++

7390

++	if (bfqq->pos_root) {

7391

++		rb_erase(&bfqq->pos_node, bfqq->pos_root);

7392

++		bfqq->pos_root = NULL;

7393

++	}

7394

++

7395

++	if (bfq_class_idle(bfqq))

7396

++		return;

7397

++	if (!bfqq->next_rq)

7398

++		return;

7399

++

7400

++	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;

7401

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,

7402

++			blk_rq_pos(bfqq->next_rq), &parent, &p);

7403

++	if (!__bfqq) {

7404

++		rb_link_node(&bfqq->pos_node, parent, p);

7405

++		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);

7406

++	} else

7407

++		bfqq->pos_root = NULL;

7408

++}

7409

++

7410

+ /*

7411

+  * Tell whether there are active queues or groups with differentiated weights.

7412

+  */

7413

+@@ -528,6 +594,57 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)

7414

+ 	return dur;

7415

+ }

7416

+

7417

++static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq)

7418

++{

7419

++	return bfqq->bic ? bfqq->bic->cooperations : 0;

7420

++}

7421

++

7422

++static void

7423

++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

7424

++{

7425

++	if (bic->saved_idle_window)

7426

++		bfq_mark_bfqq_idle_window(bfqq);

7427

++	else

7428

++		bfq_clear_bfqq_idle_window(bfqq);

7429

++	if (bic->saved_IO_bound)

7430

++		bfq_mark_bfqq_IO_bound(bfqq);

7431

++	else

7432

++		bfq_clear_bfqq_IO_bound(bfqq);

7433

++	/* Assuming that the flag in_large_burst is already correctly set */

7434

++	if (bic->wr_time_left && bfqq->bfqd->low_latency &&

7435

++	    !bfq_bfqq_in_large_burst(bfqq) &&

7436

++	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {

7437

++		/*

7438

++		 * Start a weight raising period with the duration given by

7439

++		 * the wr_time_left snapshot.

7440

++		 */

7441

++		if (bfq_bfqq_busy(bfqq))

7442

++			bfqq->bfqd->wr_busy_queues++;

7443

++		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;

7444

++		bfqq->wr_cur_max_time = bic->wr_time_left;

7445

++		bfqq->last_wr_start_finish = jiffies;

7446

++		bfqq->entity.prio_changed = 1;

7447

++	}

7448

++	/*

7449

++	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from

7450

++	 * getting confused about the queue's need of a weight-raising

7451

++	 * period.

7452

++	 */

7453

++	bic->wr_time_left = 0;

7454

++}

7455

++

7456

++static int bfqq_process_refs(struct bfq_queue *bfqq)

7457

++{

7458

++	int process_refs, io_refs;

7459

++

7460

++	lockdep_assert_held(bfqq->bfqd->queue->queue_lock);

7461

++

7462

++	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

7463

++	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

7464

++	BUG_ON(process_refs < 0);

7465

++	return process_refs;

7466

++}

7467

++

7468

+ /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */

7469

+ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7470

+ {

7471

+@@ -764,8 +881,14 @@ static void bfq_add_request(struct request *rq)

7472

+ 	BUG_ON(!next_rq);

7473

+ 	bfqq->next_rq = next_rq;

7474

+

7475

++	/*

7476

++	 * Adjust priority tree position, if next_rq changes.

7477

++	 */

7478

++	if (prev != bfqq->next_rq)

7479

++		bfq_pos_tree_add_move(bfqd, bfqq);

7480

++

7481

+ 	if (!bfq_bfqq_busy(bfqq)) {

7482

+-		bool soft_rt, in_burst,

7483

++		bool soft_rt, coop_or_in_burst,

7484

+ 		     idle_for_long_time = time_is_before_jiffies(

7485

+ 						bfqq->budget_timeout +

7486

+ 						bfqd->bfq_wr_min_idle_time);

7487

+@@ -793,11 +916,12 @@ static void bfq_add_request(struct request *rq)

7488

+ 				bfqd->last_ins_in_burst = jiffies;

7489

+ 		}

7490

+

7491

+-		in_burst = bfq_bfqq_in_large_burst(bfqq);

7492

++		coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||

7493

++			bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;

7494

+ 		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

7495

+-			!in_burst &&

7496

++			!coop_or_in_burst &&

7497

+ 			time_is_before_jiffies(bfqq->soft_rt_next_start);

7498

+-		interactive = !in_burst && idle_for_long_time;

7499

++		interactive = !coop_or_in_burst && idle_for_long_time;

7500

+ 		entity->budget = max_t(unsigned long, bfqq->max_budget,

7501

+ 				       bfq_serv_to_charge(next_rq, bfqq));

7502

+

7503

+@@ -816,6 +940,9 @@ static void bfq_add_request(struct request *rq)

7504

+ 		if (!bfqd->low_latency)

7505

+ 			goto add_bfqq_busy;

7506

+

7507

++		if (bfq_bfqq_just_split(bfqq))

7508

++			goto set_prio_changed;

7509

++

7510

+ 		/*

7511

+ 		 * If the queue:

7512

+ 		 * - is not being boosted,

7513

+@@ -840,7 +967,7 @@ static void bfq_add_request(struct request *rq)

7514

+ 		} else if (old_wr_coeff > 1) {

7515

+ 			if (interactive)

7516

+ 				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

7517

+-			else if (in_burst ||

7518

++			else if (coop_or_in_burst ||

7519

+ 				 (bfqq->wr_cur_max_time ==

7520

+ 				  bfqd->bfq_wr_rt_max_time &&

7521

+ 				  !soft_rt)) {

7522

+@@ -905,6 +1032,7 @@ static void bfq_add_request(struct request *rq)

7523

+ 					bfqd->bfq_wr_rt_max_time;

7524

+ 			}

7525

+ 		}

7526

++set_prio_changed:

7527

+ 		if (old_wr_coeff != bfqq->wr_coeff)

7528

+ 			entity->prio_changed = 1;

7529

+ add_bfqq_busy:

7530

+@@ -1047,6 +1175,15 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,

7531

+ 					 bfqd->last_position);

7532

+ 		BUG_ON(!next_rq);

7533

+ 		bfqq->next_rq = next_rq;

7534

++		/*

7535

++		 * If next_rq changes, update both the queue's budget to

7536

++		 * fit the new request and the queue's position in its

7537

++		 * rq_pos_tree.

7538

++		 */

7539

++		if (prev != bfqq->next_rq) {

7540

++			bfq_updated_next_req(bfqd, bfqq);

7541

++			bfq_pos_tree_add_move(bfqd, bfqq);

7542

++		}

7543

+ 	}

7544

+ }

7545

+

7546

+@@ -1129,11 +1266,346 @@ static void bfq_end_wr(struct bfq_data *bfqd)

7547

+ 	spin_unlock_irq(bfqd->queue->queue_lock);

7548

+ }

7549

+

7550

++static sector_t bfq_io_struct_pos(void *io_struct, bool request)

7551

++{

7552

++	if (request)

7553

++		return blk_rq_pos(io_struct);

7554

++	else

7555

++		return ((struct bio *)io_struct)->bi_iter.bi_sector;

7556

++}

7557

++

7558

++static int bfq_rq_close_to_sector(void *io_struct, bool request,

7559

++				  sector_t sector)

7560

++{

7561

++	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=

7562

++	       BFQQ_SEEK_THR;

7563

++}

7564

++

7565

++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,

7566

++					 struct bfq_queue *bfqq,

7567

++					 sector_t sector)

7568

++{

7569

++	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;

7570

++	struct rb_node *parent, *node;

7571

++	struct bfq_queue *__bfqq;

7572

++

7573

++	if (RB_EMPTY_ROOT(root))

7574

++		return NULL;

7575

++

7576

++	/*

7577

++	 * First, if we find a request starting at the end of the last

7578

++	 * request, choose it.

7579

++	 */

7580

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);

7581

++	if (__bfqq)

7582

++		return __bfqq;

7583

++

7584

++	/*

7585

++	 * If the exact sector wasn't found, the parent of the NULL leaf

7586

++	 * will contain the closest sector (rq_pos_tree sorted by

7587

++	 * next_request position).

7588

++	 */

7589

++	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);

7590

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

7591

++		return __bfqq;

7592

++

7593

++	if (blk_rq_pos(__bfqq->next_rq) < sector)

7594

++		node = rb_next(&__bfqq->pos_node);

7595

++	else

7596

++		node = rb_prev(&__bfqq->pos_node);

7597

++	if (!node)

7598

++		return NULL;

7599

++

7600

++	__bfqq = rb_entry(node, struct bfq_queue, pos_node);

7601

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

7602

++		return __bfqq;

7603

++

7604

++	return NULL;

7605

++}

7606

++

7607

++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,

7608

++						   struct bfq_queue *cur_bfqq,

7609

++						   sector_t sector)

7610

++{

7611

++	struct bfq_queue *bfqq;

7612

++

7613

++	/*

7614

++	 * We shall notice if some of the queues are cooperating,

7615

++	 * e.g., working closely on the same area of the device. In

7616

++	 * that case, we can group them together and: 1) don't waste

7617

++	 * time idling, and 2) serve the union of their requests in

7618

++	 * the best possible order for throughput.

7619

++	 */

7620

++	bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);

7621

++	if (!bfqq || bfqq == cur_bfqq)

7622

++		return NULL;

7623

++

7624

++	return bfqq;

7625

++}

7626

++

7627

++static struct bfq_queue *

7628

++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7629

++{

7630

++	int process_refs, new_process_refs;

7631

++	struct bfq_queue *__bfqq;

7632

++

7633

++	/*

7634

++	 * If there are no process references on the new_bfqq, then it is

7635

++	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

7636

++	 * may have dropped their last reference (not just their last process

7637

++	 * reference).

7638

++	 */

7639

++	if (!bfqq_process_refs(new_bfqq))

7640

++		return NULL;

7641

++

7642

++	/* Avoid a circular list and skip interim queue merges. */

7643

++	while ((__bfqq = new_bfqq->new_bfqq)) {

7644

++		if (__bfqq == bfqq)

7645

++			return NULL;

7646

++		new_bfqq = __bfqq;

7647

++	}

7648

++

7649

++	process_refs = bfqq_process_refs(bfqq);

7650

++	new_process_refs = bfqq_process_refs(new_bfqq);

7651

++	/*

7652

++	 * If the process for the bfqq has gone away, there is no

7653

++	 * sense in merging the queues.

7654

++	 */

7655

++	if (process_refs == 0 || new_process_refs == 0)

7656

++		return NULL;

7657

++

7658

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

7659

++		new_bfqq->pid);

7660

++

7661

++	/*

7662

++	 * Merging is just a redirection: the requests of the process

7663

++	 * owning one of the two queues are redirected to the other queue.

7664

++	 * The latter queue, in its turn, is set as shared if this is the

7665

++	 * first time that the requests of some process are redirected to

7666

++	 * it.

7667

++	 *

7668

++	 * We redirect bfqq to new_bfqq and not the opposite, because we

7669

++	 * are in the context of the process owning bfqq, hence we have

7670

++	 * the io_cq of this process. So we can immediately configure this

7671

++	 * io_cq to redirect the requests of the process to new_bfqq.

7672

++	 *

7673

++	 * NOTE, even if new_bfqq coincides with the in-service queue, the

7674

++	 * io_cq of new_bfqq is not available, because, if the in-service

7675

++	 * queue is shared, bfqd->in_service_bic may not point to the

7676

++	 * io_cq of the in-service queue.

7677

++	 * Redirecting the requests of the process owning bfqq to the

7678

++	 * currently in-service queue is in any case the best option, as

7679

++	 * we feed the in-service queue with new requests close to the

7680

++	 * last request served and, by doing so, hopefully increase the

7681

++	 * throughput.

7682

++	 */

7683

++	bfqq->new_bfqq = new_bfqq;

7684

++	atomic_add(process_refs, &new_bfqq->ref);

7685

++	return new_bfqq;

7686

++}

7687

++

7688

++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,

7689

++					struct bfq_queue *new_bfqq)

7690

++{

7691

++	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||

7692

++	    (bfqq->ioprio_class != new_bfqq->ioprio_class))

7693

++		return false;

7694

++

7695

++	/*

7696

++	 * If either of the queues has already been detected as seeky,

7697

++	 * then merging it with the other queue is unlikely to lead to

7698

++	 * sequential I/O.

7699

++	 */

7700

++	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))

7701

++		return false;

7702

++

7703

++	/*

7704

++	 * Interleaved I/O is known to be done by (some) applications

7705

++	 * only for reads, so it does not make sense to merge async

7706

++	 * queues.

7707

++	 */

7708

++	if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))

7709

++		return false;

7710

++

7711

++	return true;

7712

++}

7713

++

7714

++/*

7715

++ * Attempt to schedule a merge of bfqq with the currently in-service queue

7716

++ * or with a close queue among the scheduled queues.

7717

++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue

7718

++ * structure otherwise.

7719

++ *

7720

++ * The OOM queue is not allowed to participate to cooperation: in fact, since

7721

++ * the requests temporarily redirected to the OOM queue could be redirected

7722

++ * again to dedicated queues at any time, the state needed to correctly

7723

++ * handle merging with the OOM queue would be quite complex and expensive

7724

++ * to maintain. Besides, in such a critical condition as an out of memory,

7725

++ * the benefits of queue merging may be little relevant, or even negligible.

7726

++ */

7727

++static struct bfq_queue *

7728

++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,

7729

++		     void *io_struct, bool request)

7730

++{

7731

++	struct bfq_queue *in_service_bfqq, *new_bfqq;

7732

++

7733

++	if (bfqq->new_bfqq)

7734

++		return bfqq->new_bfqq;

7735

++	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))

7736

++		return NULL;

7737

++	/* If device has only one backlogged bfq_queue, don't search. */

7738

++	if (bfqd->busy_queues == 1)

7739

++		return NULL;

7740

++

7741

++	in_service_bfqq = bfqd->in_service_queue;

7742

++

7743

++	if (!in_service_bfqq || in_service_bfqq == bfqq ||

7744

++	    !bfqd->in_service_bic ||

7745

++	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))

7746

++		goto check_scheduled;

7747

++

7748

++	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&

7749

++	    bfqq->entity.parent == in_service_bfqq->entity.parent &&

7750

++	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {

7751

++		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);

7752

++		if (new_bfqq)

7753

++			return new_bfqq;

7754

++	}

7755

++	/*

7756

++	 * Check whether there is a cooperator among currently scheduled

7757

++	 * queues. The only thing we need is that the bio/request is not

7758

++	 * NULL, as we need it to establish whether a cooperator exists.

7759

++	 */

7760

++check_scheduled:

7761

++	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,

7762

++			bfq_io_struct_pos(io_struct, request));

7763

++

7764

++	BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);

7765

++

7766

++	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&

7767

++	    bfq_may_be_close_cooperator(bfqq, new_bfqq))

7768

++		return bfq_setup_merge(bfqq, new_bfqq);

7769

++

7770

++	return NULL;

7771

++}

7772

++

7773

++static void bfq_bfqq_save_state(struct bfq_queue *bfqq)

7774

++{

7775

++	/*

7776

++	 * If !bfqq->bic, the queue is already shared or its requests

7777

++	 * have already been redirected to a shared queue; both idle window

7778

++	 * and weight raising state have already been saved. Do nothing.

7779

++	 */

7780

++	if (!bfqq->bic)

7781

++		return;

7782

++	if (bfqq->bic->wr_time_left)

7783

++		/*

7784

++		 * This is the queue of a just-started process, and would

7785

++		 * deserve weight raising: we set wr_time_left to the full

7786

++		 * weight-raising duration to trigger weight-raising when

7787

++		 * and if the queue is split and the first request of the

7788

++		 * queue is enqueued.

7789

++		 */

7790

++		bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);

7791

++	else if (bfqq->wr_coeff > 1) {

7792

++		unsigned long wr_duration =

7793

++			jiffies - bfqq->last_wr_start_finish;

7794

++		/*

7795

++		 * It may happen that a queue's weight raising period lasts

7796

++		 * longer than its wr_cur_max_time, as weight raising is

7797

++		 * handled only when a request is enqueued or dispatched (it

7798

++		 * does not use any timer). If the weight raising period is

7799

++		 * about to end, don't save it.

7800

++		 */

7801

++		if (bfqq->wr_cur_max_time <= wr_duration)

7802

++			bfqq->bic->wr_time_left = 0;

7803

++		else

7804

++			bfqq->bic->wr_time_left =

7805

++				bfqq->wr_cur_max_time - wr_duration;

7806

++		/*

7807

++		 * The bfq_queue is becoming shared or the requests of the

7808

++		 * process owning the queue are being redirected to a shared

7809

++		 * queue. Stop the weight raising period of the queue, as in

7810

++		 * both cases it should not be owned by an interactive or

7811

++		 * soft real-time application.

7812

++		 */

7813

++		bfq_bfqq_end_wr(bfqq);

7814

++	} else

7815

++		bfqq->bic->wr_time_left = 0;

7816

++	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);

7817

++	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);

7818

++	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);

7819

++	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);

7820

++	bfqq->bic->cooperations++;

7821

++	bfqq->bic->failed_cooperations = 0;

7822

++}

7823

++

7824

++static void bfq_get_bic_reference(struct bfq_queue *bfqq)

7825

++{

7826

++	/*

7827

++	 * If bfqq->bic has a non-NULL value, the bic to which it belongs

7828

++	 * is about to begin using a shared bfq_queue.

7829

++	 */

7830

++	if (bfqq->bic)

7831

++		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);

7832

++}

7833

++

7834

++static void

7835

++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

7836

++		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7837

++{

7838

++	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

7839

++		(long unsigned)new_bfqq->pid);

7840

++	/* Save weight raising and idle window of the merged queues */

7841

++	bfq_bfqq_save_state(bfqq);

7842

++	bfq_bfqq_save_state(new_bfqq);

7843

++	if (bfq_bfqq_IO_bound(bfqq))

7844

++		bfq_mark_bfqq_IO_bound(new_bfqq);

7845

++	bfq_clear_bfqq_IO_bound(bfqq);

7846

++	/*

7847

++	 * Grab a reference to the bic, to prevent it from being destroyed

7848

++	 * before being possibly touched by a bfq_split_bfqq().

7849

++	 */

7850

++	bfq_get_bic_reference(bfqq);

7851

++	bfq_get_bic_reference(new_bfqq);

7852

++	/*

7853

++	 * Merge queues (that is, let bic redirect its requests to new_bfqq)

7854

++	 */

7855

++	bic_set_bfqq(bic, new_bfqq, 1);

7856

++	bfq_mark_bfqq_coop(new_bfqq);

7857

++	/*

7858

++	 * new_bfqq now belongs to at least two bics (it is a shared queue):

7859

++	 * set new_bfqq->bic to NULL. bfqq either:

7860

++	 * - does not belong to any bic any more, and hence bfqq->bic must

7861

++	 *   be set to NULL, or

7862

++	 * - is a queue whose owning bics have already been redirected to a

7863

++	 *   different queue, hence the queue is destined to not belong to

7864

++	 *   any bic soon and bfqq->bic is already NULL (therefore the next

7865

++	 *   assignment causes no harm).

7866

++	 */

7867

++	new_bfqq->bic = NULL;

7868

++	bfqq->bic = NULL;

7869

++	bfq_put_queue(bfqq);

7870

++}

7871

++

7872

++static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)

7873

++{

7874

++	struct bfq_io_cq *bic = bfqq->bic;

7875

++	struct bfq_data *bfqd = bfqq->bfqd;

7876

++

7877

++	if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {

7878

++		bic->failed_cooperations++;

7879

++		if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)

7880

++			bic->cooperations = 0;

7881

++	}

7882

++}

7883

++

7884

+ static int bfq_allow_merge(struct request_queue *q, struct request *rq,

7885

+ 			   struct bio *bio)

7886

+ {

7887

+ 	struct bfq_data *bfqd = q->elevator->elevator_data;

7888

+ 	struct bfq_io_cq *bic;

7889

++	struct bfq_queue *bfqq, *new_bfqq;

7890

+

7891

+ 	/*

7892

+ 	 * Disallow merge of a sync bio into an async request.

7893

+@@ -1150,7 +1622,26 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,

7894

+ 	if (!bic)

7895

+ 		return 0;

7896

+

7897

+-	return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq);

7898

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

7899

++	/*

7900

++	 * We take advantage of this function to perform an early merge

7901

++	 * of the queues of possible cooperating processes.

7902

++	 */

7903

++	if (bfqq) {

7904

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);

7905

++		if (new_bfqq) {

7906

++			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);

7907

++			/*

7908

++			 * If we get here, the bio will be queued in the

7909

++			 * shared queue, i.e., new_bfqq, so use new_bfqq

7910

++			 * to decide whether bio and rq can be merged.

7911

++			 */

7912

++			bfqq = new_bfqq;

7913

++		} else

7914

++			bfq_bfqq_increase_failed_cooperations(bfqq);

7915

++	}

7916

++

7917

++	return bfqq == RQ_BFQQ(rq);

7918

+ }

7919

+

7920

+ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

7921

+@@ -1349,6 +1840,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7922

+

7923

+ 	__bfq_bfqd_reset_in_service(bfqd);

7924

+

7925

++	/*

7926

++	 * If this bfqq is shared between multiple processes, check

7927

++	 * to make sure that those processes are still issuing I/Os

7928

++	 * within the mean seek distance. If not, it may be time to

7929

++	 * break the queues apart again.

7930

++	 */

7931

++	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))

7932

++		bfq_mark_bfqq_split_coop(bfqq);

7933

++

7934

+ 	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

7935

+ 		/*

7936

+ 		 * Overloading budget_timeout field to store the time

7937

+@@ -1357,8 +1857,13 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7938

+ 		 */

7939

+ 		bfqq->budget_timeout = jiffies;

7940

+ 		bfq_del_bfqq_busy(bfqd, bfqq, 1);

7941

+-	} else

7942

++	} else {

7943

+ 		bfq_activate_bfqq(bfqd, bfqq);

7944

++		/*

7945

++		 * Resort priority tree of potential close cooperators.

7946

++		 */

7947

++		bfq_pos_tree_add_move(bfqd, bfqq);

7948

++	}

7949

+ }

7950

+

7951

+ /**

7952

+@@ -2242,10 +2747,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7953

+ 		/*

7954

+ 		 * If the queue was activated in a burst, or

7955

+ 		 * too much time has elapsed from the beginning

7956

+-		 * of this weight-raising period, then end weight

7957

+-		 * raising.

7958

++		 * of this weight-raising period, or the queue has

7959

++		 * exceeded the acceptable number of cooperations,

7960

++		 * then end weight raising.

7961

+ 		 */

7962

+ 		if (bfq_bfqq_in_large_burst(bfqq) ||

7963

++		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||

7964

+ 		    time_is_before_jiffies(bfqq->last_wr_start_finish +

7965

+ 					   bfqq->wr_cur_max_time)) {

7966

+ 			bfqq->last_wr_start_finish = jiffies;

7967

+@@ -2474,6 +2981,25 @@ static void bfq_put_queue(struct bfq_queue *bfqq)

7968

+ #endif

7969

+ }

7970

+

7971

++static void bfq_put_cooperator(struct bfq_queue *bfqq)

7972

++{

7973

++	struct bfq_queue *__bfqq, *next;

7974

++

7975

++	/*

7976

++	 * If this queue was scheduled to merge with another queue, be

7977

++	 * sure to drop the reference taken on that queue (and others in

7978

++	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.

7979

++	 */

7980

++	__bfqq = bfqq->new_bfqq;

7981

++	while (__bfqq) {

7982

++		if (__bfqq == bfqq)

7983

++			break;

7984

++		next = __bfqq->new_bfqq;

7985

++		bfq_put_queue(__bfqq);

7986

++		__bfqq = next;

7987

++	}

7988

++}

7989

++

7990

+ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7991

+ {

7992

+ 	if (bfqq == bfqd->in_service_queue) {

7993

+@@ -2484,6 +3010,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7994

+ 	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,

7995

+ 		     atomic_read(&bfqq->ref));

7996

+

7997

++	bfq_put_cooperator(bfqq);

7998

++

7999

+ 	bfq_put_queue(bfqq);

8000

+ }

8001

+

8002

+@@ -2492,6 +3020,25 @@ static void bfq_init_icq(struct io_cq *icq)

8003

+ 	struct bfq_io_cq *bic = icq_to_bic(icq);

8004

+

8005

+ 	bic->ttime.last_end_request = jiffies;

8006

++	/*

8007

++	 * A newly created bic indicates that the process has just

8008

++	 * started doing I/O, and is probably mapping into memory its

8009

++	 * executable and libraries: it definitely needs weight raising.

8010

++	 * There is however the possibility that the process performs,

8011

++	 * for a while, I/O close to some other process. EQM intercepts

8012

++	 * this behavior and may merge the queue corresponding to the

8013

++	 * process  with some other queue, BEFORE the weight of the queue

8014

++	 * is raised. Merged queues are not weight-raised (they are assumed

8015

++	 * to belong to processes that benefit only from high throughput).

8016

++	 * If the merge is basically the consequence of an accident, then

8017

++	 * the queue will be split soon and will get back its old weight.

8018

++	 * It is then important to write down somewhere that this queue

8019

++	 * does need weight raising, even if it did not make it to get its

8020

++	 * weight raised before being merged. To this purpose, we overload

8021

++	 * the field wr_time_left and assign 1 to it, to mark the queue

8022

++	 * as needing weight raising.

8023

++	 */

8024

++	bic->wr_time_left = 1;

8025

+ }

8026

+

8027

+ static void bfq_exit_icq(struct io_cq *icq)

8028

+@@ -2505,6 +3052,13 @@ static void bfq_exit_icq(struct io_cq *icq)

8029

+ 	}

8030

+

8031

+ 	if (bic->bfqq[BLK_RW_SYNC]) {

8032

++		/*

8033

++		 * If the bic is using a shared queue, put the reference

8034

++		 * taken on the io_context when the bic started using a

8035

++		 * shared bfq_queue.

8036

++		 */

8037

++		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))

8038

++			put_io_context(icq->ioc);

8039

+ 		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

8040

+ 		bic->bfqq[BLK_RW_SYNC] = NULL;

8041

+ 	}

8042

+@@ -2809,6 +3363,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,

8043

+ 	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

8044

+ 		return;

8045

+

8046

++	/* Idle window just restored, statistics are meaningless. */

8047

++	if (bfq_bfqq_just_split(bfqq))

8048

++		return;

8049

++

8050

+ 	enable_idle = bfq_bfqq_idle_window(bfqq);

8051

+

8052

+ 	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

8053

+@@ -2856,6 +3414,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8054

+ 	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

8055

+ 	    !BFQQ_SEEKY(bfqq))

8056

+ 		bfq_update_idle_window(bfqd, bfqq, bic);

8057

++	bfq_clear_bfqq_just_split(bfqq);

8058

+

8059

+ 	bfq_log_bfqq(bfqd, bfqq,

8060

+ 		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

8061

+@@ -2920,12 +3479,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8062

+ static void bfq_insert_request(struct request_queue *q, struct request *rq)

8063

+ {

8064

+ 	struct bfq_data *bfqd = q->elevator->elevator_data;

8065

+-	struct bfq_queue *bfqq = RQ_BFQQ(rq);

8066

++	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;

8067

+

8068

+ 	assert_spin_locked(bfqd->queue->queue_lock);

8069

+

8070

++	/*

8071

++	 * An unplug may trigger a requeue of a request from the device

8072

++	 * driver: make sure we are in process context while trying to

8073

++	 * merge two bfq_queues.

8074

++	 */

8075

++	if (!in_interrupt()) {

8076

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

8077

++		if (new_bfqq) {

8078

++			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)

8079

++				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);

8080

++			/*

8081

++			 * Release the request's reference to the old bfqq

8082

++			 * and make sure one is taken to the shared queue.

8083

++			 */

8084

++			new_bfqq->allocated[rq_data_dir(rq)]++;

8085

++			bfqq->allocated[rq_data_dir(rq)]--;

8086

++			atomic_inc(&new_bfqq->ref);

8087

++			bfq_put_queue(bfqq);

8088

++			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)

8089

++				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),

8090

++						bfqq, new_bfqq);

8091

++			rq->elv.priv[1] = new_bfqq;

8092

++			bfqq = new_bfqq;

8093

++		} else

8094

++			bfq_bfqq_increase_failed_cooperations(bfqq);

8095

++	}

8096

++

8097

+ 	bfq_add_request(rq);

8098

+

8099

++	/*

8100

++	 * Here a newly-created bfq_queue has already started a weight-raising

8101

++	 * period: clear wr_time_left to prevent bfq_bfqq_save_state()

8102

++	 * from assigning it a full weight-raising period. See the detailed

8103

++	 * comments about this field in bfq_init_icq().

8104

++	 */

8105

++	if (bfqq->bic)

8106

++		bfqq->bic->wr_time_left = 0;

8107

+ 	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

8108

+ 	list_add_tail(&rq->queuelist, &bfqq->fifo);

8109

+

8110

+@@ -3094,6 +3688,32 @@ static void bfq_put_request(struct request *rq)

8111

+ }

8112

+

8113

+ /*

8114

++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

8115

++ * was the last process referring to said bfqq.

8116

++ */

8117

++static struct bfq_queue *

8118

++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

8119

++{

8120

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

8121

++

8122

++	put_io_context(bic->icq.ioc);

8123

++

8124

++	if (bfqq_process_refs(bfqq) == 1) {

8125

++		bfqq->pid = current->pid;

8126

++		bfq_clear_bfqq_coop(bfqq);

8127

++		bfq_clear_bfqq_split_coop(bfqq);

8128

++		return bfqq;

8129

++	}

8130

++

8131

++	bic_set_bfqq(bic, NULL, 1);

8132

++

8133

++	bfq_put_cooperator(bfqq);

8134

++

8135

++	bfq_put_queue(bfqq);

8136

++	return NULL;

8137

++}

8138

++

8139

++/*

8140

+  * Allocate bfq data structures associated with this request.

8141

+  */

8142

+ static int bfq_set_request(struct request_queue *q, struct request *rq,

8143

+@@ -3105,6 +3725,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

8144

+ 	const int is_sync = rq_is_sync(rq);

8145

+ 	struct bfq_queue *bfqq;

8146

+ 	unsigned long flags;

8147

++	bool split = false;

8148

+

8149

+ 	might_sleep_if(gfpflags_allow_blocking(gfp_mask));

8150

+

8151

+@@ -3117,15 +3738,30 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

8152

+

8153

+ 	bfq_bic_update_cgroup(bic, bio);

8154

+

8155

++new_queue:

8156

+ 	bfqq = bic_to_bfqq(bic, is_sync);

8157

+ 	if (!bfqq || bfqq == &bfqd->oom_bfqq) {

8158

+ 		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);

8159

+ 		bic_set_bfqq(bic, bfqq, is_sync);

8160

+-		if (is_sync) {

8161

+-			if (bfqd->large_burst)

8162

++		if (split && is_sync) {

8163

++			if ((bic->was_in_burst_list && bfqd->large_burst) ||

8164

++			    bic->saved_in_large_burst)

8165

+ 				bfq_mark_bfqq_in_large_burst(bfqq);

8166

+-			else

8167

+-				bfq_clear_bfqq_in_large_burst(bfqq);

8168

++			else {

8169

++			    bfq_clear_bfqq_in_large_burst(bfqq);

8170

++			    if (bic->was_in_burst_list)

8171

++			       hlist_add_head(&bfqq->burst_list_node,

8172

++				              &bfqd->burst_list);

8173

++			}

8174

++		}

8175

++	} else {

8176

++		/* If the queue was seeky for too long, break it apart. */

8177

++		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

8178

++			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

8179

++			bfqq = bfq_split_bfqq(bic, bfqq);

8180

++			split = true;

8181

++			if (!bfqq)

8182

++				goto new_queue;

8183

+ 		}

8184

+ 	}

8185

+

8186

+@@ -3137,6 +3773,26 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

8187

+ 	rq->elv.priv[0] = bic;

8188

+ 	rq->elv.priv[1] = bfqq;

8189

+

8190

++	/*

8191

++	 * If a bfq_queue has only one process reference, it is owned

8192

++	 * by only one bfq_io_cq: we can set the bic field of the

8193

++	 * bfq_queue to the address of that structure. Also, if the

8194

++	 * queue has just been split, mark a flag so that the

8195

++	 * information is available to the other scheduler hooks.

8196

++	 */

8197

++	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {

8198

++		bfqq->bic = bic;

8199

++		if (split) {

8200

++			bfq_mark_bfqq_just_split(bfqq);

8201

++			/*

8202

++			 * If the queue has just been split from a shared

8203

++			 * queue, restore the idle window and the possible

8204

++			 * weight raising period.

8205

++			 */

8206

++			bfq_bfqq_resume_state(bfqq, bic);

8207

++		}

8208

++	}

8209

++

8210

+ 	spin_unlock_irqrestore(q->queue_lock, flags);

8211

+

8212

+ 	return 0;

8213

+@@ -3290,6 +3946,7 @@ static void bfq_init_root_group(struct bfq_group *root_group,

8214

+ 	root_group->my_entity = NULL;

8215

+ 	root_group->bfqd = bfqd;

8216

+ #endif

8217

++	root_group->rq_pos_tree = RB_ROOT;

8218

+ 	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

8219

+ 		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

8220

+ }

8221

+@@ -3370,6 +4027,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

8222

+ 	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;

8223

+ 	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

8224

+

8225

++	bfqd->bfq_coop_thresh = 2;

8226

++	bfqd->bfq_failed_cooperations = 7000;

8227

+ 	bfqd->bfq_requests_within_timer = 120;

8228

+

8229

+ 	bfqd->bfq_large_burst_thresh = 11;

8230

+diff --git a/block/bfq.h b/block/bfq.h

8231

+index 485d0c9..f73c942 100644

8232

+--- a/block/bfq.h

8233

++++ b/block/bfq.h

8234

+@@ -183,6 +183,8 @@ struct bfq_group;

8235

+  *                    ioprio_class value.

8236

+  * @new_bfqq: shared bfq_queue if queue is cooperating with

8237

+  *           one or more other queues.

8238

++ * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree).

8239

++ * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree).

8240

+  * @sort_list: sorted list of pending requests.

8241

+  * @next_rq: if fifo isn't expired, next request to serve.

8242

+  * @queued: nr of requests queued in @sort_list.

8243

+@@ -304,6 +306,26 @@ struct bfq_ttime {

8244

+  * @ttime: associated @bfq_ttime struct

8245

+  * @ioprio: per (request_queue, blkcg) ioprio.

8246

+  * @blkcg_id: id of the blkcg the related io_cq belongs to.

8247

++ * @wr_time_left: snapshot of the time left before weight raising ends

8248

++ *                for the sync queue associated to this process; this

8249

++ *		  snapshot is taken to remember this value while the weight

8250

++ *		  raising is suspended because the queue is merged with a

8251

++ *		  shared queue, and is used to set @wr_cur_max_time

8252

++ *		  when the queue is split from the shared queue and its

8253

++ *		  weight is raised again

8254

++ * @saved_idle_window: same purpose as the previous field for the idle

8255

++ *                     window

8256

++ * @saved_IO_bound: same purpose as the previous two fields for the I/O

8257

++ *                  bound classification of a queue

8258

++ * @saved_in_large_burst: same purpose as the previous fields for the

8259

++ *                        value of the field keeping the queue's belonging

8260

++ *                        to a large burst

8261

++ * @was_in_burst_list: true if the queue belonged to a burst list

8262

++ *                     before its merge with another cooperating queue

8263

++ * @cooperations: counter of consecutive successful queue merges undergone

8264

++ *                by any of the process' @bfq_queues

8265

++ * @failed_cooperations: counter of consecutive failed queue merges of any

8266

++ *                       of the process' @bfq_queues

8267

+  */

8268

+ struct bfq_io_cq {

8269

+ 	struct io_cq icq; /* must be the first member */

8270

+@@ -314,6 +336,16 @@ struct bfq_io_cq {

8271

+ #ifdef CONFIG_BFQ_GROUP_IOSCHED

8272

+ 	uint64_t blkcg_id; /* the current blkcg ID */

8273

+ #endif

8274

++

8275

++	unsigned int wr_time_left;

8276

++	bool saved_idle_window;

8277

++	bool saved_IO_bound;

8278

++

8279

++	bool saved_in_large_burst;

8280

++	bool was_in_burst_list;

8281

++

8282

++	unsigned int cooperations;

8283

++	unsigned int failed_cooperations;

8284

+ };

8285

+

8286

+ enum bfq_device_speed {

8287

+@@ -557,6 +589,9 @@ enum bfqq_state_flags {

8288

+ 					 * may need softrt-next-start

8289

+ 					 * update

8290

+ 					 */

8291

++	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

8292

++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */

8293

++	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */

8294

+ };

8295

+

8296

+ #define BFQ_BFQQ_FNS(name)						\

8297

+@@ -583,6 +618,9 @@ BFQ_BFQQ_FNS(budget_new);

8298

+ BFQ_BFQQ_FNS(IO_bound);

8299

+ BFQ_BFQQ_FNS(in_large_burst);

8300

+ BFQ_BFQQ_FNS(constantly_seeky);

8301

++BFQ_BFQQ_FNS(coop);

8302

++BFQ_BFQQ_FNS(split_coop);

8303

++BFQ_BFQQ_FNS(just_split);

8304

+ BFQ_BFQQ_FNS(softrt_update);

8305

+ #undef BFQ_BFQQ_FNS

8306

+

8307

+@@ -675,6 +713,9 @@ struct bfq_group_data {

8308

+  *                   are groups with more than one active @bfq_entity

8309

+  *                   (see the comments to the function

8310

+  *                   bfq_bfqq_must_not_expire()).

8311

++ * @rq_pos_tree: rbtree sorted by next_request position, used when

8312

++ *               determining if two or more queues have interleaving

8313

++ *               requests (see bfq_find_close_cooperator()).

8314

+  *

8315

+  * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup

8316

+  * there is a set of bfq_groups, each one collecting the lower-level

8317

+@@ -701,6 +742,8 @@ struct bfq_group {

8318

+

8319

+ 	int active_entities;

8320

+

8321

++	struct rb_root rq_pos_tree;

8322

++

8323

+ 	struct bfqg_stats stats;

8324

+ 	struct bfqg_stats dead_stats;	/* stats pushed from dead children */

8325

+ };

8326

+@@ -711,6 +754,8 @@ struct bfq_group {

8327

+

8328

+ 	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

8329

+ 	struct bfq_queue *async_idle_bfqq;

8330

++

8331

++	struct rb_root rq_pos_tree;

8332

+ };

8333

+ #endif

8334

+

8335

+@@ -787,6 +832,27 @@ static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)

8336

+ 	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

8337

+ }

8338

+

8339

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

8340

++

8341

++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)

8342

++{

8343

++	struct bfq_entity *group_entity = bfqq->entity.parent;

8344

++

8345

++	if (!group_entity)

8346

++		group_entity = &bfqq->bfqd->root_group->entity;

8347

++

8348

++	return container_of(group_entity, struct bfq_group, entity);

8349

++}

8350

++

8351

++#else

8352

++

8353

++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)

8354

++{

8355

++	return bfqq->bfqd->root_group;

8356

++}

8357

++

8358

++#endif

8359

++

8360

+ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);

8361

+ static void bfq_put_queue(struct bfq_queue *bfqq);

8362

+ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);

8363

+--

8364

+1.9.1

8365

+

8366

8367

diff --git a/5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1 b/5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1

8368

new file mode 100644

8369

index 0000000..372f093

8370

--- /dev/null

8371

+++ b/5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1

8372

@@ -0,0 +1,6361 @@

8373

+From 21d90fdc7488cd7c28f47b5ba759e62c697c0382 Mon Sep 17 00:00:00 2001

8374

+From: Paolo Valente <paolo.valente@××××××.org>

8375

+Date: Tue, 17 May 2016 08:28:04 +0200

8376

+Subject: [PATCH 4/4] block, bfq: turn BFQ-v7r11 for 4.7.0 into BFQ-v8 for

8377

+ 4.7.0

8378

+

8379

+---

8380

+ block/Kconfig.iosched |    2 +-

8381

+ block/bfq-cgroup.c    |  448 +++++----

8382

+ block/bfq-iosched.c   | 2581 +++++++++++++++++++++++++++++--------------------

8383

+ block/bfq-sched.c     |  432 +++++++--

8384

+ block/bfq.h           |  697 +++++++------

8385

+ 5 files changed, 2433 insertions(+), 1727 deletions(-)

8386

+

8387

+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched

8388

+index f78cd1a..6d92579 100644

8389

+--- a/block/Kconfig.iosched

8390

++++ b/block/Kconfig.iosched

8391

+@@ -53,7 +53,7 @@ config IOSCHED_BFQ

8392

+

8393

+ config BFQ_GROUP_IOSCHED

8394

+ 	bool "BFQ hierarchical scheduling support"

8395

+-	depends on CGROUPS && IOSCHED_BFQ=y

8396

++	depends on IOSCHED_BFQ && BLK_CGROUP

8397

+ 	default n

8398

+ 	---help---

8399

+ 	  Enable hierarchical scheduling in BFQ, using the blkio controller.

8400

+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

8401

+index 5ee99ec..bc01663 100644

8402

+--- a/block/bfq-cgroup.c

8403

++++ b/block/bfq-cgroup.c

8404

+@@ -162,7 +162,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)

8405

+ static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)

8406

+ {

8407

+ 	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);

8408

+-	BUG_ON(!pd);

8409

+ 	return pd_to_bfqg(pd);

8410

+ }

8411

+

8412

+@@ -224,14 +223,6 @@ static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)

8413

+ 	blkg_rwstat_add(&bfqg->stats.merged, rw, 1);

8414

+ }

8415

+

8416

+-static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,

8417

+-					      uint64_t bytes, int rw)

8418

+-{

8419

+-	blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);

8420

+-	blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);

8421

+-	blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);

8422

+-}

8423

+-

8424

+ static void bfqg_stats_update_completion(struct bfq_group *bfqg,

8425

+ 			uint64_t start_time, uint64_t io_start_time, int rw)

8426

+ {

8427

+@@ -248,17 +239,11 @@ static void bfqg_stats_update_completion(struct bfq_group *bfqg,

8428

+ /* @stats = 0 */

8429

+ static void bfqg_stats_reset(struct bfqg_stats *stats)

8430

+ {

8431

+-	if (!stats)

8432

+-		return;

8433

+-

8434

+ 	/* queued stats shouldn't be cleared */

8435

+-	blkg_rwstat_reset(&stats->service_bytes);

8436

+-	blkg_rwstat_reset(&stats->serviced);

8437

+ 	blkg_rwstat_reset(&stats->merged);

8438

+ 	blkg_rwstat_reset(&stats->service_time);

8439

+ 	blkg_rwstat_reset(&stats->wait_time);

8440

+ 	blkg_stat_reset(&stats->time);

8441

+-	blkg_stat_reset(&stats->unaccounted_time);

8442

+ 	blkg_stat_reset(&stats->avg_queue_size_sum);

8443

+ 	blkg_stat_reset(&stats->avg_queue_size_samples);

8444

+ 	blkg_stat_reset(&stats->dequeue);

8445

+@@ -268,21 +253,19 @@ static void bfqg_stats_reset(struct bfqg_stats *stats)

8446

+ }

8447

+

8448

+ /* @to += @from */

8449

+-static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)

8450

++static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)

8451

+ {

8452

+ 	if (!to || !from)

8453

+ 		return;

8454

+

8455

+ 	/* queued stats shouldn't be cleared */

8456

+-	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);

8457

+-	blkg_rwstat_add_aux(&to->serviced, &from->serviced);

8458

+ 	blkg_rwstat_add_aux(&to->merged, &from->merged);

8459

+ 	blkg_rwstat_add_aux(&to->service_time, &from->service_time);

8460

+ 	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);

8461

+ 	blkg_stat_add_aux(&from->time, &from->time);

8462

+-	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);

8463

+ 	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);

8464

+-	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);

8465

++	blkg_stat_add_aux(&to->avg_queue_size_samples,

8466

++			  &from->avg_queue_size_samples);

8467

+ 	blkg_stat_add_aux(&to->dequeue, &from->dequeue);

8468

+ 	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);

8469

+ 	blkg_stat_add_aux(&to->idle_time, &from->idle_time);

8470

+@@ -308,10 +291,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)

8471

+ 	if (unlikely(!parent))

8472

+ 		return;

8473

+

8474

+-	bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);

8475

+-	bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);

8476

++	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);

8477

+ 	bfqg_stats_reset(&bfqg->stats);

8478

+-	bfqg_stats_reset(&bfqg->dead_stats);

8479

+ }

8480

+

8481

+ static void bfq_init_entity(struct bfq_entity *entity,

8482

+@@ -332,15 +313,11 @@ static void bfq_init_entity(struct bfq_entity *entity,

8483

+

8484

+ static void bfqg_stats_exit(struct bfqg_stats *stats)

8485

+ {

8486

+-	blkg_rwstat_exit(&stats->service_bytes);

8487

+-	blkg_rwstat_exit(&stats->serviced);

8488

+ 	blkg_rwstat_exit(&stats->merged);

8489

+ 	blkg_rwstat_exit(&stats->service_time);

8490

+ 	blkg_rwstat_exit(&stats->wait_time);

8491

+ 	blkg_rwstat_exit(&stats->queued);

8492

+-	blkg_stat_exit(&stats->sectors);

8493

+ 	blkg_stat_exit(&stats->time);

8494

+-	blkg_stat_exit(&stats->unaccounted_time);

8495

+ 	blkg_stat_exit(&stats->avg_queue_size_sum);

8496

+ 	blkg_stat_exit(&stats->avg_queue_size_samples);

8497

+ 	blkg_stat_exit(&stats->dequeue);

8498

+@@ -351,15 +328,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats)

8499

+

8500

+ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)

8501

+ {

8502

+-	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||

8503

+-	    blkg_rwstat_init(&stats->serviced, gfp) ||

8504

+-	    blkg_rwstat_init(&stats->merged, gfp) ||

8505

++	if (blkg_rwstat_init(&stats->merged, gfp) ||

8506

+ 	    blkg_rwstat_init(&stats->service_time, gfp) ||

8507

+ 	    blkg_rwstat_init(&stats->wait_time, gfp) ||

8508

+ 	    blkg_rwstat_init(&stats->queued, gfp) ||

8509

+-	    blkg_stat_init(&stats->sectors, gfp) ||

8510

+ 	    blkg_stat_init(&stats->time, gfp) ||

8511

+-	    blkg_stat_init(&stats->unaccounted_time, gfp) ||

8512

+ 	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||

8513

+ 	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||

8514

+ 	    blkg_stat_init(&stats->dequeue, gfp) ||

8515

+@@ -374,20 +347,36 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)

8516

+ }

8517

+

8518

+ static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)

8519

+- {

8520

++{

8521

+ 	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;

8522

+- }

8523

++}

8524

+

8525

+ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)

8526

+ {

8527

+ 	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));

8528

+ }

8529

+

8530

++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)

8531

++{

8532

++	struct bfq_group_data *bgd;

8533

++

8534

++	bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);

8535

++	if (!bgd)

8536

++		return NULL;

8537

++	return &bgd->pd;

8538

++}

8539

++

8540

+ static void bfq_cpd_init(struct blkcg_policy_data *cpd)

8541

+ {

8542

+ 	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

8543

+

8544

+-	d->weight = BFQ_DEFAULT_GRP_WEIGHT;

8545

++	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?

8546

++		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;

8547

++}

8548

++

8549

++static void bfq_cpd_free(struct blkcg_policy_data *cpd)

8550

++{

8551

++	kfree(cpd_to_bfqgd(cpd));

8552

+ }

8553

+

8554

+ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)

8555

+@@ -398,8 +387,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)

8556

+ 	if (!bfqg)

8557

+ 		return NULL;

8558

+

8559

+-	if (bfqg_stats_init(&bfqg->stats, gfp) ||

8560

+-	    bfqg_stats_init(&bfqg->dead_stats, gfp)) {

8561

++	if (bfqg_stats_init(&bfqg->stats, gfp)) {

8562

+ 		kfree(bfqg);

8563

+ 		return NULL;

8564

+ 	}

8565

+@@ -407,27 +395,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)

8566

+ 	return &bfqg->pd;

8567

+ }

8568

+

8569

+-static void bfq_group_set_parent(struct bfq_group *bfqg,

8570

+-					struct bfq_group *parent)

8571

++static void bfq_pd_init(struct blkg_policy_data *pd)

8572

+ {

8573

++	struct blkcg_gq *blkg;

8574

++	struct bfq_group *bfqg;

8575

++	struct bfq_data *bfqd;

8576

+ 	struct bfq_entity *entity;

8577

++	struct bfq_group_data *d;

8578

+

8579

+-	BUG_ON(!parent);

8580

+-	BUG_ON(!bfqg);

8581

+-	BUG_ON(bfqg == parent);

8582

+-

8583

++	blkg = pd_to_blkg(pd);

8584

++	BUG_ON(!blkg);

8585

++	bfqg = blkg_to_bfqg(blkg);

8586

++	bfqd = blkg->q->elevator->elevator_data;

8587

+ 	entity = &bfqg->entity;

8588

+-	entity->parent = parent->my_entity;

8589

+-	entity->sched_data = &parent->sched_data;

8590

+-}

8591

+-

8592

+-static void bfq_pd_init(struct blkg_policy_data *pd)

8593

+-{

8594

+-	struct blkcg_gq *blkg = pd_to_blkg(pd);

8595

+-	struct bfq_group *bfqg = blkg_to_bfqg(blkg);

8596

+-	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;

8597

+-	struct bfq_entity *entity = &bfqg->entity;

8598

+-	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

8599

++	d = blkcg_to_bfqgd(blkg->blkcg);

8600

+

8601

+ 	entity->orig_weight = entity->weight = entity->new_weight = d->weight;

8602

+ 	entity->my_sched_data = &bfqg->sched_data;

8603

+@@ -445,45 +426,28 @@ static void bfq_pd_free(struct blkg_policy_data *pd)

8604

+ 	struct bfq_group *bfqg = pd_to_bfqg(pd);

8605

+

8606

+ 	bfqg_stats_exit(&bfqg->stats);

8607

+-	bfqg_stats_exit(&bfqg->dead_stats);

8608

+-

8609

+ 	return kfree(bfqg);

8610

+ }

8611

+

8612

+-/* offset delta from bfqg->stats to bfqg->dead_stats */

8613

+-static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -

8614

+-					offsetof(struct bfq_group, stats);

8615

+-

8616

+-/* to be used by recursive prfill, sums live and dead stats recursively */

8617

+-static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)

8618

++static void bfq_pd_reset_stats(struct blkg_policy_data *pd)

8619

+ {

8620

+-	u64 sum = 0;

8621

++	struct bfq_group *bfqg = pd_to_bfqg(pd);

8622

+

8623

+-	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);

8624

+-	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,

8625

+-				       off + dead_stats_off_delta);

8626

+-	return sum;

8627

++	bfqg_stats_reset(&bfqg->stats);

8628

+ }

8629

+

8630

+-/* to be used by recursive prfill, sums live and dead rwstats recursively */

8631

+-static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,

8632

+-						       int off)

8633

++static void bfq_group_set_parent(struct bfq_group *bfqg,

8634

++					struct bfq_group *parent)

8635

+ {

8636

+-	struct blkg_rwstat a, b;

8637

+-

8638

+-	a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);

8639

+-	b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,

8640

+-				      off + dead_stats_off_delta);

8641

+-	blkg_rwstat_add_aux(&a, &b);

8642

+-	return a;

8643

+-}

8644

++	struct bfq_entity *entity;

8645

+

8646

+-static void bfq_pd_reset_stats(struct blkg_policy_data *pd)

8647

+-{

8648

+-	struct bfq_group *bfqg = pd_to_bfqg(pd);

8649

++	BUG_ON(!parent);

8650

++	BUG_ON(!bfqg);

8651

++	BUG_ON(bfqg == parent);

8652

+

8653

+-	bfqg_stats_reset(&bfqg->stats);

8654

+-	bfqg_stats_reset(&bfqg->dead_stats);

8655

++	entity = &bfqg->entity;

8656

++	entity->parent = parent->my_entity;

8657

++	entity->sched_data = &parent->sched_data;

8658

+ }

8659

+

8660

+ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

8661

+@@ -531,13 +495,18 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

8662

+ 	return bfqg;

8663

+ }

8664

+

8665

+-static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);

8666

++static void bfq_pos_tree_add_move(struct bfq_data *bfqd,

8667

++				  struct bfq_queue *bfqq);

8668

++

8669

++static void bfq_bfqq_expire(struct bfq_data *bfqd,

8670

++			    struct bfq_queue *bfqq,

8671

++			    bool compensate,

8672

++			    enum bfqq_expiration reason);

8673

+

8674

+ /**

8675

+  * bfq_bfqq_move - migrate @bfqq to @bfqg.

8676

+  * @bfqd: queue descriptor.

8677

+  * @bfqq: the queue to move.

8678

+- * @entity: @bfqq's entity.

8679

+  * @bfqg: the group to move to.

8680

+  *

8681

+  * Move @bfqq to @bfqg, deactivating it from its old group and reactivating

8682

+@@ -548,26 +517,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)

8683

+  * rcu_read_lock()).

8684

+  */

8685

+ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8686

+-			  struct bfq_entity *entity, struct bfq_group *bfqg)

8687

++			  struct bfq_group *bfqg)

8688

+ {

8689

+-	int busy, resume;

8690

++	struct bfq_entity *entity = &bfqq->entity;

8691

+

8692

+-	busy = bfq_bfqq_busy(bfqq);

8693

+-	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);

8694

+-

8695

+-	BUG_ON(resume && !entity->on_st);

8696

+-	BUG_ON(busy && !resume && entity->on_st &&

8697

++	BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list));

8698

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st);

8699

++	BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)

8700

++	       && entity->on_st &&

8701

+ 	       bfqq != bfqd->in_service_queue);

8702

++	BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue);

8703

++

8704

++	/* If bfqq is empty, then bfq_bfqq_expire also invokes

8705

++	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity

8706

++	 * from data structures related to current group. Otherwise we

8707

++	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as

8708

++	 * we do below.

8709

++	 */

8710

++	if (bfqq == bfqd->in_service_queue)

8711

++		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,

8712

++				false, BFQ_BFQQ_PREEMPTED);

8713

++

8714

++	BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)

8715

++	    && &bfq_entity_service_tree(entity)->idle !=

8716

++	       entity->tree);

8717

+

8718

+-	if (busy) {

8719

+-		BUG_ON(atomic_read(&bfqq->ref) < 2);

8720

++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));

8721

+

8722

+-		if (!resume)

8723

+-			bfq_del_bfqq_busy(bfqd, bfqq, 0);

8724

+-		else

8725

+-			bfq_deactivate_bfqq(bfqd, bfqq, 0);

8726

+-	} else if (entity->on_st)

8727

++	if (bfq_bfqq_busy(bfqq))

8728

++		bfq_deactivate_bfqq(bfqd, bfqq, 0);

8729

++	else if (entity->on_st) {

8730

++		BUG_ON(&bfq_entity_service_tree(entity)->idle !=

8731

++		       entity->tree);

8732

+ 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);

8733

++	}

8734

+ 	bfqg_put(bfqq_group(bfqq));

8735

+

8736

+ 	/*

8737

+@@ -579,14 +562,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8738

+ 	entity->sched_data = &bfqg->sched_data;

8739

+ 	bfqg_get(bfqg);

8740

+

8741

+-	if (busy) {

8742

++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));

8743

++	if (bfq_bfqq_busy(bfqq)) {

8744

+ 		bfq_pos_tree_add_move(bfqd, bfqq);

8745

+-		if (resume)

8746

+-			bfq_activate_bfqq(bfqd, bfqq);

8747

++		bfq_activate_bfqq(bfqd, bfqq);

8748

+ 	}

8749

+

8750

+ 	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)

8751

+ 		bfq_schedule_dispatch(bfqd);

8752

++	BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)

8753

++	       && &bfq_entity_service_tree(entity)->idle !=

8754

++	       entity->tree);

8755

+ }

8756

+

8757

+ /**

8758

+@@ -621,7 +607,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,

8759

+ 			bic_set_bfqq(bic, NULL, 0);

8760

+ 			bfq_log_bfqq(bfqd, async_bfqq,

8761

+ 				     "bic_change_group: %p %d",

8762

+-				     async_bfqq, atomic_read(&async_bfqq->ref));

8763

++				     async_bfqq,

8764

++				     async_bfqq->ref);

8765

+ 			bfq_put_queue(async_bfqq);

8766

+ 		}

8767

+ 	}

8768

+@@ -629,7 +616,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,

8769

+ 	if (sync_bfqq) {

8770

+ 		entity = &sync_bfqq->entity;

8771

+ 		if (entity->sched_data != &bfqg->sched_data)

8772

+-			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);

8773

++			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);

8774

+ 	}

8775

+

8776

+ 	return bfqg;

8777

+@@ -638,25 +625,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,

8778

+ static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)

8779

+ {

8780

+ 	struct bfq_data *bfqd = bic_to_bfqd(bic);

8781

+-	struct blkcg *blkcg;

8782

+ 	struct bfq_group *bfqg = NULL;

8783

+-	uint64_t id;

8784

++	uint64_t serial_nr;

8785

+

8786

+ 	rcu_read_lock();

8787

+-	blkcg = bio_blkcg(bio);

8788

+-	id = blkcg->css.serial_nr;

8789

+-	rcu_read_unlock();

8790

++	serial_nr = bio_blkcg(bio)->css.serial_nr;

8791

+

8792

+ 	/*

8793

+ 	 * Check whether blkcg has changed.  The condition may trigger

8794

+ 	 * spuriously on a newly created cic but there's no harm.

8795

+ 	 */

8796

+-	if (unlikely(!bfqd) || likely(bic->blkcg_id == id))

8797

+-		return;

8798

++	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))

8799

++		goto out;

8800

+

8801

+-	bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);

8802

+-	BUG_ON(!bfqg);

8803

+-	bic->blkcg_id = id;

8804

++	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));

8805

++	bic->blkcg_serial_nr = serial_nr;

8806

++out:

8807

++	rcu_read_unlock();

8808

+ }

8809

+

8810

+ /**

8811

+@@ -682,8 +667,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,

8812

+ 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

8813

+

8814

+ 	BUG_ON(!bfqq);

8815

+-	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);

8816

+-	return;

8817

++	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

8818

+ }

8819

+

8820

+ /**

8821

+@@ -711,16 +695,15 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd,

8822

+ 	if (bfqg->sched_data.in_service_entity)

8823

+ 		bfq_reparent_leaf_entity(bfqd,

8824

+ 			bfqg->sched_data.in_service_entity);

8825

+-

8826

+-	return;

8827

+ }

8828

+

8829

+ /**

8830

+- * bfq_destroy_group - destroy @bfqg.

8831

+- * @bfqg: the group being destroyed.

8832

++ * bfq_pd_offline - deactivate the entity associated with @pd,

8833

++ *		    and reparent its children entities.

8834

++ * @pd: descriptor of the policy going offline.

8835

+  *

8836

+- * Destroy @bfqg, making sure that it is not referenced from its parent.

8837

+- * blkio already grabs the queue_lock for us, so no need to use RCU-based magic

8838

++ * blkio already grabs the queue_lock for us, so no need to use

8839

++ * RCU-based magic

8840

+  */

8841

+ static void bfq_pd_offline(struct blkg_policy_data *pd)

8842

+ {

8843

+@@ -779,6 +762,12 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)

8844

+ 	bfq_put_async_queues(bfqd, bfqg);

8845

+ 	BUG_ON(entity->tree);

8846

+

8847

++	/*

8848

++	 * @blkg is going offline and will be ignored by

8849

++	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so

8850

++	 * that they don't get lost.  If IOs complete after this point, the

8851

++	 * stats for them will be lost.  Oh well...

8852

++	 */

8853

+ 	bfqg_stats_xfer_dead(bfqg);

8854

+ }

8855

+

8856

+@@ -788,46 +777,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd)

8857

+

8858

+ 	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {

8859

+ 		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

8860

++		BUG_ON(!bfqg);

8861

+

8862

+ 		bfq_end_wr_async_queues(bfqd, bfqg);

8863

+ 	}

8864

+ 	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

8865

+ }

8866

+

8867

+-static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,

8868

+-				       struct cftype *cftype)

8869

+-{

8870

+-	struct blkcg *blkcg = css_to_blkcg(css);

8871

+-	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

8872

+-	int ret = -EINVAL;

8873

+-

8874

+-	spin_lock_irq(&blkcg->lock);

8875

+-	ret = bfqgd->weight;

8876

+-	spin_unlock_irq(&blkcg->lock);

8877

+-

8878

+-	return ret;

8879

+-}

8880

+-

8881

+-static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v)

8882

++static int bfq_io_show_weight(struct seq_file *sf, void *v)

8883

+ {

8884

+ 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));

8885

+ 	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

8886

++	unsigned int val = 0;

8887

+

8888

+-	spin_lock_irq(&blkcg->lock);

8889

+-	seq_printf(sf, "%u\n", bfqgd->weight);

8890

+-	spin_unlock_irq(&blkcg->lock);

8891

++	if (bfqgd)

8892

++		val = bfqgd->weight;

8893

++

8894

++	seq_printf(sf, "%u\n", val);

8895

+

8896

+ 	return 0;

8897

+ }

8898

+

8899

+-static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,

8900

+-					struct cftype *cftype,

8901

+-					u64 val)

8902

++static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,

8903

++				    struct cftype *cftype,

8904

++				    u64 val)

8905

+ {

8906

+ 	struct blkcg *blkcg = css_to_blkcg(css);

8907

+ 	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

8908

+ 	struct blkcg_gq *blkg;

8909

+-	int ret = -EINVAL;

8910

++	int ret = -ERANGE;

8911

+

8912

+ 	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)

8913

+ 		return ret;

8914

+@@ -837,6 +815,7 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,

8915

+ 	bfqgd->weight = (unsigned short)val;

8916

+ 	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {

8917

+ 		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

8918

++

8919

+ 		if (!bfqg)

8920

+ 			continue;

8921

+ 		/*

8922

+@@ -871,13 +850,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,

8923

+ 	return ret;

8924

+ }

8925

+

8926

+-static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of,

8927

+-					     char *buf, size_t nbytes,

8928

+-					     loff_t off)

8929

++static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,

8930

++				 char *buf, size_t nbytes,

8931

++				 loff_t off)

8932

+ {

8933

++	u64 weight;

8934

+ 	/* First unsigned long found in the file is used */

8935

+-	return bfqio_cgroup_weight_write(of_css(of), NULL,

8936

+-					 simple_strtoull(strim(buf), NULL, 0));

8937

++	int ret = kstrtoull(strim(buf), 0, &weight);

8938

++

8939

++	if (ret)

8940

++		return ret;

8941

++

8942

++	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);

8943

+ }

8944

+

8945

+ static int bfqg_print_stat(struct seq_file *sf, void *v)

8946

+@@ -897,16 +881,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v)

8947

+ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,

8948

+ 				      struct blkg_policy_data *pd, int off)

8949

+ {

8950

+-	u64 sum = bfqg_stat_pd_recursive_sum(pd, off);

8951

+-

8952

++	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),

8953

++					  &blkcg_policy_bfq, off);

8954

+ 	return __blkg_prfill_u64(sf, pd, sum);

8955

+ }

8956

+

8957

+ static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,

8958

+ 					struct blkg_policy_data *pd, int off)

8959

+ {

8960

+-	struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);

8961

+-

8962

++	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),

8963

++							   &blkcg_policy_bfq,

8964

++							   off);

8965

+ 	return __blkg_prfill_rwstat(sf, pd, &sum);

8966

+ }

8967

+

8968

+@@ -926,6 +911,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)

8969

+ 	return 0;

8970

+ }

8971

+

8972

++static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,

8973

++			       int off)

8974

++{

8975

++	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);

8976

++

8977

++	return __blkg_prfill_u64(sf, pd, sum >> 9);

8978

++}

8979

++

8980

++static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)

8981

++{

8982

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),

8983

++			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);

8984

++	return 0;

8985

++}

8986

++

8987

++static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,

8988

++					 struct blkg_policy_data *pd, int off)

8989

++{

8990

++	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,

8991

++					offsetof(struct blkcg_gq, stat_bytes));

8992

++	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +

8993

++		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);

8994

++

8995

++	return __blkg_prfill_u64(sf, pd, sum >> 9);

8996

++}

8997

++

8998

++static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)

8999

++{

9000

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),

9001

++			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,

9002

++			  false);

9003

++	return 0;

9004

++}

9005

++

9006

++

9007

+ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,

9008

+ 				      struct blkg_policy_data *pd, int off)

9009

+ {

9010

+@@ -950,7 +970,8 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)

9011

+ 	return 0;

9012

+ }

9013

+

9014

+-static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)

9015

++static struct bfq_group *

9016

++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)

9017

+ {

9018

+ 	int ret;

9019

+

9020

+@@ -958,41 +979,18 @@ static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int n

9021

+ 	if (ret)

9022

+ 		return NULL;

9023

+

9024

+-        return blkg_to_bfqg(bfqd->queue->root_blkg);

9025

+-}

9026

+-

9027

+-static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)

9028

+-{

9029

+-        struct bfq_group_data *bgd;

9030

+-

9031

+-        bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);

9032

+-        if (!bgd)

9033

+-                return NULL;

9034

+-        return &bgd->pd;

9035

++	return blkg_to_bfqg(bfqd->queue->root_blkg);

9036

+ }

9037

+

9038

+-static void bfq_cpd_free(struct blkcg_policy_data *cpd)

9039

+-{

9040

+-        kfree(cpd_to_bfqgd(cpd));

9041

+-}

9042

+-

9043

+-static struct cftype bfqio_files_dfl[] = {

9044

++static struct cftype bfq_blkcg_legacy_files[] = {

9045

+ 	{

9046

+-		.name = "weight",

9047

++		.name = "bfq.weight",

9048

+ 		.flags = CFTYPE_NOT_ON_ROOT,

9049

+-		.seq_show = bfqio_cgroup_weight_read_dfl,

9050

+-		.write = bfqio_cgroup_weight_write_dfl,

9051

++		.seq_show = bfq_io_show_weight,

9052

++		.write_u64 = bfq_io_set_weight_legacy,

9053

+ 	},

9054

+-	{} /* terminate */

9055

+-};

9056

+

9057

+-static struct cftype bfqio_files[] = {

9058

+-	{

9059

+-		.name = "bfq.weight",

9060

+-		.read_u64 = bfqio_cgroup_weight_read,

9061

+-		.write_u64 = bfqio_cgroup_weight_write,

9062

+-	},

9063

+-	/* statistics, cover only the tasks in the bfqg */

9064

++	/* statistics, covers only the tasks in the bfqg */

9065

+ 	{

9066

+ 		.name = "bfq.time",

9067

+ 		.private = offsetof(struct bfq_group, stats.time),

9068

+@@ -1000,18 +998,17 @@ static struct cftype bfqio_files[] = {

9069

+ 	},

9070

+ 	{

9071

+ 		.name = "bfq.sectors",

9072

+-		.private = offsetof(struct bfq_group, stats.sectors),

9073

+-		.seq_show = bfqg_print_stat,

9074

++		.seq_show = bfqg_print_stat_sectors,

9075

+ 	},

9076

+ 	{

9077

+ 		.name = "bfq.io_service_bytes",

9078

+-		.private = offsetof(struct bfq_group, stats.service_bytes),

9079

+-		.seq_show = bfqg_print_rwstat,

9080

++		.private = (unsigned long)&blkcg_policy_bfq,

9081

++		.seq_show = blkg_print_stat_bytes,

9082

+ 	},

9083

+ 	{

9084

+ 		.name = "bfq.io_serviced",

9085

+-		.private = offsetof(struct bfq_group, stats.serviced),

9086

+-		.seq_show = bfqg_print_rwstat,

9087

++		.private = (unsigned long)&blkcg_policy_bfq,

9088

++		.seq_show = blkg_print_stat_ios,

9089

+ 	},

9090

+ 	{

9091

+ 		.name = "bfq.io_service_time",

9092

+@@ -1042,18 +1039,17 @@ static struct cftype bfqio_files[] = {

9093

+ 	},

9094

+ 	{

9095

+ 		.name = "bfq.sectors_recursive",

9096

+-		.private = offsetof(struct bfq_group, stats.sectors),

9097

+-		.seq_show = bfqg_print_stat_recursive,

9098

++		.seq_show = bfqg_print_stat_sectors_recursive,

9099

+ 	},

9100

+ 	{

9101

+ 		.name = "bfq.io_service_bytes_recursive",

9102

+-		.private = offsetof(struct bfq_group, stats.service_bytes),

9103

+-		.seq_show = bfqg_print_rwstat_recursive,

9104

++		.private = (unsigned long)&blkcg_policy_bfq,

9105

++		.seq_show = blkg_print_stat_bytes_recursive,

9106

+ 	},

9107

+ 	{

9108

+ 		.name = "bfq.io_serviced_recursive",

9109

+-		.private = offsetof(struct bfq_group, stats.serviced),

9110

+-		.seq_show = bfqg_print_rwstat_recursive,

9111

++		.private = (unsigned long)&blkcg_policy_bfq,

9112

++		.seq_show = blkg_print_stat_ios_recursive,

9113

+ 	},

9114

+ 	{

9115

+ 		.name = "bfq.io_service_time_recursive",

9116

+@@ -1099,32 +1095,35 @@ static struct cftype bfqio_files[] = {

9117

+ 		.private = offsetof(struct bfq_group, stats.dequeue),

9118

+ 		.seq_show = bfqg_print_stat,

9119

+ 	},

9120

+-	{

9121

+-		.name = "bfq.unaccounted_time",

9122

+-		.private = offsetof(struct bfq_group, stats.unaccounted_time),

9123

+-		.seq_show = bfqg_print_stat,

9124

+-	},

9125

+ 	{ }	/* terminate */

9126

+ };

9127

+

9128

+-static struct blkcg_policy blkcg_policy_bfq = {

9129

+-       .dfl_cftypes            = bfqio_files_dfl,

9130

+-       .legacy_cftypes         = bfqio_files,

9131

+-

9132

+-       .pd_alloc_fn            = bfq_pd_alloc,

9133

+-       .pd_init_fn             = bfq_pd_init,

9134

+-       .pd_offline_fn          = bfq_pd_offline,

9135

+-       .pd_free_fn             = bfq_pd_free,

9136

+-       .pd_reset_stats_fn      = bfq_pd_reset_stats,

9137

+-

9138

+-       .cpd_alloc_fn           = bfq_cpd_alloc,

9139

+-       .cpd_init_fn            = bfq_cpd_init,

9140

+-       .cpd_bind_fn	       = bfq_cpd_init,

9141

+-       .cpd_free_fn            = bfq_cpd_free,

9142

+-

9143

++static struct cftype bfq_blkg_files[] = {

9144

++	{

9145

++		.name = "bfq.weight",

9146

++		.flags = CFTYPE_NOT_ON_ROOT,

9147

++		.seq_show = bfq_io_show_weight,

9148

++		.write = bfq_io_set_weight,

9149

++	},

9150

++	{} /* terminate */

9151

+ };

9152

+

9153

+-#else

9154

++#else /* CONFIG_BFQ_GROUP_IOSCHED */

9155

++

9156

++static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,

9157

++			struct bfq_queue *bfqq, int rw) { }

9158

++static inline void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) { }

9159

++static inline void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) { }

9160

++static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,

9161

++			uint64_t start_time, uint64_t io_start_time, int rw) { }

9162

++static inline void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,

9163

++struct bfq_group *curr_bfqg) { }

9164

++static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }

9165

++static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }

9166

++static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }

9167

++static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }

9168

++static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }

9169

++static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }

9170

+

9171

+ static void bfq_init_entity(struct bfq_entity *entity,

9172

+ 			    struct bfq_group *bfqg)

9173

+@@ -1146,29 +1145,22 @@ bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)

9174

+ 	return bfqd->root_group;

9175

+ }

9176

+

9177

+-static void bfq_bfqq_move(struct bfq_data *bfqd,

9178

+-			  struct bfq_queue *bfqq,

9179

+-			  struct bfq_entity *entity,

9180

+-			  struct bfq_group *bfqg)

9181

+-{

9182

+-}

9183

+-

9184

+ static void bfq_end_wr_async(struct bfq_data *bfqd)

9185

+ {

9186

+ 	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

9187

+ }

9188

+

9189

+-static void bfq_disconnect_groups(struct bfq_data *bfqd)

9190

+-{

9191

+-	bfq_put_async_queues(bfqd, bfqd->root_group);

9192

+-}

9193

+-

9194

+ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

9195

+                                               struct blkcg *blkcg)

9196

+ {

9197

+ 	return bfqd->root_group;

9198

+ }

9199

+

9200

++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)

9201

++{

9202

++	return bfqq->bfqd->root_group;

9203

++}

9204

++

9205

+ static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)

9206

+ {

9207

+ 	struct bfq_group *bfqg;

9208

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

9209

+index d1f648d..5469442 100644

9210

+--- a/block/bfq-iosched.c

9211

++++ b/block/bfq-iosched.c

9212

+@@ -7,25 +7,26 @@

9213

+  * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

9214

+  *		      Paolo Valente <paolo.valente@×××××××.it>

9215

+  *

9216

+- * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

9217

++ * Copyright (C) 2016 Paolo Valente <paolo.valente@×××××××.it>

9218

+  *

9219

+  * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

9220

+  * file.

9221

+  *

9222

+- * BFQ is a proportional-share storage-I/O scheduling algorithm based on

9223

+- * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,

9224

+- * measured in number of sectors, to processes instead of time slices. The

9225

+- * device is not granted to the in-service process for a given time slice,

9226

+- * but until it has exhausted its assigned budget. This change from the time

9227

+- * to the service domain allows BFQ to distribute the device throughput

9228

+- * among processes as desired, without any distortion due to ZBR, workload

9229

+- * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,

9230

+- * called B-WF2Q+, to schedule processes according to their budgets. More

9231

+- * precisely, BFQ schedules queues associated to processes. Thanks to the

9232

+- * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to

9233

+- * I/O-bound processes issuing sequential requests (to boost the

9234

+- * throughput), and yet guarantee a low latency to interactive and soft

9235

+- * real-time applications.

9236

++ * BFQ is a proportional-share storage-I/O scheduling algorithm based

9237

++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns

9238

++ * budgets, measured in number of sectors, to processes instead of

9239

++ * time slices. The device is not granted to the in-service process

9240

++ * for a given time slice, but until it has exhausted its assigned

9241

++ * budget. This change from the time to the service domain enables BFQ

9242

++ * to distribute the device throughput among processes as desired,

9243

++ * without any distortion due to throughput fluctuations, or to device

9244

++ * internal queueing. BFQ uses an ad hoc internal scheduler, called

9245

++ * B-WF2Q+, to schedule processes according to their budgets. More

9246

++ * precisely, BFQ schedules queues associated with processes. Thanks to

9247

++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high

9248

++ * budgets to I/O-bound processes issuing sequential requests (to

9249

++ * boost the throughput), and yet guarantee a low latency to

9250

++ * interactive and soft real-time applications.

9251

+  *

9252

+  * BFQ is described in [1], where also a reference to the initial, more

9253

+  * theoretical paper on BFQ can be found. The interested reader can find

9254

+@@ -87,7 +88,6 @@ static const int bfq_stats_min_budgets = 194;

9255

+

9256

+ /* Default maximum budget values, in sectors and number of requests. */

9257

+ static const int bfq_default_max_budget = 16 * 1024;

9258

+-static const int bfq_max_budget_async_rq = 4;

9259

+

9260

+ /*

9261

+  * Async to sync throughput distribution is controlled as follows:

9262

+@@ -97,8 +97,7 @@ static const int bfq_max_budget_async_rq = 4;

9263

+ static const int bfq_async_charge_factor = 10;

9264

+

9265

+ /* Default timeout values, in jiffies, approximating CFQ defaults. */

9266

+-static const int bfq_timeout_sync = HZ / 8;

9267

+-static int bfq_timeout_async = HZ / 25;

9268

++static const int bfq_timeout = HZ / 8;

9269

+

9270

+ struct kmem_cache *bfq_pool;

9271

+

9272

+@@ -109,8 +108,9 @@ struct kmem_cache *bfq_pool;

9273

+ #define BFQ_HW_QUEUE_THRESHOLD	4

9274

+ #define BFQ_HW_QUEUE_SAMPLES	32

9275

+

9276

+-#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)

9277

+-#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)

9278

++#define BFQQ_SEEK_THR	 	(sector_t)(8 * 100)

9279

++#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)

9280

++#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

9281

+

9282

+ /* Min samples used for peak rate estimation (for autotuning). */

9283

+ #define BFQ_PEAK_RATE_SAMPLES	32

9284

+@@ -141,16 +141,24 @@ struct kmem_cache *bfq_pool;

9285

+  * The device's speed class is dynamically (re)detected in

9286

+  * bfq_update_peak_rate() every time the estimated peak rate is updated.

9287

+  *

9288

+- * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]

9289

+- * are the reference values for a slow/fast rotational device, whereas

9290

+- * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for

9291

+- * a slow/fast non-rotational device. Finally, device_speed_thresh are the

9292

+- * thresholds used to switch between speed classes.

9293

++ * In the following definitions, R_slow[0]/R_fast[0] and

9294

++ * T_slow[0]/T_fast[0] are the reference values for a slow/fast

9295

++ * rotational device, whereas R_slow[1]/R_fast[1] and

9296

++ * T_slow[1]/T_fast[1] are the reference values for a slow/fast

9297

++ * non-rotational device. Finally, device_speed_thresh are the

9298

++ * thresholds used to switch between speed classes. The reference

9299

++ * rates are not the actual peak rates of the devices used as a

9300

++ * reference, but slightly lower values. The reason for using these

9301

++ * slightly lower values is that the peak-rate estimator tends to

9302

++ * yield slightly lower values than the actual peak rate (it can yield

9303

++ * the actual peak rate only if there is only one process doing I/O,

9304

++ * and the process does sequential I/O).

9305

++ *

9306

+  * Both the reference peak rates and the thresholds are measured in

9307

+  * sectors/usec, left-shifted by BFQ_RATE_SHIFT.

9308

+  */

9309

+-static int R_slow[2] = {1536, 10752};

9310

+-static int R_fast[2] = {17415, 34791};

9311

++static int R_slow[2] = {1000, 10700};

9312

++static int R_fast[2] = {14000, 33000};

9313

+ /*

9314

+  * To improve readability, a conversion function is used to initialize the

9315

+  * following arrays, which entails that they can be initialized only in a

9316

+@@ -410,11 +418,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)

9317

+  */

9318

+ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)

9319

+ {

9320

+-	return

9321

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

9322

+-		!bfqd->active_numerous_groups &&

9323

+-#endif

9324

+-		!bfq_differentiated_weights(bfqd);

9325

++	return !bfq_differentiated_weights(bfqd);

9326

+ }

9327

+

9328

+ /*

9329

+@@ -534,9 +538,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,

9330

+ static unsigned long bfq_serv_to_charge(struct request *rq,

9331

+ 					struct bfq_queue *bfqq)

9332

+ {

9333

+-	return blk_rq_sectors(rq) *

9334

+-		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *

9335

+-		bfq_async_charge_factor));

9336

++	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)

9337

++		return blk_rq_sectors(rq);

9338

++

9339

++	/*

9340

++	 * If there are no weight-raised queues, then amplify service

9341

++	 * by just the async charge factor; otherwise amplify service

9342

++	 * by twice the async charge factor, to further reduce latency

9343

++	 * for weight-raised queues.

9344

++	 */

9345

++	if (bfqq->bfqd->wr_busy_queues == 0)

9346

++		return blk_rq_sectors(rq) * bfq_async_charge_factor;

9347

++

9348

++	return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;

9349

+ }

9350

+

9351

+ /**

9352

+@@ -591,12 +605,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)

9353

+ 	dur = bfqd->RT_prod;

9354

+ 	do_div(dur, bfqd->peak_rate);

9355

+

9356

+-	return dur;

9357

+-}

9358

++	/*

9359

++	 * Limit duration between 3 and 13 seconds. Tests show that

9360

++	 * higher values than 13 seconds often yield the opposite of

9361

++	 * the desired result, i.e., worsen responsiveness by letting

9362

++	 * non-interactive and non-soft-real-time applications

9363

++	 * preserve weight raising for a too long time interval.

9364

++	 *

9365

++	 * On the other end, lower values than 3 seconds make it

9366

++	 * difficult for most interactive tasks to complete their jobs

9367

++	 * before weight-raising finishes.

9368

++	 */

9369

++	if (dur > msecs_to_jiffies(13000))

9370

++		dur = msecs_to_jiffies(13000);

9371

++	else if (dur < msecs_to_jiffies(3000))

9372

++		dur = msecs_to_jiffies(3000);

9373

+

9374

+-static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq)

9375

+-{

9376

+-	return bfqq->bic ? bfqq->bic->cooperations : 0;

9377

++	return dur;

9378

+ }

9379

+

9380

+ static void

9381

+@@ -606,31 +631,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

9382

+ 		bfq_mark_bfqq_idle_window(bfqq);

9383

+ 	else

9384

+ 		bfq_clear_bfqq_idle_window(bfqq);

9385

++

9386

+ 	if (bic->saved_IO_bound)

9387

+ 		bfq_mark_bfqq_IO_bound(bfqq);

9388

+ 	else

9389

+ 		bfq_clear_bfqq_IO_bound(bfqq);

9390

+-	/* Assuming that the flag in_large_burst is already correctly set */

9391

+-	if (bic->wr_time_left && bfqq->bfqd->low_latency &&

9392

+-	    !bfq_bfqq_in_large_burst(bfqq) &&

9393

+-	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {

9394

+-		/*

9395

+-		 * Start a weight raising period with the duration given by

9396

+-		 * the raising_time_left snapshot.

9397

+-		 */

9398

+-		if (bfq_bfqq_busy(bfqq))

9399

+-			bfqq->bfqd->wr_busy_queues++;

9400

+-		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;

9401

+-		bfqq->wr_cur_max_time = bic->wr_time_left;

9402

+-		bfqq->last_wr_start_finish = jiffies;

9403

+-		bfqq->entity.prio_changed = 1;

9404

+-	}

9405

+-	/*

9406

+-	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from

9407

+-	 * getting confused about the queue's need of a weight-raising

9408

+-	 * period.

9409

+-	 */

9410

+-	bic->wr_time_left = 0;

9411

+ }

9412

+

9413

+ static int bfqq_process_refs(struct bfq_queue *bfqq)

9414

+@@ -640,7 +645,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq)

9415

+ 	lockdep_assert_held(bfqq->bfqd->queue->queue_lock);

9416

+

9417

+ 	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

9418

+-	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

9419

++	process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;

9420

+ 	BUG_ON(process_refs < 0);

9421

+ 	return process_refs;

9422

+ }

9423

+@@ -655,6 +660,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)

9424

+ 		hlist_del_init(&item->burst_list_node);

9425

+ 	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

9426

+ 	bfqd->burst_size = 1;

9427

++	bfqd->burst_parent_entity = bfqq->entity.parent;

9428

+ }

9429

+

9430

+ /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */

9431

+@@ -663,6 +669,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

9432

+ 	/* Increment burst size to take into account also bfqq */

9433

+ 	bfqd->burst_size++;

9434

+

9435

++	bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size);

9436

++

9437

++	BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh);

9438

++

9439

+ 	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {

9440

+ 		struct bfq_queue *pos, *bfqq_item;

9441

+ 		struct hlist_node *n;

9442

+@@ -672,15 +682,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

9443

+ 		 * other to consider this burst as large.

9444

+ 		 */

9445

+ 		bfqd->large_burst = true;

9446

++		bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started");

9447

+

9448

+ 		/*

9449

+ 		 * We can now mark all queues in the burst list as

9450

+ 		 * belonging to a large burst.

9451

+ 		 */

9452

+ 		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,

9453

+-				     burst_list_node)

9454

++				     burst_list_node) {

9455

+ 		        bfq_mark_bfqq_in_large_burst(bfqq_item);

9456

++			bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst");

9457

++		}

9458

+ 		bfq_mark_bfqq_in_large_burst(bfqq);

9459

++		bfq_log_bfqq(bfqd, bfqq, "marked in large burst");

9460

+

9461

+ 		/*

9462

+ 		 * From now on, and until the current burst finishes, any

9463

+@@ -692,67 +706,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

9464

+ 		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,

9465

+ 					  burst_list_node)

9466

+ 			hlist_del_init(&pos->burst_list_node);

9467

+-	} else /* burst not yet large: add bfqq to the burst list */

9468

++	} else /*

9469

++		* Burst not yet large: add bfqq to the burst list. Do

9470

++		* not increment the ref counter for bfqq, because bfqq

9471

++		* is removed from the burst list before freeing bfqq

9472

++		* in put_queue.

9473

++		*/

9474

+ 		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

9475

+ }

9476

+

9477

+ /*

9478

+- * If many queues happen to become active shortly after each other, then,

9479

+- * to help the processes associated to these queues get their job done as

9480

+- * soon as possible, it is usually better to not grant either weight-raising

9481

+- * or device idling to these queues. In this comment we describe, firstly,

9482

+- * the reasons why this fact holds, and, secondly, the next function, which

9483

+- * implements the main steps needed to properly mark these queues so that

9484

+- * they can then be treated in a different way.

9485

++ * If many queues belonging to the same group happen to be created

9486

++ * shortly after each other, then the processes associated with these

9487

++ * queues have typically a common goal. In particular, bursts of queue

9488

++ * creations are usually caused by services or applications that spawn

9489

++ * many parallel threads/processes. Examples are systemd during boot,

9490

++ * or git grep. To help these processes get their job done as soon as

9491

++ * possible, it is usually better to not grant either weight-raising

9492

++ * or device idling to their queues.

9493

+  *

9494

+- * As for the terminology, we say that a queue becomes active, i.e.,

9495

+- * switches from idle to backlogged, either when it is created (as a

9496

+- * consequence of the arrival of an I/O request), or, if already existing,

9497

+- * when a new request for the queue arrives while the queue is idle.

9498

+- * Bursts of activations, i.e., activations of different queues occurring

9499

+- * shortly after each other, are typically caused by services or applications

9500

+- * that spawn or reactivate many parallel threads/processes. Examples are

9501

+- * systemd during boot or git grep.

9502

++ * In this comment we describe, firstly, the reasons why this fact

9503

++ * holds, and, secondly, the next function, which implements the main

9504

++ * steps needed to properly mark these queues so that they can then be

9505

++ * treated in a different way.

9506

+  *

9507

+- * These services or applications benefit mostly from a high throughput:

9508

+- * the quicker the requests of the activated queues are cumulatively served,

9509

+- * the sooner the target job of these queues gets completed. As a consequence,

9510

+- * weight-raising any of these queues, which also implies idling the device

9511

+- * for it, is almost always counterproductive: in most cases it just lowers

9512

+- * throughput.

9513

++ * The above services or applications benefit mostly from a high

9514

++ * throughput: the quicker the requests of the activated queues are

9515

++ * cumulatively served, the sooner the target job of these queues gets

9516

++ * completed. As a consequence, weight-raising any of these queues,

9517

++ * which also implies idling the device for it, is almost always

9518

++ * counterproductive. In most cases it just lowers throughput.

9519

+  *

9520

+- * On the other hand, a burst of activations may be also caused by the start

9521

+- * of an application that does not consist in a lot of parallel I/O-bound

9522

+- * threads. In fact, with a complex application, the burst may be just a

9523

+- * consequence of the fact that several processes need to be executed to

9524

+- * start-up the application. To start an application as quickly as possible,

9525

+- * the best thing to do is to privilege the I/O related to the application

9526

+- * with respect to all other I/O. Therefore, the best strategy to start as

9527

+- * quickly as possible an application that causes a burst of activations is

9528

+- * to weight-raise all the queues activated during the burst. This is the

9529

++ * On the other hand, a burst of queue creations may be caused also by

9530

++ * the start of an application that does not consist of a lot of

9531

++ * parallel I/O-bound threads. In fact, with a complex application,

9532

++ * several short processes may need to be executed to start up the

9533

++ * application. In this respect, to start an application as quickly as

9534

++ * possible, the best thing to do is in any case to privilege the I/O

9535

++ * related to the application with respect to all other

9536

++ * I/O. Therefore, the best strategy to start as quickly as possible

9537

++ * an application that causes a burst of queue creations is to

9538

++ * weight-raise all the queues created during the burst. This is the

9539

+  * exact opposite of the best strategy for the other type of bursts.

9540

+  *

9541

+- * In the end, to take the best action for each of the two cases, the two

9542

+- * types of bursts need to be distinguished. Fortunately, this seems

9543

+- * relatively easy to do, by looking at the sizes of the bursts. In

9544

+- * particular, we found a threshold such that bursts with a larger size

9545

+- * than that threshold are apparently caused only by services or commands

9546

+- * such as systemd or git grep. For brevity, hereafter we call just 'large'

9547

+- * these bursts. BFQ *does not* weight-raise queues whose activations occur

9548

+- * in a large burst. In addition, for each of these queues BFQ performs or

9549

+- * does not perform idling depending on which choice boosts the throughput

9550

+- * most. The exact choice depends on the device and request pattern at

9551

++ * In the end, to take the best action for each of the two cases, the

9552

++ * two types of bursts need to be distinguished. Fortunately, this

9553

++ * seems relatively easy, by looking at the sizes of the bursts. In

9554

++ * particular, we found a threshold such that only bursts with a

9555

++ * larger size than that threshold are apparently caused by

9556

++ * services or commands such as systemd or git grep. For brevity,

9557

++ * hereafter we just call these bursts 'large'. BFQ *does not*

9558

++ * weight-raise queues whose creation occurs in a large burst. In

9559

++ * addition, for each of these queues BFQ performs or does not perform

9560

++ * idling depending on which choice boosts the throughput more. The

9561

++ * exact choice depends on the device and request pattern at

9562

+  * hand.

9563

+  *

9564

+- * Turning back to the next function, it implements all the steps needed

9565

+- * to detect the occurrence of a large burst and to properly mark all the

9566

+- * queues belonging to it (so that they can then be treated in a different

9567

+- * way). This goal is achieved by maintaining a special "burst list" that

9568

+- * holds, temporarily, the queues that belong to the burst in progress. The

9569

+- * list is then used to mark these queues as belonging to a large burst if

9570

+- * the burst does become large. The main steps are the following.

9571

++ * Unfortunately, false positives may occur while an interactive task

9572

++ * is starting (e.g., an application is being started). The

9573

++ * consequence is that the queues associated with the task do not

9574

++ * enjoy weight raising as expected. Fortunately these false positives

9575

++ * are very rare. They typically occur if some service happens to

9576

++ * start doing I/O exactly when the interactive task starts.

9577

++ *

9578

++ * Turning back to the next function, it implements all the steps

9579

++ * needed to detect the occurrence of a large burst and to properly

9580

++ * mark all the queues belonging to it (so that they can then be

9581

++ * treated in a different way). This goal is achieved by maintaining a

9582

++ * "burst list" that holds, temporarily, the queues that belong to the

9583

++ * burst in progress. The list is then used to mark these queues as

9584

++ * belonging to a large burst if the burst does become large. The main

9585

++ * steps are the following.

9586

+  *

9587

+- * . when the very first queue is activated, the queue is inserted into the

9588

++ * . when the very first queue is created, the queue is inserted into the

9589

+  *   list (as it could be the first queue in a possible burst)

9590

+  *

9591

+  * . if the current burst has not yet become large, and a queue Q that does

9592

+@@ -773,13 +799,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

9593

+  *

9594

+  *     . the device enters a large-burst mode

9595

+  *

9596

+- * . if a queue Q that does not belong to the burst is activated while

9597

++ * . if a queue Q that does not belong to the burst is created while

9598

+  *   the device is in large-burst mode and shortly after the last time

9599

+  *   at which a queue either entered the burst list or was marked as

9600

+  *   belonging to the current large burst, then Q is immediately marked

9601

+  *   as belonging to a large burst.

9602

+  *

9603

+- * . if a queue Q that does not belong to the burst is activated a while

9604

++ * . if a queue Q that does not belong to the burst is created a while

9605

+  *   later, i.e., not shortly after, than the last time at which a queue

9606

+  *   either entered the burst list or was marked as belonging to the

9607

+  *   current large burst, then the current burst is deemed as finished and:

9608

+@@ -792,52 +818,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

9609

+  *          in a possible new burst (then the burst list contains just Q

9610

+  *          after this step).

9611

+  */

9612

+-static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,

9613

+-			     bool idle_for_long_time)

9614

++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

9615

+ {

9616

+ 	/*

9617

+-	 * If bfqq happened to be activated in a burst, but has been idle

9618

+-	 * for at least as long as an interactive queue, then we assume

9619

+-	 * that, in the overall I/O initiated in the burst, the I/O

9620

+-	 * associated to bfqq is finished. So bfqq does not need to be

9621

+-	 * treated as a queue belonging to a burst anymore. Accordingly,

9622

+-	 * we reset bfqq's in_large_burst flag if set, and remove bfqq

9623

+-	 * from the burst list if it's there. We do not decrement instead

9624

+-	 * burst_size, because the fact that bfqq does not need to belong

9625

+-	 * to the burst list any more does not invalidate the fact that

9626

+-	 * bfqq may have been activated during the current burst.

9627

+-	 */

9628

+-	if (idle_for_long_time) {

9629

+-		hlist_del_init(&bfqq->burst_list_node);

9630

+-		bfq_clear_bfqq_in_large_burst(bfqq);

9631

+-	}

9632

+-

9633

+-	/*

9634

+ 	 * If bfqq is already in the burst list or is part of a large

9635

+-	 * burst, then there is nothing else to do.

9636

++	 * burst, or finally has just been split, then there is

9637

++	 * nothing else to do.

9638

+ 	 */

9639

+ 	if (!hlist_unhashed(&bfqq->burst_list_node) ||

9640

+-	    bfq_bfqq_in_large_burst(bfqq))

9641

++	    bfq_bfqq_in_large_burst(bfqq) ||

9642

++	    time_is_after_eq_jiffies(bfqq->split_time +

9643

++				     msecs_to_jiffies(10)))

9644

+ 		return;

9645

+

9646

+ 	/*

9647

+-	 * If bfqq's activation happens late enough, then the current

9648

+-	 * burst is finished, and related data structures must be reset.

9649

++	 * If bfqq's creation happens late enough, or bfqq belongs to

9650

++	 * a different group than the burst group, then the current

9651

++	 * burst is finished, and related data structures must be

9652

++	 * reset.

9653

+ 	 *

9654

+-	 * In this respect, consider the special case where bfqq is the very

9655

+-	 * first queue being activated. In this case, last_ins_in_burst is

9656

+-	 * not yet significant when we get here. But it is easy to verify

9657

+-	 * that, whether or not the following condition is true, bfqq will

9658

+-	 * end up being inserted into the burst list. In particular the

9659

+-	 * list will happen to contain only bfqq. And this is exactly what

9660

+-	 * has to happen, as bfqq may be the first queue in a possible

9661

++	 * In this respect, consider the special case where bfqq is

9662

++	 * the very first queue created after BFQ is selected for this

9663

++	 * device. In this case, last_ins_in_burst and

9664

++	 * burst_parent_entity are not yet significant when we get

9665

++	 * here. But it is easy to verify that, whether or not the

9666

++	 * following condition is true, bfqq will end up being

9667

++	 * inserted into the burst list. In particular the list will

9668

++	 * happen to contain only bfqq. And this is exactly what has

9669

++	 * to happen, as bfqq may be the first queue of the first

9670

+ 	 * burst.

9671

+ 	 */

9672

+ 	if (time_is_before_jiffies(bfqd->last_ins_in_burst +

9673

+-	    bfqd->bfq_burst_interval)) {

9674

++	    bfqd->bfq_burst_interval) ||

9675

++	    bfqq->entity.parent != bfqd->burst_parent_entity) {

9676

+ 		bfqd->large_burst = false;

9677

+ 		bfq_reset_burst_list(bfqd, bfqq);

9678

+-		return;

9679

++		bfq_log_bfqq(bfqd, bfqq,

9680

++			"handle_burst: late activation or different group");

9681

++		goto end;

9682

+ 	}

9683

+

9684

+ 	/*

9685

+@@ -846,8 +864,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,

9686

+ 	 * bfqq as belonging to this large burst immediately.

9687

+ 	 */

9688

+ 	if (bfqd->large_burst) {

9689

++		bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst");

9690

+ 		bfq_mark_bfqq_in_large_burst(bfqq);

9691

+-		return;

9692

++		goto end;

9693

+ 	}

9694

+

9695

+ 	/*

9696

+@@ -856,25 +875,492 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,

9697

+ 	 * queue. Then we add bfqq to the burst.

9698

+ 	 */

9699

+ 	bfq_add_to_burst(bfqd, bfqq);

9700

++end:

9701

++	/*

9702

++	 * At this point, bfqq either has been added to the current

9703

++	 * burst or has caused the current burst to terminate and a

9704

++	 * possible new burst to start. In particular, in the second

9705

++	 * case, bfqq has become the first queue in the possible new

9706

++	 * burst.  In both cases last_ins_in_burst needs to be moved

9707

++	 * forward.

9708

++	 */

9709

++	bfqd->last_ins_in_burst = jiffies;

9710

++

9711

++}

9712

++

9713

++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)

9714

++{

9715

++	struct bfq_entity *entity = &bfqq->entity;

9716

++	return entity->budget - entity->service;

9717

++}

9718

++

9719

++/*

9720

++ * If enough samples have been computed, return the current max budget

9721

++ * stored in bfqd, which is dynamically updated according to the

9722

++ * estimated disk peak rate; otherwise return the default max budget

9723

++ */

9724

++static int bfq_max_budget(struct bfq_data *bfqd)

9725

++{

9726

++	if (bfqd->budgets_assigned < bfq_stats_min_budgets)

9727

++		return bfq_default_max_budget;

9728

++	else

9729

++		return bfqd->bfq_max_budget;

9730

++}

9731

++

9732

++/*

9733

++ * Return min budget, which is a fraction of the current or default

9734

++ * max budget (trying with 1/32)

9735

++ */

9736

++static int bfq_min_budget(struct bfq_data *bfqd)

9737

++{

9738

++	if (bfqd->budgets_assigned < bfq_stats_min_budgets)

9739

++		return bfq_default_max_budget / 32;

9740

++	else

9741

++		return bfqd->bfq_max_budget / 32;

9742

++}

9743

++

9744

++static void bfq_bfqq_expire(struct bfq_data *bfqd,

9745

++			    struct bfq_queue *bfqq,

9746

++			    bool compensate,

9747

++			    enum bfqq_expiration reason);

9748

++

9749

++/*

9750

++ * The next function, invoked after the input queue bfqq switches from

9751

++ * idle to busy, updates the budget of bfqq. The function also tells

9752

++ * whether the in-service queue should be expired, by returning

9753

++ * true. The purpose of expiring the in-service queue is to give bfqq

9754

++ * the chance to possibly preempt the in-service queue, and the reason

9755

++ * for preempting the in-service queue is to achieve one of the two

9756

++ * goals below.

9757

++ *

9758

++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has

9759

++ * expired because it has remained idle. In particular, bfqq may have

9760

++ * expired for one of the following two reasons:

9761

++ *

9762

++ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and

9763

++ *   did not manage to issue a new request before its last request

9764

++ *   was served;

9765

++ *

9766

++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue

9767

++ *   a new request before the expiration of the idling-time.

9768

++ *

9769

++ * Even if bfqq has expired for one of the above reasons, the process

9770

++ * associated with the queue may be however issuing requests greedily,

9771

++ * and thus be sensitive to the bandwidth it receives (bfqq may have

9772

++ * remained idle for other reasons: CPU high load, bfqq not enjoying

9773

++ * idling, I/O throttling somewhere in the path from the process to

9774

++ * the I/O scheduler, ...). But if, after every expiration for one of

9775

++ * the above two reasons, bfqq has to wait for the service of at least

9776

++ * one full budget of another queue before being served again, then

9777

++ * bfqq is likely to get a much lower bandwidth or resource time than

9778

++ * its reserved ones. To address this issue, two countermeasures need

9779

++ * to be taken.

9780

++ *

9781

++ * First, the budget and the timestamps of bfqq need to be updated in

9782

++ * a special way on bfqq reactivation: they need to be updated as if

9783

++ * bfqq did not remain idle and did not expire. In fact, if they are

9784

++ * computed as if bfqq expired and remained idle until reactivation,

9785

++ * then the process associated with bfqq is treated as if, instead of

9786

++ * being greedy, it stopped issuing requests when bfqq remained idle,

9787

++ * and restarts issuing requests only on this reactivation. In other

9788

++ * words, the scheduler does not help the process recover the "service

9789

++ * hole" between bfqq expiration and reactivation. As a consequence,

9790

++ * the process receives a lower bandwidth than its reserved one. In

9791

++ * contrast, to recover this hole, the budget must be updated as if

9792

++ * bfqq was not expired at all before this reactivation, i.e., it must

9793

++ * be set to the value of the remaining budget when bfqq was

9794

++ * expired. Along the same line, timestamps need to be assigned the

9795

++ * value they had the last time bfqq was selected for service, i.e.,

9796

++ * before last expiration. Thus timestamps need to be back-shifted

9797

++ * with respect to their normal computation (see [1] for more details

9798

++ * on this tricky aspect).

9799

++ *

9800

++ * Secondly, to allow the process to recover the hole, the in-service

9801

++ * queue must be expired too, to give bfqq the chance to preempt it

9802

++ * immediately. In fact, if bfqq has to wait for a full budget of the

9803

++ * in-service queue to be completed, then it may become impossible to

9804

++ * let the process recover the hole, even if the back-shifted

9805

++ * timestamps of bfqq are lower than those of the in-service queue. If

9806

++ * this happens for most or all of the holes, then the process may not

9807

++ * receive its reserved bandwidth. In this respect, it is worth noting

9808

++ * that, since the service of outstanding requests is unpreemptible, a

9809

++ * little fraction of the holes may however be unrecoverable, thereby

9810

++ * causing a little loss of bandwidth.

9811

++ *

9812

++ * The last important point is detecting whether bfqq does need this

9813

++ * bandwidth recovery. In this respect, the next function deems the

9814

++ * process associated with bfqq greedy, and thus allows it to recover

9815

++ * the hole, if: 1) the process is waiting for the arrival of a new

9816

++ * request (which implies that bfqq expired for one of the above two

9817

++ * reasons), and 2) such a request has arrived soon. The first

9818

++ * condition is controlled through the flag non_blocking_wait_rq,

9819

++ * while the second through the flag arrived_in_time. If both

9820

++ * conditions hold, then the function computes the budget in the

9821

++ * above-described special way, and signals that the in-service queue

9822

++ * should be expired. Timestamp back-shifting is done later in

9823

++ * __bfq_activate_entity.

9824

++ *

9825

++ * 2. Reduce latency. Even if timestamps are not backshifted to let

9826

++ * the process associated with bfqq recover a service hole, bfqq may

9827

++ * however happen to have, after being (re)activated, a lower finish

9828

++ * timestamp than the in-service queue.  That is, the next budget of

9829

++ * bfqq may have to be completed before the one of the in-service

9830

++ * queue. If this is the case, then preempting the in-service queue

9831

++ * allows this goal to be achieved, apart from the unpreemptible,

9832

++ * outstanding requests mentioned above.

9833

++ *

9834

++ * Unfortunately, regardless of which of the above two goals one wants

9835

++ * to achieve, service trees need first to be updated to know whether

9836

++ * the in-service queue must be preempted. To have service trees

9837

++ * correctly updated, the in-service queue must be expired and

9838

++ * rescheduled, and bfqq must be scheduled too. This is one of the

9839

++ * most costly operations (in future versions, the scheduling

9840

++ * mechanism may be re-designed in such a way to make it possible to

9841

++ * know whether preemption is needed without needing to update service

9842

++ * trees). In addition, queue preemptions almost always cause random

9843

++ * I/O, and thus loss of throughput. Because of these facts, the next

9844

++ * function adopts the following simple scheme to avoid both costly

9845

++ * operations and too frequent preemptions: it requests the expiration

9846

++ * of the in-service queue (unconditionally) only for queues that need

9847

++ * to recover a hole, or that either are weight-raised or deserve to

9848

++ * be weight-raised.

9849

++ */

9850

++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,

9851

++						struct bfq_queue *bfqq,

9852

++						bool arrived_in_time,

9853

++						bool wr_or_deserves_wr)

9854

++{

9855

++	struct bfq_entity *entity = &bfqq->entity;

9856

++

9857

++	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {

9858

++		/*

9859

++		 * We do not clear the flag non_blocking_wait_rq here, as

9860

++		 * the latter is used in bfq_activate_bfqq to signal

9861

++		 * that timestamps need to be back-shifted (and is

9862

++		 * cleared right after).

9863

++		 */

9864

++

9865

++		/*

9866

++		 * In the next assignment we rely on the fact that neither

9867

++		 * entity->service nor entity->budget is updated

9868

++		 * on expiration if bfqq is empty (see

9869

++		 * __bfq_bfqq_recalc_budget). Thus both quantities

9870

++		 * remain unchanged after such an expiration, and the

9871

++		 * following statement therefore assigns to

9872

++		 * entity->budget the remaining budget on such an

9873

++		 * expiration. For clarity, entity->service is not

9874

++		 * updated on expiration in any case, and, in normal

9875

++		 * operation, is reset only when bfqq is selected for

9876

++		 * service (see bfq_get_next_queue).

9877

++		 */

9878

++		entity->budget = min_t(unsigned long,

9879

++				       bfq_bfqq_budget_left(bfqq),

9880

++				       bfqq->max_budget);

9881

++

9882

++		BUG_ON(entity->budget < 0);

9883

++		return true;

9884

++	}

9885

++

9886

++	entity->budget = max_t(unsigned long, bfqq->max_budget,

9887

++			       bfq_serv_to_charge(bfqq->next_rq,bfqq));

9888

++	BUG_ON(entity->budget < 0);

9889

++

9890

++	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);

9891

++	return wr_or_deserves_wr;

9892

++}

9893

++

9894

++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,

9895

++					     struct bfq_queue *bfqq,

9896

++					     unsigned int old_wr_coeff,

9897

++					     bool wr_or_deserves_wr,

9898

++					     bool interactive,

9899

++					     bool in_burst,

9900

++					     bool soft_rt)

9901

++{

9902

++	if (old_wr_coeff == 1 && wr_or_deserves_wr) {

9903

++		/* start a weight-raising period */

9904

++		bfqq->wr_coeff = bfqd->bfq_wr_coeff;

9905

++		if (interactive) /* update wr duration */

9906

++			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

9907

++		else

9908

++			bfqq->wr_cur_max_time =

9909

++				bfqd->bfq_wr_rt_max_time;

9910

++		/*

9911

++		 * If needed, further reduce budget to make sure it is

9912

++		 * close to bfqq's backlog, so as to reduce the

9913

++		 * scheduling-error component due to a too large

9914

++		 * budget. Do not care about throughput consequences,

9915

++		 * but only about latency. Finally, do not assign a

9916

++		 * too small budget either, to avoid increasing

9917

++		 * latency by causing too frequent expirations.

9918

++		 */

9919

++		bfqq->entity.budget = min_t(unsigned long,

9920

++					    bfqq->entity.budget,

9921

++					    2 * bfq_min_budget(bfqd));

9922

++

9923

++		bfq_log_bfqq(bfqd, bfqq,

9924

++			     "wrais starting at %lu, rais_max_time %u",

9925

++			     jiffies,

9926

++			     jiffies_to_msecs(bfqq->wr_cur_max_time));

9927

++	} else if (old_wr_coeff > 1) {

9928

++		if (interactive) /* update wr duration */

9929

++			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

9930

++		else if (in_burst) {

9931

++			bfqq->wr_coeff = 1;

9932

++			bfq_log_bfqq(bfqd, bfqq,

9933

++				     "wrais ending at %lu, rais_max_time %u",

9934

++				     jiffies,

9935

++				     jiffies_to_msecs(bfqq->

9936

++						      wr_cur_max_time));

9937

++		} else if (time_before(

9938

++				   bfqq->last_wr_start_finish +

9939

++				   bfqq->wr_cur_max_time,

9940

++				   jiffies +

9941

++				   bfqd->bfq_wr_rt_max_time) &&

9942

++			   soft_rt) {

9943

++			/*

9944

++			 * The remaining weight-raising time is lower

9945

++			 * than bfqd->bfq_wr_rt_max_time, which means

9946

++			 * that the application is enjoying weight

9947

++			 * raising either because deemed soft-rt in

9948

++			 * the near past, or because deemed interactive

9949

++			 * long ago.

9950

++			 * In both cases, resetting now the current

9951

++			 * remaining weight-raising time for the

9952

++			 * application to the weight-raising duration

9953

++			 * for soft rt applications would not cause any

9954

++			 * latency increase for the application (as the

9955

++			 * new duration would be higher than the

9956

++			 * remaining time).

9957

++			 *

9958

++			 * In addition, the application is now meeting

9959

++			 * the requirements for being deemed soft rt.

9960

++			 * In the end we can correctly and safely

9961

++			 * (re)charge the weight-raising duration for

9962

++			 * the application with the weight-raising

9963

++			 * duration for soft rt applications.

9964

++			 *

9965

++			 * In particular, doing this recharge now, i.e.,

9966

++			 * before the weight-raising period for the

9967

++			 * application finishes, reduces the probability

9968

++			 * of the following negative scenario:

9969

++			 * 1) the weight of a soft rt application is

9970

++			 *    raised at startup (as for any newly

9971

++			 *    created application),

9972

++			 * 2) since the application is not interactive,

9973

++			 *    at a certain time weight-raising is

9974

++			 *    stopped for the application,

9975

++			 * 3) at that time the application happens to

9976

++			 *    still have pending requests, and hence

9977

++			 *    is destined to not have a chance to be

9978

++			 *    deemed soft rt before these requests are

9979

++			 *    completed (see the comments to the

9980

++			 *    function bfq_bfqq_softrt_next_start()

9981

++			 *    for details on soft rt detection),

9982

++			 * 4) these pending requests experience a high

9983

++			 *    latency because the application is not

9984

++			 *    weight-raised while they are pending.

9985

++			 */

9986

++			bfqq->last_wr_start_finish = jiffies;

9987

++			bfqq->wr_cur_max_time =

9988

++				bfqd->bfq_wr_rt_max_time;

9989

++			bfq_log_bfqq(bfqd, bfqq,

9990

++				     "switching to soft_rt wr, or "

9991

++				     " just moving forward duration");

9992

++		}

9993

++	}

9994

++}

9995

++

9996

++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,

9997

++					struct bfq_queue *bfqq)

9998

++{

9999

++	return bfqq->dispatched == 0 &&

10000

++		time_is_before_jiffies(

10001

++			bfqq->budget_timeout +

10002

++			bfqd->bfq_wr_min_idle_time);

10003

++}

10004

++

10005

++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,

10006

++					     struct bfq_queue *bfqq,

10007

++					     int old_wr_coeff,

10008

++					     struct request *rq,

10009

++					     bool *interactive)

10010

++{

10011

++	bool soft_rt, in_burst,	wr_or_deserves_wr,

10012

++		bfqq_wants_to_preempt,

10013

++		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),

10014

++		/*

10015

++		 * See the comments on

10016

++		 * bfq_bfqq_update_budg_for_activation for

10017

++		 * details on the usage of the next variable.

10018

++		 */

10019

++		arrived_in_time = time_is_after_jiffies(

10020

++			RQ_BIC(rq)->ttime.last_end_request +

10021

++			bfqd->bfq_slice_idle * 3);

10022

++

10023

++	bfq_log_bfqq(bfqd, bfqq,

10024

++		     "bfq_add_request non-busy: "

10025

++		     "jiffies %lu, in_time %d, idle_long %d busyw %d "

10026

++		     "wr_coeff %u",

10027

++		     jiffies, arrived_in_time,

10028

++		     idle_for_long_time,

10029

++		     bfq_bfqq_non_blocking_wait_rq(bfqq),

10030

++		     old_wr_coeff);

10031

++

10032

++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);

10033

++

10034

++	BUG_ON(bfqq == bfqd->in_service_queue);

10035

++	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,

10036

++				 rq->cmd_flags);

10037

++

10038

++	/*

10039

++	 * bfqq deserves to be weight-raised if:

10040

++	 * - it is sync,

10041

++	 * - it does not belong to a large burst,

10042

++	 * - it has been idle for enough time or is soft real-time,

10043

++	 * - is linked to a bfq_io_cq (it is not shared in any sense)

10044

++	 */

10045

++	in_burst = bfq_bfqq_in_large_burst(bfqq);

10046

++	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

10047

++		!in_burst &&

10048

++		time_is_before_jiffies(bfqq->soft_rt_next_start);

10049

++	*interactive =

10050

++		!in_burst &&

10051

++		idle_for_long_time;

10052

++	wr_or_deserves_wr = bfqd->low_latency &&

10053

++		(bfqq->wr_coeff > 1 ||

10054

++		 (bfq_bfqq_sync(bfqq) &&

10055

++		  bfqq->bic && (*interactive || soft_rt)));

10056

++

10057

++	bfq_log_bfqq(bfqd, bfqq,

10058

++		     "bfq_add_request: "

10059

++		     "in_burst %d, "

10060

++		     "soft_rt %d (next %lu), inter %d, bic %p",

10061

++		     bfq_bfqq_in_large_burst(bfqq), soft_rt,

10062

++		     bfqq->soft_rt_next_start,

10063

++		     *interactive,

10064

++		     bfqq->bic);

10065

++

10066

++	/*

10067

++	 * Using the last flag, update budget and check whether bfqq

10068

++	 * may want to preempt the in-service queue.

10069

++	 */

10070

++	bfqq_wants_to_preempt =

10071

++		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,

10072

++						    arrived_in_time,

10073

++						    wr_or_deserves_wr);

10074

++

10075

++	/*

10076

++	 * If bfqq happened to be activated in a burst, but has been

10077

++	 * idle for much more than an interactive queue, then we

10078

++	 * assume that, in the overall I/O initiated in the burst, the

10079

++	 * I/O associated with bfqq is finished. So bfqq does not need

10080

++	 * to be treated as a queue belonging to a burst

10081

++	 * anymore. Accordingly, we reset bfqq's in_large_burst flag

10082

++	 * if set, and remove bfqq from the burst list if it's

10083

++	 * there. We do not decrement burst_size, because the fact

10084

++	 * that bfqq does not need to belong to the burst list any

10085

++	 * more does not invalidate the fact that bfqq was created in

10086

++	 * a burst.

10087

++	 */

10088

++	if (likely(!bfq_bfqq_just_created(bfqq)) &&

10089

++	    idle_for_long_time &&

10090

++	    time_is_before_jiffies(

10091

++		    bfqq->budget_timeout +

10092

++		    msecs_to_jiffies(10000))) {

10093

++		hlist_del_init(&bfqq->burst_list_node);

10094

++		bfq_clear_bfqq_in_large_burst(bfqq);

10095

++	}

10096

++

10097

++	bfq_clear_bfqq_just_created(bfqq);

10098

++

10099

++	if (!bfq_bfqq_IO_bound(bfqq)) {

10100

++		if (arrived_in_time) {

10101

++			bfqq->requests_within_timer++;

10102

++			if (bfqq->requests_within_timer >=

10103

++			    bfqd->bfq_requests_within_timer)

10104

++				bfq_mark_bfqq_IO_bound(bfqq);

10105

++		} else

10106

++			bfqq->requests_within_timer = 0;

10107

++		bfq_log_bfqq(bfqd, bfqq, "requests in time %d",

10108

++			     bfqq->requests_within_timer);

10109

++	}

10110

++

10111

++	if (bfqd->low_latency) {

10112

++		if (unlikely(time_is_after_jiffies(bfqq->split_time)))

10113

++			/* wraparound */

10114

++			bfqq->split_time =

10115

++				jiffies - bfqd->bfq_wr_min_idle_time - 1;

10116

++

10117

++		if (time_is_before_jiffies(bfqq->split_time +

10118

++					   bfqd->bfq_wr_min_idle_time)) {

10119

++			bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,

10120

++							 old_wr_coeff,

10121

++							 wr_or_deserves_wr,

10122

++							 *interactive,

10123

++							 in_burst,

10124

++							 soft_rt);

10125

++

10126

++			if (old_wr_coeff != bfqq->wr_coeff)

10127

++				bfqq->entity.prio_changed = 1;

10128

++		}

10129

++	}

10130

++

10131

++	bfqq->last_idle_bklogged = jiffies;

10132

++	bfqq->service_from_backlogged = 0;

10133

++	bfq_clear_bfqq_softrt_update(bfqq);

10134

++

10135

++	bfq_add_bfqq_busy(bfqd, bfqq);

10136

++

10137

++	/*

10138

++	 * Expire in-service queue only if preemption may be needed

10139

++	 * for guarantees. In this respect, the function

10140

++	 * next_queue_may_preempt just checks a simple, necessary

10141

++	 * condition, and not a sufficient condition based on

10142

++	 * timestamps. In fact, for the latter condition to be

10143

++	 * evaluated, timestamps would need first to be updated, and

10144

++	 * this operation is quite costly (see the comments on the

10145

++	 * function bfq_bfqq_update_budg_for_activation).

10146

++	 */

10147

++	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&

10148

++	    bfqd->in_service_queue->wr_coeff == 1 &&

10149

++	    next_queue_may_preempt(bfqd)) {

10150

++		struct bfq_queue *in_serv =

10151

++			bfqd->in_service_queue;

10152

++		BUG_ON(in_serv == bfqq);

10153

++

10154

++		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,

10155

++				false, BFQ_BFQQ_PREEMPTED);

10156

++		BUG_ON(in_serv->entity.budget < 0);

10157

++	}

10158

+ }

10159

+

10160

+ static void bfq_add_request(struct request *rq)

10161

+ {

10162

+ 	struct bfq_queue *bfqq = RQ_BFQQ(rq);

10163

+-	struct bfq_entity *entity = &bfqq->entity;

10164

+ 	struct bfq_data *bfqd = bfqq->bfqd;

10165

+ 	struct request *next_rq, *prev;

10166

+-	unsigned long old_wr_coeff = bfqq->wr_coeff;

10167

++	unsigned int old_wr_coeff = bfqq->wr_coeff;

10168

+ 	bool interactive = false;

10169

+

10170

+-	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));

10171

++	bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s",

10172

++		     blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A");

10173

++

10174

++	if (bfqq->wr_coeff > 1) /* queue is being weight-raised */

10175

++		bfq_log_bfqq(bfqd, bfqq,

10176

++			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",

10177

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

10178

++			jiffies_to_msecs(bfqq->wr_cur_max_time),

10179

++			bfqq->wr_coeff,

10180

++			bfqq->entity.weight, bfqq->entity.orig_weight);

10181

++

10182

+ 	bfqq->queued[rq_is_sync(rq)]++;

10183

+ 	bfqd->queued++;

10184

+

10185

+ 	elv_rb_add(&bfqq->sort_list, rq);

10186

+

10187

+ 	/*

10188

+-	 * Check if this request is a better next-serve candidate.

10189

++	 * Check if this request is a better next-to-serve candidate.

10190

+ 	 */

10191

+ 	prev = bfqq->next_rq;

10192

+ 	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);

10193

+@@ -887,160 +1373,10 @@ static void bfq_add_request(struct request *rq)

10194

+ 	if (prev != bfqq->next_rq)

10195

+ 		bfq_pos_tree_add_move(bfqd, bfqq);

10196

+

10197

+-	if (!bfq_bfqq_busy(bfqq)) {

10198

+-		bool soft_rt, coop_or_in_burst,

10199

+-		     idle_for_long_time = time_is_before_jiffies(

10200

+-						bfqq->budget_timeout +

10201

+-						bfqd->bfq_wr_min_idle_time);

10202

+-

10203

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

10204

+-		bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,

10205

+-					 rq->cmd_flags);

10206

+-#endif

10207

+-		if (bfq_bfqq_sync(bfqq)) {

10208

+-			bool already_in_burst =

10209

+-			   !hlist_unhashed(&bfqq->burst_list_node) ||

10210

+-			   bfq_bfqq_in_large_burst(bfqq);

10211

+-			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);

10212

+-			/*

10213

+-			 * If bfqq was not already in the current burst,

10214

+-			 * then, at this point, bfqq either has been

10215

+-			 * added to the current burst or has caused the

10216

+-			 * current burst to terminate. In particular, in

10217

+-			 * the second case, bfqq has become the first

10218

+-			 * queue in a possible new burst.

10219

+-			 * In both cases last_ins_in_burst needs to be

10220

+-			 * moved forward.

10221

+-			 */

10222

+-			if (!already_in_burst)

10223

+-				bfqd->last_ins_in_burst = jiffies;

10224

+-		}

10225

+-

10226

+-		coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||

10227

+-			bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;

10228

+-		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

10229

+-			!coop_or_in_burst &&

10230

+-			time_is_before_jiffies(bfqq->soft_rt_next_start);

10231

+-		interactive = !coop_or_in_burst && idle_for_long_time;

10232

+-		entity->budget = max_t(unsigned long, bfqq->max_budget,

10233

+-				       bfq_serv_to_charge(next_rq, bfqq));

10234

+-

10235

+-		if (!bfq_bfqq_IO_bound(bfqq)) {

10236

+-			if (time_before(jiffies,

10237

+-					RQ_BIC(rq)->ttime.last_end_request +

10238

+-					bfqd->bfq_slice_idle)) {

10239

+-				bfqq->requests_within_timer++;

10240

+-				if (bfqq->requests_within_timer >=

10241

+-				    bfqd->bfq_requests_within_timer)

10242

+-					bfq_mark_bfqq_IO_bound(bfqq);

10243

+-			} else

10244

+-				bfqq->requests_within_timer = 0;

10245

+-		}

10246

+-

10247

+-		if (!bfqd->low_latency)

10248

+-			goto add_bfqq_busy;

10249

+-

10250

+-		if (bfq_bfqq_just_split(bfqq))

10251

+-			goto set_prio_changed;

10252

+-

10253

+-		/*

10254

+-		 * If the queue:

10255

+-		 * - is not being boosted,

10256

+-		 * - has been idle for enough time,

10257

+-		 * - is not a sync queue or is linked to a bfq_io_cq (it is

10258

+-		 *   shared "for its nature" or it is not shared and its

10259

+-		 *   requests have not been redirected to a shared queue)

10260

+-		 * start a weight-raising period.

10261

+-		 */

10262

+-		if (old_wr_coeff == 1 && (interactive || soft_rt) &&

10263

+-		    (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {

10264

+-			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

10265

+-			if (interactive)

10266

+-				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

10267

+-			else

10268

+-				bfqq->wr_cur_max_time =

10269

+-					bfqd->bfq_wr_rt_max_time;

10270

+-			bfq_log_bfqq(bfqd, bfqq,

10271

+-				     "wrais starting at %lu, rais_max_time %u",

10272

+-				     jiffies,

10273

+-				     jiffies_to_msecs(bfqq->wr_cur_max_time));

10274

+-		} else if (old_wr_coeff > 1) {

10275

+-			if (interactive)

10276

+-				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

10277

+-			else if (coop_or_in_burst ||

10278

+-				 (bfqq->wr_cur_max_time ==

10279

+-				  bfqd->bfq_wr_rt_max_time &&

10280

+-				  !soft_rt)) {

10281

+-				bfqq->wr_coeff = 1;

10282

+-				bfq_log_bfqq(bfqd, bfqq,

10283

+-					"wrais ending at %lu, rais_max_time %u",

10284

+-					jiffies,

10285

+-					jiffies_to_msecs(bfqq->

10286

+-						wr_cur_max_time));

10287

+-			} else if (time_before(

10288

+-					bfqq->last_wr_start_finish +

10289

+-					bfqq->wr_cur_max_time,

10290

+-					jiffies +

10291

+-					bfqd->bfq_wr_rt_max_time) &&

10292

+-				   soft_rt) {

10293

+-				/*

10294

+-				 *

10295

+-				 * The remaining weight-raising time is lower

10296

+-				 * than bfqd->bfq_wr_rt_max_time, which means

10297

+-				 * that the application is enjoying weight

10298

+-				 * raising either because deemed soft-rt in

10299

+-				 * the near past, or because deemed interactive

10300

+-				 * a long ago.

10301

+-				 * In both cases, resetting now the current

10302

+-				 * remaining weight-raising time for the

10303

+-				 * application to the weight-raising duration

10304

+-				 * for soft rt applications would not cause any

10305

+-				 * latency increase for the application (as the

10306

+-				 * new duration would be higher than the

10307

+-				 * remaining time).

10308

+-				 *

10309

+-				 * In addition, the application is now meeting

10310

+-				 * the requirements for being deemed soft rt.

10311

+-				 * In the end we can correctly and safely

10312

+-				 * (re)charge the weight-raising duration for

10313

+-				 * the application with the weight-raising

10314

+-				 * duration for soft rt applications.

10315

+-				 *

10316

+-				 * In particular, doing this recharge now, i.e.,

10317

+-				 * before the weight-raising period for the

10318

+-				 * application finishes, reduces the probability

10319

+-				 * of the following negative scenario:

10320

+-				 * 1) the weight of a soft rt application is

10321

+-				 *    raised at startup (as for any newly

10322

+-				 *    created application),

10323

+-				 * 2) since the application is not interactive,

10324

+-				 *    at a certain time weight-raising is

10325

+-				 *    stopped for the application,

10326

+-				 * 3) at that time the application happens to

10327

+-				 *    still have pending requests, and hence

10328

+-				 *    is destined to not have a chance to be

10329

+-				 *    deemed soft rt before these requests are

10330

+-				 *    completed (see the comments to the

10331

+-				 *    function bfq_bfqq_softrt_next_start()

10332

+-				 *    for details on soft rt detection),

10333

+-				 * 4) these pending requests experience a high

10334

+-				 *    latency because the application is not

10335

+-				 *    weight-raised while they are pending.

10336

+-				 */

10337

+-				bfqq->last_wr_start_finish = jiffies;

10338

+-				bfqq->wr_cur_max_time =

10339

+-					bfqd->bfq_wr_rt_max_time;

10340

+-			}

10341

+-		}

10342

+-set_prio_changed:

10343

+-		if (old_wr_coeff != bfqq->wr_coeff)

10344

+-			entity->prio_changed = 1;

10345

+-add_bfqq_busy:

10346

+-		bfqq->last_idle_bklogged = jiffies;

10347

+-		bfqq->service_from_backlogged = 0;

10348

+-		bfq_clear_bfqq_softrt_update(bfqq);

10349

+-		bfq_add_bfqq_busy(bfqd, bfqq);

10350

+-	} else {

10351

++	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */

10352

++		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,

10353

++						 rq, &interactive);

10354

++	else {

10355

+ 		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&

10356

+ 		    time_is_before_jiffies(

10357

+ 				bfqq->last_wr_start_finish +

10358

+@@ -1049,16 +1385,43 @@ add_bfqq_busy:

10359

+ 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

10360

+

10361

+ 			bfqd->wr_busy_queues++;

10362

+-			entity->prio_changed = 1;

10363

++			bfqq->entity.prio_changed = 1;

10364

+ 			bfq_log_bfqq(bfqd, bfqq,

10365

+-			    "non-idle wrais starting at %lu, rais_max_time %u",

10366

+-			    jiffies,

10367

+-			    jiffies_to_msecs(bfqq->wr_cur_max_time));

10368

++				     "non-idle wrais starting, "

10369

++				     "wr_max_time %u wr_busy %d",

10370

++				     jiffies_to_msecs(bfqq->wr_cur_max_time),

10371

++				     bfqd->wr_busy_queues);

10372

+ 		}

10373

+ 		if (prev != bfqq->next_rq)

10374

+ 			bfq_updated_next_req(bfqd, bfqq);

10375

+ 	}

10376

+

10377

++	/*

10378

++	 * Assign jiffies to last_wr_start_finish in the following

10379

++	 * cases:

10380

++	 *

10381

++	 * . if bfqq is not going to be weight-raised, because, for

10382

++	 *   non weight-raised queues, last_wr_start_finish stores the

10383

++	 *   arrival time of the last request; as of now, this piece

10384

++	 *   of information is used only for deciding whether to

10385

++	 *   weight-raise async queues

10386

++	 *

10387

++	 * . if bfqq is not weight-raised, because, if bfqq is now

10388

++	 *   switching to weight-raised, then last_wr_start_finish

10389

++	 *   stores the time when weight-raising starts

10390

++	 *

10391

++	 * . if bfqq is interactive, because, regardless of whether

10392

++	 *   bfqq is currently weight-raised, the weight-raising

10393

++	 *   period must start or restart (this case is considered

10394

++	 *   separately because it is not detected by the above

10395

++	 *   conditions, if bfqq is already weight-raised)

10396

++	 *

10397

++	 * last_wr_start_finish has to be updated also if bfqq is soft

10398

++	 * real-time, because the weight-raising period is constantly

10399

++	 * restarted on idle-to-busy transitions for these queues, but

10400

++	 * this is already done in bfq_bfqq_handle_idle_busy_switch if

10401

++	 * needed.

10402

++	 */

10403

+ 	if (bfqd->low_latency &&

10404

+ 		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))

10405

+ 		bfqq->last_wr_start_finish = jiffies;

10406

+@@ -1106,6 +1469,9 @@ static void bfq_remove_request(struct request *rq)

10407

+ 	struct bfq_data *bfqd = bfqq->bfqd;

10408

+ 	const int sync = rq_is_sync(rq);

10409

+

10410

++	BUG_ON(bfqq->entity.service > bfqq->entity.budget &&

10411

++	       bfqq == bfqd->in_service_queue);

10412

++

10413

+ 	if (bfqq->next_rq == rq) {

10414

+ 		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);

10415

+ 		bfq_updated_next_req(bfqd, bfqq);

10416

+@@ -1119,8 +1485,25 @@ static void bfq_remove_request(struct request *rq)

10417

+ 	elv_rb_del(&bfqq->sort_list, rq);

10418

+

10419

+ 	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

10420

+-		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)

10421

++		BUG_ON(bfqq->entity.budget < 0);

10422

++

10423

++		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {

10424

+ 			bfq_del_bfqq_busy(bfqd, bfqq, 1);

10425

++

10426

++			/* bfqq emptied. In normal operation, when

10427

++			 * bfqq is empty, bfqq->entity.service and

10428

++			 * bfqq->entity.budget must contain,

10429

++			 * respectively, the service received and the

10430

++			 * budget used last time bfqq emptied. These

10431

++			 * facts do not hold in this case, as at least

10432

++			 * this last removal occurred while bfqq is

10433

++			 * not in service. To avoid inconsistencies,

10434

++			 * reset both bfqq->entity.service and

10435

++			 * bfqq->entity.budget.

10436

++			 */

10437

++			bfqq->entity.budget = bfqq->entity.service = 0;

10438

++		}

10439

++

10440

+ 		/*

10441

+ 		 * Remove queue from request-position tree as it is empty.

10442

+ 		 */

10443

+@@ -1134,9 +1517,7 @@ static void bfq_remove_request(struct request *rq)

10444

+ 		BUG_ON(bfqq->meta_pending == 0);

10445

+ 		bfqq->meta_pending--;

10446

+ 	}

10447

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

10448

+ 	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);

10449

+-#endif

10450

+ }

10451

+

10452

+ static int bfq_merge(struct request_queue *q, struct request **req,

10453

+@@ -1221,21 +1602,25 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq,

10454

+ 		bfqq->next_rq = rq;

10455

+

10456

+ 	bfq_remove_request(next);

10457

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

10458

+ 	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);

10459

+-#endif

10460

+ }

10461

+

10462

+ /* Must be called with bfqq != NULL */

10463

+ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)

10464

+ {

10465

+ 	BUG_ON(!bfqq);

10466

++

10467

+ 	if (bfq_bfqq_busy(bfqq))

10468

+ 		bfqq->bfqd->wr_busy_queues--;

10469

+ 	bfqq->wr_coeff = 1;

10470

+ 	bfqq->wr_cur_max_time = 0;

10471

+-	/* Trigger a weight change on the next activation of the queue */

10472

++	/*

10473

++	 * Trigger a weight change on the next invocation of

10474

++	 * __bfq_entity_update_weight_prio.

10475

++	 */

10476

+ 	bfqq->entity.prio_changed = 1;

10477

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d",

10478

++		     bfqq->bfqd->wr_busy_queues);

10479

+ }

10480

+

10481

+ static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

10482

+@@ -1278,7 +1663,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request,

10483

+ 				  sector_t sector)

10484

+ {

10485

+ 	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=

10486

+-	       BFQQ_SEEK_THR;

10487

++	       BFQQ_CLOSE_THR;

10488

+ }

10489

+

10490

+ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,

10491

+@@ -1400,7 +1785,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

10492

+ 	 * throughput.

10493

+ 	 */

10494

+ 	bfqq->new_bfqq = new_bfqq;

10495

+-	atomic_add(process_refs, &new_bfqq->ref);

10496

++	new_bfqq->ref += process_refs;

10497

+ 	return new_bfqq;

10498

+ }

10499

+

10500

+@@ -1431,9 +1816,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,

10501

+ }

10502

+

10503

+ /*

10504

+- * Attempt to schedule a merge of bfqq with the currently in-service queue

10505

+- * or with a close queue among the scheduled queues.

10506

+- * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue

10507

++ * If this function returns true, then bfqq cannot be merged. The idea

10508

++ * is that true cooperation happens very early after processes start

10509

++ * to do I/O. Usually, late cooperations are just accidental false

10510

++ * positives. In case bfqq is weight-raised, such false positives

10511

++ * would evidently degrade latency guarantees for bfqq.

10512

++ */

10513

++bool wr_from_too_long(struct bfq_queue *bfqq)

10514

++{

10515

++	return bfqq->wr_coeff > 1 &&

10516

++		time_is_before_jiffies(bfqq->last_wr_start_finish +

10517

++				       msecs_to_jiffies(100));

10518

++}

10519

++

10520

++/*

10521

++ * Attempt to schedule a merge of bfqq with the currently in-service

10522

++ * queue or with a close queue among the scheduled queues.  Return

10523

++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue

10524

+  * structure otherwise.

10525

+  *

10526

+  * The OOM queue is not allowed to participate to cooperation: in fact, since

10527

+@@ -1442,6 +1841,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,

10528

+  * handle merging with the OOM queue would be quite complex and expensive

10529

+  * to maintain. Besides, in such a critical condition as an out of memory,

10530

+  * the benefits of queue merging may be little relevant, or even negligible.

10531

++ *

10532

++ * Weight-raised queues can be merged only if their weight-raising

10533

++ * period has just started. In fact cooperating processes are usually

10534

++ * started together. Thus, with this filter we avoid false positives

10535

++ * that would jeopardize low-latency guarantees.

10536

++ *

10537

++ * WARNING: queue merging may impair fairness among non-weight raised

10538

++ * queues, for at least two reasons: 1) the original weight of a

10539

++ * merged queue may change during the merged state, 2) even if the

10540

++ * weight stays the same, a merged queue may be bloated with many more

10541

++ * requests than the ones produced by its originally-associated

10542

++ * process.

10543

+  */

10544

+ static struct bfq_queue *

10545

+ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,

10546

+@@ -1451,16 +1862,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,

10547

+

10548

+ 	if (bfqq->new_bfqq)

10549

+ 		return bfqq->new_bfqq;

10550

+-	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))

10551

++

10552

++	if (io_struct && wr_from_too_long(bfqq) &&

10553

++	    likely(bfqq != &bfqd->oom_bfqq))

10554

++		bfq_log_bfqq(bfqd, bfqq,

10555

++			     "would have looked for coop, but bfq%d wr",

10556

++			bfqq->pid);

10557

++

10558

++	if (!io_struct ||

10559

++	    wr_from_too_long(bfqq) ||

10560

++	    unlikely(bfqq == &bfqd->oom_bfqq))

10561

+ 		return NULL;

10562

+-	/* If device has only one backlogged bfq_queue, don't search. */

10563

++

10564

++	/* If there is only one backlogged queue, don't search. */

10565

+ 	if (bfqd->busy_queues == 1)

10566

+ 		return NULL;

10567

+

10568

+ 	in_service_bfqq = bfqd->in_service_queue;

10569

+

10570

++	if (in_service_bfqq && in_service_bfqq != bfqq &&

10571

++	    bfqd->in_service_bic && wr_from_too_long(in_service_bfqq)

10572

++	    && likely(in_service_bfqq == &bfqd->oom_bfqq))

10573

++		bfq_log_bfqq(bfqd, bfqq,

10574

++		"would have tried merge with in-service-queue, but wr");

10575

++

10576

+ 	if (!in_service_bfqq || in_service_bfqq == bfqq ||

10577

+-	    !bfqd->in_service_bic ||

10578

++	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||

10579

+ 	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))

10580

+ 		goto check_scheduled;

10581

+

10582

+@@ -1482,7 +1909,15 @@ check_scheduled:

10583

+

10584

+ 	BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);

10585

+

10586

+-	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&

10587

++	if (new_bfqq && wr_from_too_long(new_bfqq) &&

10588

++	    likely(new_bfqq != &bfqd->oom_bfqq) &&

10589

++	    bfq_may_be_close_cooperator(bfqq, new_bfqq))

10590

++		bfq_log_bfqq(bfqd, bfqq,

10591

++			     "would have merged with bfq%d, but wr",

10592

++			     new_bfqq->pid);

10593

++

10594

++	if (new_bfqq && !wr_from_too_long(new_bfqq) &&

10595

++	    likely(new_bfqq != &bfqd->oom_bfqq) &&

10596

+ 	    bfq_may_be_close_cooperator(bfqq, new_bfqq))

10597

+ 		return bfq_setup_merge(bfqq, new_bfqq);

10598

+

10599

+@@ -1498,46 +1933,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)

10600

+ 	 */

10601

+ 	if (!bfqq->bic)

10602

+ 		return;

10603

+-	if (bfqq->bic->wr_time_left)

10604

+-		/*

10605

+-		 * This is the queue of a just-started process, and would

10606

+-		 * deserve weight raising: we set wr_time_left to the full

10607

+-		 * weight-raising duration to trigger weight-raising when

10608

+-		 * and if the queue is split and the first request of the

10609

+-		 * queue is enqueued.

10610

+-		 */

10611

+-		bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);

10612

+-	else if (bfqq->wr_coeff > 1) {

10613

+-		unsigned long wr_duration =

10614

+-			jiffies - bfqq->last_wr_start_finish;

10615

+-		/*

10616

+-		 * It may happen that a queue's weight raising period lasts

10617

+-		 * longer than its wr_cur_max_time, as weight raising is

10618

+-		 * handled only when a request is enqueued or dispatched (it

10619

+-		 * does not use any timer). If the weight raising period is

10620

+-		 * about to end, don't save it.

10621

+-		 */

10622

+-		if (bfqq->wr_cur_max_time <= wr_duration)

10623

+-			bfqq->bic->wr_time_left = 0;

10624

+-		else

10625

+-			bfqq->bic->wr_time_left =

10626

+-				bfqq->wr_cur_max_time - wr_duration;

10627

+-		/*

10628

+-		 * The bfq_queue is becoming shared or the requests of the

10629

+-		 * process owning the queue are being redirected to a shared

10630

+-		 * queue. Stop the weight raising period of the queue, as in

10631

+-		 * both cases it should not be owned by an interactive or

10632

+-		 * soft real-time application.

10633

+-		 */

10634

+-		bfq_bfqq_end_wr(bfqq);

10635

+-	} else

10636

+-		bfqq->bic->wr_time_left = 0;

10637

++

10638

+ 	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);

10639

+ 	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);

10640

+ 	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);

10641

+ 	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);

10642

+-	bfqq->bic->cooperations++;

10643

+-	bfqq->bic->failed_cooperations = 0;

10644

+ }

10645

+

10646

+ static void bfq_get_bic_reference(struct bfq_queue *bfqq)

10647

+@@ -1562,6 +1962,40 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

10648

+ 	if (bfq_bfqq_IO_bound(bfqq))

10649

+ 		bfq_mark_bfqq_IO_bound(new_bfqq);

10650

+ 	bfq_clear_bfqq_IO_bound(bfqq);

10651

++

10652

++	/*

10653

++	 * If bfqq is weight-raised, then let new_bfqq inherit

10654

++	 * weight-raising. To reduce false positives, neglect the case

10655

++	 * where bfqq has just been created, but has not yet made it

10656

++	 * to be weight-raised (which may happen because EQM may merge

10657

++	 * bfqq even before bfq_add_request is executed for the first

10658

++	 * time for bfqq). Handling this case would however be very

10659

++	 * easy, thanks to the flag just_created.

10660

++	 */

10661

++	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {

10662

++		new_bfqq->wr_coeff = bfqq->wr_coeff;

10663

++		new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;

10664

++		new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;

10665

++		if (bfq_bfqq_busy(new_bfqq))

10666

++		    bfqd->wr_busy_queues++;

10667

++		new_bfqq->entity.prio_changed = 1;

10668

++		bfq_log_bfqq(bfqd, new_bfqq,

10669

++			     "wr starting after merge with %d, "

10670

++			     "rais_max_time %u",

10671

++			     bfqq->pid,

10672

++			     jiffies_to_msecs(bfqq->wr_cur_max_time));

10673

++	}

10674

++

10675

++	if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */

10676

++		bfqq->wr_coeff = 1;

10677

++		bfqq->entity.prio_changed = 1;

10678

++		if (bfq_bfqq_busy(bfqq))

10679

++			bfqd->wr_busy_queues--;

10680

++	}

10681

++

10682

++	bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",

10683

++		     bfqd->wr_busy_queues);

10684

++

10685

+ 	/*

10686

+ 	 * Grab a reference to the bic, to prevent it from being destroyed

10687

+ 	 * before being possibly touched by a bfq_split_bfqq().

10688

+@@ -1588,18 +2022,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

10689

+ 	bfq_put_queue(bfqq);

10690

+ }

10691

+

10692

+-static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)

10693

+-{

10694

+-	struct bfq_io_cq *bic = bfqq->bic;

10695

+-	struct bfq_data *bfqd = bfqq->bfqd;

10696

+-

10697

+-	if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {

10698

+-		bic->failed_cooperations++;

10699

+-		if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)

10700

+-			bic->cooperations = 0;

10701

+-	}

10702

+-}

10703

+-

10704

+ static int bfq_allow_merge(struct request_queue *q, struct request *rq,

10705

+ 			   struct bio *bio)

10706

+ {

10707

+@@ -1637,30 +2059,86 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,

10708

+ 			 * to decide whether bio and rq can be merged.

10709

+ 			 */

10710

+ 			bfqq = new_bfqq;

10711

+-		} else

10712

+-			bfq_bfqq_increase_failed_cooperations(bfqq);

10713

++		}

10714

+ 	}

10715

+

10716

+ 	return bfqq == RQ_BFQQ(rq);

10717

+ }

10718

+

10719

++/*

10720

++ * Set the maximum time for the in-service queue to consume its

10721

++ * budget. This prevents seeky processes from lowering the throughput.

10722

++ * In practice, a time-slice service scheme is used with seeky

10723

++ * processes.

10724

++ */

10725

++static void bfq_set_budget_timeout(struct bfq_data *bfqd,

10726

++				   struct bfq_queue *bfqq)

10727

++{

10728

++	unsigned int timeout_coeff;

10729

++	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)

10730

++		timeout_coeff = 1;

10731

++	else

10732

++		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

10733

++

10734

++	bfqd->last_budget_start = ktime_get();

10735

++

10736

++	bfqq->budget_timeout = jiffies +

10737

++		bfqd->bfq_timeout * timeout_coeff;

10738

++

10739

++	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",

10740

++		jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));

10741

++}

10742

++

10743

+ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

10744

+ 				       struct bfq_queue *bfqq)

10745

+ {

10746

+ 	if (bfqq) {

10747

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

10748

+ 		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));

10749

+-#endif

10750

+ 		bfq_mark_bfqq_must_alloc(bfqq);

10751

+-		bfq_mark_bfqq_budget_new(bfqq);

10752

+ 		bfq_clear_bfqq_fifo_expire(bfqq);

10753

+

10754

+ 		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

10755

+

10756

++		BUG_ON(bfqq == bfqd->in_service_queue);

10757

++		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));

10758

++

10759

++		if (bfqq->wr_coeff > 1 &&

10760

++		    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&

10761

++			time_is_before_jiffies(bfqq->budget_timeout)) {

10762

++			/*

10763

++			 * For soft real-time queues, move the start

10764

++			 * of the weight-raising period forward by the

10765

++			 * time the queue has not received any

10766

++			 * service. Otherwise, a relatively long

10767

++			 * service delay is likely to cause the

10768

++			 * weight-raising period of the queue to end,

10769

++			 * because of the short duration of the

10770

++			 * weight-raising period of a soft real-time

10771

++			 * queue.  It is worth noting that this move

10772

++			 * is not so dangerous for the other queues,

10773

++			 * because soft real-time queues are not

10774

++			 * greedy.

10775

++			 *

10776

++			 * To not add a further variable, we use the

10777

++			 * overloaded field budget_timeout to

10778

++			 * determine for how long the queue has not

10779

++			 * received service, i.e., how much time has

10780

++			 * elapsed since the queue expired. However,

10781

++			 * this is a little imprecise, because

10782

++			 * budget_timeout is set to jiffies if bfqq

10783

++			 * not only expires, but also remains with no

10784

++			 * request.

10785

++			 */

10786

++			bfqq->last_wr_start_finish += jiffies -

10787

++				bfqq->budget_timeout;

10788

++		}

10789

++

10790

++		bfq_set_budget_timeout(bfqd, bfqq);

10791

+ 		bfq_log_bfqq(bfqd, bfqq,

10792

+ 			     "set_in_service_queue, cur-budget = %d",

10793

+ 			     bfqq->entity.budget);

10794

+-	}

10795

++	} else

10796

++		bfq_log(bfqd, "set_in_service_queue: NULL");

10797

+

10798

+ 	bfqd->in_service_queue = bfqq;

10799

+ }

10800

+@@ -1676,31 +2154,6 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)

10801

+ 	return bfqq;

10802

+ }

10803

+

10804

+-/*

10805

+- * If enough samples have been computed, return the current max budget

10806

+- * stored in bfqd, which is dynamically updated according to the

10807

+- * estimated disk peak rate; otherwise return the default max budget

10808

+- */

10809

+-static int bfq_max_budget(struct bfq_data *bfqd)

10810

+-{

10811

+-	if (bfqd->budgets_assigned < bfq_stats_min_budgets)

10812

+-		return bfq_default_max_budget;

10813

+-	else

10814

+-		return bfqd->bfq_max_budget;

10815

+-}

10816

+-

10817

+-/*

10818

+- * Return min budget, which is a fraction of the current or default

10819

+- * max budget (trying with 1/32)

10820

+- */

10821

+-static int bfq_min_budget(struct bfq_data *bfqd)

10822

+-{

10823

+-	if (bfqd->budgets_assigned < bfq_stats_min_budgets)

10824

+-		return bfq_default_max_budget / 32;

10825

+-	else

10826

+-		return bfqd->bfq_max_budget / 32;

10827

+-}

10828

+-

10829

+ static void bfq_arm_slice_timer(struct bfq_data *bfqd)

10830

+ {

10831

+ 	struct bfq_queue *bfqq = bfqd->in_service_queue;

10832

+@@ -1723,64 +2176,36 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)

10833

+ 	 *

10834

+ 	 * To prevent processes with (partly) seeky workloads from

10835

+ 	 * being too ill-treated, grant them a small fraction of the

10836

+-	 * assigned budget before reducing the waiting time to

10837

+-	 * BFQ_MIN_TT. This happened to help reduce latency.

10838

+-	 */

10839

+-	sl = bfqd->bfq_slice_idle;

10840

+-	/*

10841

+-	 * Unless the queue is being weight-raised or the scenario is

10842

+-	 * asymmetric, grant only minimum idle time if the queue either

10843

+-	 * has been seeky for long enough or has already proved to be

10844

+-	 * constantly seeky.

10845

+-	 */

10846

+-	if (bfq_sample_valid(bfqq->seek_samples) &&

10847

+-	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >

10848

+-				  bfq_max_budget(bfqq->bfqd) / 8) ||

10849

+-	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&

10850

+-	    bfq_symmetric_scenario(bfqd))

10851

+-		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));

10852

+-	else if (bfqq->wr_coeff > 1)

10853

+-		sl = sl * 3;

10854

+-	bfqd->last_idling_start = ktime_get();

10855

+-	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);

10856

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

10857

+-	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));

10858

+-#endif

10859

+-	bfq_log(bfqd, "arm idle: %u/%u ms",

10860

+-		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));

10861

+-}

10862

+-

10863

+-/*

10864

+- * Set the maximum time for the in-service queue to consume its

10865

+- * budget. This prevents seeky processes from lowering the disk

10866

+- * throughput (always guaranteed with a time slice scheme as in CFQ).

10867

+- */

10868

+-static void bfq_set_budget_timeout(struct bfq_data *bfqd)

10869

+-{

10870

+-	struct bfq_queue *bfqq = bfqd->in_service_queue;

10871

+-	unsigned int timeout_coeff;

10872

+-	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)

10873

+-		timeout_coeff = 1;

10874

+-	else

10875

+-		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

10876

+-

10877

+-	bfqd->last_budget_start = ktime_get();

10878

+-

10879

+-	bfq_clear_bfqq_budget_new(bfqq);

10880

+-	bfqq->budget_timeout = jiffies +

10881

+-		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;

10882

++	 * assigned budget before reducing the waiting time to

10883

++	 * BFQ_MIN_TT. This happened to help reduce latency.

10884

++	 */

10885

++	sl = bfqd->bfq_slice_idle;

10886

++	/*

10887

++	 * Unless the queue is being weight-raised or the scenario is

10888

++	 * asymmetric, grant only minimum idle time if the queue

10889

++	 * is seeky. A long idling is preserved for a weight-raised

10890

++	 * queue, or, more generally, in an asymmetric scenario,

10891

++	 * because a long idling is needed for guaranteeing to a queue

10892

++	 * its reserved share of the throughput (in particular, it is

10893

++	 * needed if the queue has a higher weight than some other

10894

++	 * queue).

10895

++	 */

10896

++	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&

10897

++	    bfq_symmetric_scenario(bfqd))

10898

++		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));

10899

+

10900

+-	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",

10901

+-		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *

10902

+-		timeout_coeff));

10903

++	bfqd->last_idling_start = ktime_get();

10904

++	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);

10905

++	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));

10906

++	bfq_log(bfqd, "arm idle: %u/%u ms",

10907

++		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));

10908

+ }

10909

+

10910

+ /*

10911

+- * Move request from internal lists to the request queue dispatch list.

10912

++ * Move request from internal lists to the dispatch list of the request queue

10913

+  */

10914

+ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)

10915

+ {

10916

+-	struct bfq_data *bfqd = q->elevator->elevator_data;

10917

+ 	struct bfq_queue *bfqq = RQ_BFQQ(rq);

10918

+

10919

+ 	/*

10920

+@@ -1794,15 +2219,9 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)

10921

+ 	 * incrementing bfqq->dispatched.

10922

+ 	 */

10923

+ 	bfqq->dispatched++;

10924

++

10925

+ 	bfq_remove_request(rq);

10926

+ 	elv_dispatch_sort(q, rq);

10927

+-

10928

+-	if (bfq_bfqq_sync(bfqq))

10929

+-		bfqd->sync_flight++;

10930

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

10931

+-	bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq),

10932

+-				   rq->cmd_flags);

10933

+-#endif

10934

+ }

10935

+

10936

+ /*

10937

+@@ -1822,18 +2241,12 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

10938

+

10939

+ 	rq = rq_entry_fifo(bfqq->fifo.next);

10940

+

10941

+-	if (time_before(jiffies, rq->fifo_time))

10942

++	if (time_is_after_jiffies(rq->fifo_time))

10943

+ 		return NULL;

10944

+

10945

+ 	return rq;

10946

+ }

10947

+

10948

+-static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)

10949

+-{

10950

+-	struct bfq_entity *entity = &bfqq->entity;

10951

+-	return entity->budget - entity->service;

10952

+-}

10953

+-

10954

+ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

10955

+ {

10956

+ 	BUG_ON(bfqq != bfqd->in_service_queue);

10957

+@@ -1850,12 +2263,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

10958

+ 		bfq_mark_bfqq_split_coop(bfqq);

10959

+

10960

+ 	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

10961

+-		/*

10962

+-		 * Overloading budget_timeout field to store the time

10963

+-		 * at which the queue remains with no backlog; used by

10964

+-		 * the weight-raising mechanism.

10965

+-		 */

10966

+-		bfqq->budget_timeout = jiffies;

10967

++		if (bfqq->dispatched == 0)

10968

++			/*

10969

++			 * Overloading budget_timeout field to store

10970

++			 * the time at which the queue remains with no

10971

++			 * backlog and no outstanding request; used by

10972

++			 * the weight-raising mechanism.

10973

++			 */

10974

++			bfqq->budget_timeout = jiffies;

10975

++

10976

+ 		bfq_del_bfqq_busy(bfqd, bfqq, 1);

10977

+ 	} else {

10978

+ 		bfq_activate_bfqq(bfqd, bfqq);

10979

+@@ -1882,10 +2298,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

10980

+ 	struct request *next_rq;

10981

+ 	int budget, min_budget;

10982

+

10983

+-	budget = bfqq->max_budget;

10984

++	BUG_ON(bfqq != bfqd->in_service_queue);

10985

++

10986

+ 	min_budget = bfq_min_budget(bfqd);

10987

+

10988

+-	BUG_ON(bfqq != bfqd->in_service_queue);

10989

++	if (bfqq->wr_coeff == 1)

10990

++		budget = bfqq->max_budget;

10991

++	else /*

10992

++	      * Use a constant, low budget for weight-raised queues,

10993

++	      * to help achieve a low latency. Keep it slightly higher

10994

++	      * than the minimum possible budget, to cause a little

10995

++	      * bit fewer expirations.

10996

++	      */

10997

++		budget = 2 * min_budget;

10998

+

10999

+ 	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",

11000

+ 		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));

11001

+@@ -1894,7 +2319,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

11002

+ 	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",

11003

+ 		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

11004

+

11005

+-	if (bfq_bfqq_sync(bfqq)) {

11006

++	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {

11007

+ 		switch (reason) {

11008

+ 		/*

11009

+ 		 * Caveat: in all the following cases we trade latency

11010

+@@ -1936,14 +2361,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

11011

+ 			break;

11012

+ 		case BFQ_BFQQ_BUDGET_TIMEOUT:

11013

+ 			/*

11014

+-			 * We double the budget here because: 1) it

11015

+-			 * gives the chance to boost the throughput if

11016

+-			 * this is not a seeky process (which may have

11017

+-			 * bumped into this timeout because of, e.g.,

11018

+-			 * ZBR), 2) together with charge_full_budget

11019

+-			 * it helps give seeky processes higher

11020

+-			 * timestamps, and hence be served less

11021

+-			 * frequently.

11022

++			 * We double the budget here because it gives

11023

++			 * the chance to boost the throughput if this

11024

++			 * is not a seeky process (and has bumped into

11025

++			 * this timeout because of, e.g., ZBR).

11026

+ 			 */

11027

+ 			budget = min(budget * 2, bfqd->bfq_max_budget);

11028

+ 			break;

11029

+@@ -1960,17 +2381,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

11030

+ 			budget = min(budget * 4, bfqd->bfq_max_budget);

11031

+ 			break;

11032

+ 		case BFQ_BFQQ_NO_MORE_REQUESTS:

11033

+-		       /*

11034

+-			* Leave the budget unchanged.

11035

+-			*/

11036

++			/*

11037

++			 * For queues that expire for this reason, it

11038

++			 * is particularly important to keep the

11039

++			 * budget close to the actual service they

11040

++			 * need. Doing so reduces the timestamp

11041

++			 * misalignment problem described in the

11042

++			 * comments in the body of

11043

++			 * __bfq_activate_entity. In fact, suppose

11044

++			 * that a queue systematically expires for

11045

++			 * BFQ_BFQQ_NO_MORE_REQUESTS and presents a

11046

++			 * new request in time to enjoy timestamp

11047

++			 * back-shifting. The larger the budget of the

11048

++			 * queue is with respect to the service the

11049

++			 * queue actually requests in each service

11050

++			 * slot, the more times the queue can be

11051

++			 * reactivated with the same virtual finish

11052

++			 * time. It follows that, even if this finish

11053

++			 * time is pushed to the system virtual time

11054

++			 * to reduce the consequent timestamp

11055

++			 * misalignment, the queue unjustly enjoys for

11056

++			 * many re-activations a lower finish time

11057

++			 * than all newly activated queues.

11058

++			 *

11059

++			 * The service needed by bfqq is measured

11060

++			 * quite precisely by bfqq->entity.service.

11061

++			 * Since bfqq does not enjoy device idling,

11062

++			 * bfqq->entity.service is equal to the number

11063

++			 * of sectors that the process associated with

11064

++			 * bfqq requested to read/write before waiting

11065

++			 * for request completions, or blocking for

11066

++			 * other reasons.

11067

++			 */

11068

++			budget = max_t(int, bfqq->entity.service, min_budget);

11069

++			break;

11070

+ 		default:

11071

+ 			return;

11072

+ 		}

11073

+-	} else

11074

++	} else if (!bfq_bfqq_sync(bfqq))

11075

+ 		/*

11076

+-		 * Async queues get always the maximum possible budget

11077

+-		 * (their ability to dispatch is limited by

11078

+-		 * @bfqd->bfq_max_budget_async_rq).

11079

++		 * Async queues always get the maximum possible

11080

++		 * budget, as for them we do not care about latency

11081

++		 * (in addition, their ability to dispatch is limited

11082

++		 * by the charging factor).

11083

+ 		 */

11084

+ 		budget = bfqd->bfq_max_budget;

11085

+

11086

+@@ -1981,65 +2434,105 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

11087

+ 		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

11088

+

11089

+ 	/*

11090

+-	 * Make sure that we have enough budget for the next request.

11091

+-	 * Since the finish time of the bfqq must be kept in sync with

11092

+-	 * the budget, be sure to call __bfq_bfqq_expire() after the

11093

++	 * If there is still backlog, then assign a new budget, making

11094

++	 * sure that it is large enough for the next request.  Since

11095

++	 * the finish time of bfqq must be kept in sync with the

11096

++	 * budget, be sure to call __bfq_bfqq_expire() *after* this

11097

+ 	 * update.

11098

++	 *

11099

++	 * If there is no backlog, then no need to update the budget;

11100

++	 * it will be updated on the arrival of a new request.

11101

+ 	 */

11102

+ 	next_rq = bfqq->next_rq;

11103

+-	if (next_rq)

11104

++	if (next_rq) {

11105

++		BUG_ON(reason == BFQ_BFQQ_TOO_IDLE ||

11106

++		       reason == BFQ_BFQQ_NO_MORE_REQUESTS);

11107

+ 		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,

11108

+ 					    bfq_serv_to_charge(next_rq, bfqq));

11109

+-	else

11110

+-		bfqq->entity.budget = bfqq->max_budget;

11111

++		BUG_ON(!bfq_bfqq_busy(bfqq));

11112

++		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));

11113

++	}

11114

+

11115

+ 	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",

11116

+ 			next_rq ? blk_rq_sectors(next_rq) : 0,

11117

+ 			bfqq->entity.budget);

11118

+ }

11119

+

11120

+-static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)

11121

++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)

11122

+ {

11123

+-	unsigned long max_budget;

11124

+-

11125

+ 	/*

11126

+ 	 * The max_budget calculated when autotuning is equal to the

11127

+-	 * amount of sectors transfered in timeout_sync at the

11128

++	 * amount of sectors transferred in timeout at the

11129

+ 	 * estimated peak rate.

11130

+ 	 */

11131

+-	max_budget = (unsigned long)(peak_rate * 1000 *

11132

+-				     timeout >> BFQ_RATE_SHIFT);

11133

+-

11134

+-	return max_budget;

11135

++	return bfqd->peak_rate * 1000 * jiffies_to_msecs(bfqd->bfq_timeout) >>

11136

++		BFQ_RATE_SHIFT;

11137

+ }

11138

+

11139

+ /*

11140

+- * In addition to updating the peak rate, checks whether the process

11141

+- * is "slow", and returns 1 if so. This slow flag is used, in addition

11142

+- * to the budget timeout, to reduce the amount of service provided to

11143

+- * seeky processes, and hence reduce their chances to lower the

11144

+- * throughput. See the code for more details.

11145

++ * Update the read peak rate (quantity used for auto-tuning) as a

11146

++ * function of the rate at which bfqq has been served, and check

11147

++ * whether the process associated with bfqq is "slow". Return true if

11148

++ * the process is slow. The slow flag is used, in addition to the

11149

++ * budget timeout, to reduce the amount of service provided to seeky

11150

++ * processes, and hence reduce their chances to lower the

11151

++ * throughput. More details in the body of the function.

11152

++ *

11153

++ * An important observation is in order: with devices with internal

11154

++ * queues, it is hard, if at all possible, to know when and for how long

11155

++ * an I/O request is processed by the device (apart from the trivial

11156

++ * I/O pattern where a new request is dispatched only after the

11157

++ * previous one has been completed). This makes it hard to evaluate

11158

++ * the real rate at which the I/O requests of each bfq_queue are

11159

++ * served.  In fact, for an I/O scheduler like BFQ, serving a

11160

++ * bfq_queue means just dispatching its requests during its service

11161

++ * slot, i.e., until the budget of the queue is exhausted, or the

11162

++ * queue remains idle, or, finally, a timeout fires. But, during the

11163

++ * service slot of a bfq_queue, the device may be still processing

11164

++ * requests of bfq_queues served in previous service slots. On the

11165

++ * opposite end, the requests of the in-service bfq_queue may be

11166

++ * completed after the service slot of the queue finishes. Anyway,

11167

++ * unless more sophisticated solutions are used (where possible), the

11168

++ * sum of the sizes of the requests dispatched during the service slot

11169

++ * of a bfq_queue is probably the only approximation available for

11170

++ * the service received by the bfq_queue during its service slot. And,

11171

++ * as written above, this sum is the quantity used in this function to

11172

++ * evaluate the peak rate.

11173

+  */

11174

+ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

11175

+-				 bool compensate, enum bfqq_expiration reason)

11176

++				 bool compensate, enum bfqq_expiration reason,

11177

++				 unsigned long *delta_ms)

11178

+ {

11179

+-	u64 bw, usecs, expected, timeout;

11180

+-	ktime_t delta;

11181

++	u64 bw, bwdiv10, delta_usecs, delta_ms_tmp;

11182

++	ktime_t delta_ktime;

11183

+ 	int update = 0;

11184

++	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */

11185

+

11186

+-	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))

11187

++	if (!bfq_bfqq_sync(bfqq))

11188

+ 		return false;

11189

+

11190

+ 	if (compensate)

11191

+-		delta = bfqd->last_idling_start;

11192

++		delta_ktime = bfqd->last_idling_start;

11193

+ 	else

11194

+-		delta = ktime_get();

11195

+-	delta = ktime_sub(delta, bfqd->last_budget_start);

11196

+-	usecs = ktime_to_us(delta);

11197

++		delta_ktime = ktime_get();

11198

++	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);

11199

++	delta_usecs = ktime_to_us(delta_ktime);

11200

+

11201

+ 	/* Don't trust short/unrealistic values. */

11202

+-	if (usecs < 100 || usecs >= LONG_MAX)

11203

+-		return false;

11204

++	if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) {

11205

++		if (blk_queue_nonrot(bfqd->queue))

11206

++			*delta_ms = BFQ_MIN_TT; /* give same worst-case

11207

++						   guarantees as

11208

++						   idling for seeky

11209

++						*/

11210

++		else /* Charge at least one seek */

11211

++			*delta_ms = jiffies_to_msecs(bfq_slice_idle);

11212

++		return slow;

11213

++	}

11214

++

11215

++	delta_ms_tmp = delta_usecs;

11216

++	do_div(delta_ms_tmp, 1000);

11217

++	*delta_ms = delta_ms_tmp;

11218

+

11219

+ 	/*

11220

+ 	 * Calculate the bandwidth for the last slice.  We use a 64 bit

11221

+@@ -2048,32 +2541,51 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

11222

+ 	 * and to avoid overflows.

11223

+ 	 */

11224

+ 	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;

11225

+-	do_div(bw, (unsigned long)usecs);

11226

+-

11227

+-	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

11228

++	do_div(bw, (unsigned long)delta_usecs);

11229

+

11230

++	bfq_log(bfqd, "measured bw = %llu sects/sec",

11231

++		(1000000*bw)>>BFQ_RATE_SHIFT);

11232

+ 	/*

11233

+ 	 * Use only long (> 20ms) intervals to filter out spikes for

11234

+ 	 * the peak rate estimation.

11235

+ 	 */

11236

+-	if (usecs > 20000) {

11237

++	if (delta_usecs > 20000) {

11238

++		bool fully_sequential = bfqq->seek_history == 0;

11239

++		/*

11240

++		 * Soft real-time queues are not good candidates for

11241

++		 * evaluating bw, as they are likely to be slow even

11242

++		 * if sequential.

11243

++		 */

11244

++		bool non_soft_rt = bfqq->wr_coeff == 1 ||

11245

++			bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time;

11246

++		bool consumed_large_budget =

11247

++			reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&

11248

++			bfqq->entity.budget >= bfqd->bfq_max_budget * 2 / 3;

11249

++		bool served_for_long_time =

11250

++			reason == BFQ_BFQQ_BUDGET_TIMEOUT ||

11251

++			consumed_large_budget;

11252

++

11253

++		BUG_ON(bfqq->seek_history == 0 &&

11254

++		       hweight32(bfqq->seek_history) != 0);

11255

++

11256

+ 		if (bw > bfqd->peak_rate ||

11257

+-		   (!BFQQ_SEEKY(bfqq) &&

11258

+-		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {

11259

+-			bfq_log(bfqd, "measured bw =%llu", bw);

11260

++		    (bfq_bfqq_sync(bfqq) && fully_sequential && non_soft_rt &&

11261

++		     served_for_long_time)) {

11262

+ 			/*

11263

+ 			 * To smooth oscillations use a low-pass filter with

11264

+-			 * alpha=7/8, i.e.,

11265

+-			 * new_rate = (7/8) * old_rate + (1/8) * bw

11266

++			 * alpha=9/10, i.e.,

11267

++			 * new_rate = (9/10) * old_rate + (1/10) * bw

11268

+ 			 */

11269

+-			do_div(bw, 8);

11270

+-			if (bw == 0)

11271

+-				return 0;

11272

+-			bfqd->peak_rate *= 7;

11273

+-			do_div(bfqd->peak_rate, 8);

11274

+-			bfqd->peak_rate += bw;

11275

++			bwdiv10 = bw;

11276

++			do_div(bwdiv10, 10);

11277

++			if (bwdiv10 == 0)

11278

++				return false; /* bw too low to be used */

11279

++			bfqd->peak_rate *= 9;

11280

++			do_div(bfqd->peak_rate, 10);

11281

++			bfqd->peak_rate += bwdiv10;

11282

+ 			update = 1;

11283

+-			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);

11284

++			bfq_log(bfqd, "new peak_rate = %llu sects/sec",

11285

++				(1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT);

11286

+ 		}

11287

+

11288

+ 		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;

11289

+@@ -2086,9 +2598,8 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

11290

+ 			int dev_type = blk_queue_nonrot(bfqd->queue);

11291

+ 			if (bfqd->bfq_user_max_budget == 0) {

11292

+ 				bfqd->bfq_max_budget =

11293

+-					bfq_calc_max_budget(bfqd->peak_rate,

11294

+-							    timeout);

11295

+-				bfq_log(bfqd, "new max_budget=%d",

11296

++					bfq_calc_max_budget(bfqd);

11297

++				bfq_log(bfqd, "new max_budget = %d",

11298

+ 					bfqd->bfq_max_budget);

11299

+ 			}

11300

+ 			if (bfqd->device_speed == BFQ_BFQD_FAST &&

11301

+@@ -2102,38 +2613,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

11302

+ 				bfqd->RT_prod = R_fast[dev_type] *

11303

+ 						T_fast[dev_type];

11304

+ 			}

11305

++			bfq_log(bfqd, "dev_speed_class = %d (%d sects/sec), "

11306

++				"thresh %d setcs/sec",

11307

++				bfqd->device_speed,

11308

++				bfqd->device_speed == BFQ_BFQD_FAST ?

11309

++				(1000000*R_fast[dev_type])>>BFQ_RATE_SHIFT :

11310

++				(1000000*R_slow[dev_type])>>BFQ_RATE_SHIFT,

11311

++				(1000000*device_speed_thresh[dev_type])>>

11312

++				BFQ_RATE_SHIFT);

11313

+ 		}

11314

++		/*

11315

++		 * Caveat: processes doing IO in the slower disk zones

11316

++		 * tend to be slow(er) even if not seeky. In this

11317

++		 * respect, the estimated peak rate is likely to be an

11318

++		 * average over the disk surface. Accordingly, to not

11319

++		 * be too harsh with unlucky processes, a process is

11320

++		 * deemed slow only if its bw has been lower than half

11321

++		 * of the estimated peak rate.

11322

++		 */

11323

++		slow = bw < bfqd->peak_rate / 2;

11324

+ 	}

11325

+

11326

+-	/*

11327

+-	 * If the process has been served for a too short time

11328

+-	 * interval to let its possible sequential accesses prevail on

11329

+-	 * the initial seek time needed to move the disk head on the

11330

+-	 * first sector it requested, then give the process a chance

11331

+-	 * and for the moment return false.

11332

+-	 */

11333

+-	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)

11334

+-		return false;

11335

+-

11336

+-	/*

11337

+-	 * A process is considered ``slow'' (i.e., seeky, so that we

11338

+-	 * cannot treat it fairly in the service domain, as it would

11339

+-	 * slow down too much the other processes) if, when a slice

11340

+-	 * ends for whatever reason, it has received service at a

11341

+-	 * rate that would not be high enough to complete the budget

11342

+-	 * before the budget timeout expiration.

11343

+-	 */

11344

+-	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;

11345

++	bfq_log_bfqq(bfqd, bfqq,

11346

++		     "update_peak_rate: bw %llu sect/s, peak rate %llu, "

11347

++		     "slow %d",

11348

++		     (1000000*bw)>>BFQ_RATE_SHIFT,

11349

++		     (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT,

11350

++		     bw < bfqd->peak_rate / 2);

11351

+

11352

+-	/*

11353

+-	 * Caveat: processes doing IO in the slower disk zones will

11354

+-	 * tend to be slow(er) even if not seeky. And the estimated

11355

+-	 * peak rate will actually be an average over the disk

11356

+-	 * surface. Hence, to not be too harsh with unlucky processes,

11357

+-	 * we keep a budget/3 margin of safety before declaring a

11358

+-	 * process slow.

11359

+-	 */

11360

+-	return expected > (4 * bfqq->entity.budget) / 3;

11361

++	return slow;

11362

+ }

11363

+

11364

+ /*

11365

+@@ -2191,6 +2699,15 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

11366

+ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,

11367

+ 						struct bfq_queue *bfqq)

11368

+ {

11369

++	bfq_log_bfqq(bfqd, bfqq,

11370

++		     "softrt_next_start: service_blkg %lu "

11371

++		     "soft_rate %u sects/sec"

11372

++		     "interval %u",

11373

++		     bfqq->service_from_backlogged,

11374

++		     bfqd->bfq_wr_max_softrt_rate,

11375

++		     jiffies_to_msecs(HZ * bfqq->service_from_backlogged /

11376

++				      bfqd->bfq_wr_max_softrt_rate));

11377

++

11378

+ 	return max(bfqq->last_idle_bklogged +

11379

+ 		   HZ * bfqq->service_from_backlogged /

11380

+ 		   bfqd->bfq_wr_max_softrt_rate,

11381

+@@ -2198,13 +2715,21 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,

11382

+ }

11383

+

11384

+ /*

11385

+- * Return the largest-possible time instant such that, for as long as possible,

11386

+- * the current time will be lower than this time instant according to the macro

11387

+- * time_is_before_jiffies().

11388

++ * Return the farthest future time instant according to jiffies

11389

++ * macros.

11390

++ */

11391

++static unsigned long bfq_greatest_from_now(void)

11392

++{

11393

++	return jiffies + MAX_JIFFY_OFFSET;

11394

++}

11395

++

11396

++/*

11397

++ * Return the farthest past time instant according to jiffies

11398

++ * macros.

11399

+  */

11400

+-static unsigned long bfq_infinity_from_now(unsigned long now)

11401

++static unsigned long bfq_smallest_from_now(void)

11402

+ {

11403

+-	return now + ULONG_MAX / 2;

11404

++	return jiffies - MAX_JIFFY_OFFSET;

11405

+ }

11406

+

11407

+ /**

11408

+@@ -2214,28 +2739,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now)

11409

+  * @compensate: if true, compensate for the time spent idling.

11410

+  * @reason: the reason causing the expiration.

11411

+  *

11412

++ * If the process associated with bfqq does slow I/O (e.g., because it

11413

++ * issues random requests), we charge bfqq with the time it has been

11414

++ * in service instead of the service it has received (see

11415

++ * bfq_bfqq_charge_time for details on how this goal is achieved). As

11416

++ * a consequence, bfqq will typically get higher timestamps upon

11417

++ * reactivation, and hence it will be rescheduled as if it had

11418

++ * received more service than what it has actually received. In the

11419

++ * end, bfqq receives less service in proportion to how slowly its

11420

++ * associated process consumes its budgets (and hence how seriously it

11421

++ * tends to lower the throughput). In addition, this time-charging

11422

++ * strategy guarantees time fairness among slow processes. In

11423

++ * contrast, if the process associated with bfqq is not slow, we

11424

++ * charge bfqq exactly with the service it has received.

11425

+  *

11426

+- * If the process associated to the queue is slow (i.e., seeky), or in

11427

+- * case of budget timeout, or, finally, if it is async, we

11428

+- * artificially charge it an entire budget (independently of the

11429

+- * actual service it received). As a consequence, the queue will get

11430

+- * higher timestamps than the correct ones upon reactivation, and

11431

+- * hence it will be rescheduled as if it had received more service

11432

+- * than what it actually received. In the end, this class of processes

11433

+- * will receive less service in proportion to how slowly they consume

11434

+- * their budgets (and hence how seriously they tend to lower the

11435

+- * throughput).

11436

+- *

11437

+- * In contrast, when a queue expires because it has been idling for

11438

+- * too much or because it exhausted its budget, we do not touch the

11439

+- * amount of service it has received. Hence when the queue will be

11440

+- * reactivated and its timestamps updated, the latter will be in sync

11441

+- * with the actual service received by the queue until expiration.

11442

+- *

11443

+- * Charging a full budget to the first type of queues and the exact

11444

+- * service to the others has the effect of using the WF2Q+ policy to

11445

+- * schedule the former on a timeslice basis, without violating the

11446

+- * service domain guarantees of the latter.

11447

++ * Charging time to the first type of queues and the exact service to

11448

++ * the other has the effect of using the WF2Q+ policy to schedule the

11449

++ * former on a timeslice basis, without violating service domain

11450

++ * guarantees among the latter.

11451

+  */

11452

+ static void bfq_bfqq_expire(struct bfq_data *bfqd,

11453

+ 			    struct bfq_queue *bfqq,

11454

+@@ -2243,40 +2764,51 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,

11455

+ 			    enum bfqq_expiration reason)

11456

+ {

11457

+ 	bool slow;

11458

++	unsigned long delta = 0;

11459

++	struct bfq_entity *entity = &bfqq->entity;

11460

++

11461

+ 	BUG_ON(bfqq != bfqd->in_service_queue);

11462

+

11463

+ 	/*

11464

+-	 * Update disk peak rate for autotuning and check whether the

11465

++	 * Update device peak rate for autotuning and check whether the

11466

+ 	 * process is slow (see bfq_update_peak_rate).

11467

+ 	 */

11468

+-	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);

11469

++	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason, &delta);

11470

+

11471

+ 	/*

11472

+-	 * As above explained, 'punish' slow (i.e., seeky), timed-out

11473

+-	 * and async queues, to favor sequential sync workloads.

11474

+-	 *

11475

+-	 * Processes doing I/O in the slower disk zones will tend to be

11476

+-	 * slow(er) even if not seeky. Hence, since the estimated peak

11477

+-	 * rate is actually an average over the disk surface, these

11478

+-	 * processes may timeout just for bad luck. To avoid punishing

11479

+-	 * them we do not charge a full budget to a process that

11480

+-	 * succeeded in consuming at least 2/3 of its budget.

11481

++	 * Increase service_from_backlogged before next statement,

11482

++	 * because the possible next invocation of

11483

++	 * bfq_bfqq_charge_time would likely inflate

11484

++	 * entity->service. In contrast, service_from_backlogged must

11485

++	 * contain real service, to enable the soft real-time

11486

++	 * heuristic to correctly compute the bandwidth consumed by

11487

++	 * bfqq.

11488

+ 	 */

11489

+-	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

11490

+-		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))

11491

+-		bfq_bfqq_charge_full_budget(bfqq);

11492

++	bfqq->service_from_backlogged += entity->service;

11493

+

11494

+-	bfqq->service_from_backlogged += bfqq->entity.service;

11495

+-

11496

+-	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

11497

+-	    !bfq_bfqq_constantly_seeky(bfqq)) {

11498

+-		bfq_mark_bfqq_constantly_seeky(bfqq);

11499

+-		if (!blk_queue_nonrot(bfqd->queue))

11500

+-			bfqd->const_seeky_busy_in_flight_queues++;

11501

+-	}

11502

++	/*

11503

++	 * As explained above, charge slow (typically seeky) and

11504

++	 * timed-out queues with the time and not the service

11505

++	 * received, to favor sequential workloads.

11506

++	 *

11507

++	 * Processes doing I/O in the slower disk zones will tend to

11508

++	 * be slow(er) even if not seeky. Therefore, since the

11509

++	 * estimated peak rate is actually an average over the disk

11510

++	 * surface, these processes may timeout just for bad luck. To

11511

++	 * avoid punishing them, do not charge time to processes that

11512

++	 * succeeded in consuming at least 2/3 of their budget. This

11513

++	 * allows BFQ to preserve enough elasticity to still perform

11514

++	 * bandwidth, and not time, distribution with little unlucky

11515

++	 * or quasi-sequential processes.

11516

++	 */

11517

++	if (bfqq->wr_coeff == 1 &&

11518

++	    (slow ||

11519

++	     (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

11520

++	      bfq_bfqq_budget_left(bfqq) >=  entity->budget / 3)))

11521

++		bfq_bfqq_charge_time(bfqd, bfqq, delta);

11522

+

11523

+ 	if (reason == BFQ_BFQQ_TOO_IDLE &&

11524

+-	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10 )

11525

++	    entity->service <= 2 * entity->budget / 10 )

11526

+ 		bfq_clear_bfqq_IO_bound(bfqq);

11527

+

11528

+ 	if (bfqd->low_latency && bfqq->wr_coeff == 1)

11529

+@@ -2285,19 +2817,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,

11530

+ 	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&

11531

+ 	    RB_EMPTY_ROOT(&bfqq->sort_list)) {

11532

+ 		/*

11533

+-		 * If we get here, and there are no outstanding requests,

11534

+-		 * then the request pattern is isochronous (see the comments

11535

+-		 * to the function bfq_bfqq_softrt_next_start()). Hence we

11536

+-		 * can compute soft_rt_next_start. If, instead, the queue

11537

+-		 * still has outstanding requests, then we have to wait

11538

+-		 * for the completion of all the outstanding requests to

11539

++		 * If we get here, and there are no outstanding

11540

++		 * requests, then the request pattern is isochronous

11541

++		 * (see the comments on the function

11542

++		 * bfq_bfqq_softrt_next_start()). Thus we can compute

11543

++		 * soft_rt_next_start. If, instead, the queue still

11544

++		 * has outstanding requests, then we have to wait for

11545

++		 * the completion of all the outstanding requests to

11546

+ 		 * discover whether the request pattern is actually

11547

+ 		 * isochronous.

11548

+ 		 */

11549

+-		if (bfqq->dispatched == 0)

11550

++		BUG_ON(bfqd->busy_queues < 1);

11551

++		if (bfqq->dispatched == 0) {

11552

+ 			bfqq->soft_rt_next_start =

11553

+ 				bfq_bfqq_softrt_next_start(bfqd, bfqq);

11554

+-		else {

11555

++			bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu",

11556

++				     bfqq->soft_rt_next_start);

11557

++		} else {

11558

+ 			/*

11559

+ 			 * The application is still waiting for the

11560

+ 			 * completion of one or more requests:

11561

+@@ -2314,7 +2850,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,

11562

+ 			 *    happened to be in the past.

11563

+ 			 */

11564

+ 			bfqq->soft_rt_next_start =

11565

+-				bfq_infinity_from_now(jiffies);

11566

++				bfq_greatest_from_now();

11567

+ 			/*

11568

+ 			 * Schedule an update of soft_rt_next_start to when

11569

+ 			 * the task may be discovered to be isochronous.

11570

+@@ -2324,8 +2860,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,

11571

+ 	}

11572

+

11573

+ 	bfq_log_bfqq(bfqd, bfqq,

11574

+-		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,

11575

+-		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

11576

++		"expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)",

11577

++		     reason, slow, bfqq->dispatched,

11578

++		     bfq_bfqq_idle_window(bfqq), entity->weight);

11579

+

11580

+ 	/*

11581

+ 	 * Increase, decrease or leave budget unchanged according to

11582

+@@ -2333,6 +2870,14 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,

11583

+ 	 */

11584

+ 	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);

11585

+ 	__bfq_bfqq_expire(bfqd, bfqq);

11586

++

11587

++	BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&

11588

++		!bfq_class_idle(bfqq));

11589

++

11590

++	if (!bfq_bfqq_busy(bfqq) &&

11591

++	    reason != BFQ_BFQQ_BUDGET_TIMEOUT &&

11592

++	    reason != BFQ_BFQQ_BUDGET_EXHAUSTED)

11593

++		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);

11594

+ }

11595

+

11596

+ /*

11597

+@@ -2342,20 +2887,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,

11598

+  */

11599

+ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)

11600

+ {

11601

+-	if (bfq_bfqq_budget_new(bfqq) ||

11602

+-	    time_before(jiffies, bfqq->budget_timeout))

11603

+-		return false;

11604

+-	return true;

11605

++	return time_is_before_eq_jiffies(bfqq->budget_timeout);

11606

+ }

11607

+

11608

+ /*

11609

+- * If we expire a queue that is waiting for the arrival of a new

11610

+- * request, we may prevent the fictitious timestamp back-shifting that

11611

+- * allows the guarantees of the queue to be preserved (see [1] for

11612

+- * this tricky aspect). Hence we return true only if this condition

11613

+- * does not hold, or if the queue is slow enough to deserve only to be

11614

+- * kicked off for preserving a high throughput.

11615

+-*/

11616

++ * If we expire a queue that is actively waiting (i.e., with the

11617

++ * device idled) for the arrival of a new request, then we may incur

11618

++ * the timestamp misalignment problem described in the body of the

11619

++ * function __bfq_activate_entity. Hence we return true only if this

11620

++ * condition does not hold, or if the queue is slow enough to deserve

11621

++ * only to be kicked off for preserving a high throughput.

11622

++ */

11623

+ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)

11624

+ {

11625

+ 	bfq_log_bfqq(bfqq->bfqd, bfqq,

11626

+@@ -2397,10 +2939,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

11627

+ {

11628

+ 	struct bfq_data *bfqd = bfqq->bfqd;

11629

+ 	bool idling_boosts_thr, idling_boosts_thr_without_issues,

11630

+-		all_queues_seeky, on_hdd_and_not_all_queues_seeky,

11631

+ 		idling_needed_for_service_guarantees,

11632

+ 		asymmetric_scenario;

11633

+

11634

++	if (bfqd->strict_guarantees)

11635

++		return true;

11636

++

11637

+ 	/*

11638

+ 	 * The next variable takes into account the cases where idling

11639

+ 	 * boosts the throughput.

11640

+@@ -2422,7 +2966,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

11641

+ 	 */

11642

+ 	idling_boosts_thr = !bfqd->hw_tag ||

11643

+ 		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&

11644

+-		 bfq_bfqq_idle_window(bfqq)) ;

11645

++		 bfq_bfqq_idle_window(bfqq));

11646

+

11647

+ 	/*

11648

+ 	 * The value of the next variable,

11649

+@@ -2463,74 +3007,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

11650

+ 		bfqd->wr_busy_queues == 0;

11651

+

11652

+ 	/*

11653

+-	 * There are then two cases where idling must be performed not

11654

++	 * There is then a case where idling must be performed not

11655

+ 	 * for throughput concerns, but to preserve service

11656

+-	 * guarantees. In the description of these cases, we say, for

11657

+-	 * short, that a queue is sequential/random if the process

11658

+-	 * associated to the queue issues sequential/random requests

11659

+-	 * (in the second case the queue may be tagged as seeky or

11660

+-	 * even constantly_seeky).

11661

+-	 *

11662

+-	 * To introduce the first case, we note that, since

11663

+-	 * bfq_bfqq_idle_window(bfqq) is false if the device is

11664

+-	 * NCQ-capable and bfqq is random (see

11665

+-	 * bfq_update_idle_window()), then, from the above two

11666

+-	 * assignments it follows that

11667

+-	 * idling_boosts_thr_without_issues is false if the device is

11668

+-	 * NCQ-capable and bfqq is random. Therefore, for this case,

11669

+-	 * device idling would never be allowed if we used just

11670

+-	 * idling_boosts_thr_without_issues to decide whether to allow

11671

+-	 * it. And, beneficially, this would imply that throughput

11672

+-	 * would always be boosted also with random I/O on NCQ-capable

11673

+-	 * HDDs.

11674

+-	 *

11675

+-	 * But we must be careful on this point, to avoid an unfair

11676

+-	 * treatment for bfqq. In fact, because of the same above

11677

+-	 * assignments, idling_boosts_thr_without_issues is, on the

11678

+-	 * other hand, true if 1) the device is an HDD and bfqq is

11679

+-	 * sequential, and 2) there are no busy weight-raised

11680

+-	 * queues. As a consequence, if we used just

11681

+-	 * idling_boosts_thr_without_issues to decide whether to idle

11682

+-	 * the device, then with an HDD we might easily bump into a

11683

+-	 * scenario where queues that are sequential and I/O-bound

11684

+-	 * would enjoy idling, whereas random queues would not. The

11685

+-	 * latter might then get a low share of the device throughput,

11686

+-	 * simply because the former would get many requests served

11687

+-	 * after being set as in service, while the latter would not.

11688

++	 * guarantees.

11689

+ 	 *

11690

+-	 * To address this issue, we start by setting to true a

11691

+-	 * sentinel variable, on_hdd_and_not_all_queues_seeky, if the

11692

+-	 * device is rotational and not all queues with pending or

11693

+-	 * in-flight requests are constantly seeky (i.e., there are

11694

+-	 * active sequential queues, and bfqq might then be mistreated

11695

+-	 * if it does not enjoy idling because it is random).

11696

+-	 */

11697

+-	all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&

11698

+-			   bfqd->busy_in_flight_queues ==

11699

+-			   bfqd->const_seeky_busy_in_flight_queues;

11700

+-

11701

+-	on_hdd_and_not_all_queues_seeky =

11702

+-		!blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;

11703

+-

11704

+-	/*

11705

+-	 * To introduce the second case where idling needs to be

11706

+-	 * performed to preserve service guarantees, we can note that

11707

+-	 * allowing the drive to enqueue more than one request at a

11708

+-	 * time, and hence delegating de facto final scheduling

11709

+-	 * decisions to the drive's internal scheduler, causes loss of

11710

+-	 * control on the actual request service order. In particular,

11711

+-	 * the critical situation is when requests from different

11712

+-	 * processes happens to be present, at the same time, in the

11713

+-	 * internal queue(s) of the drive. In such a situation, the

11714

+-	 * drive, by deciding the service order of the

11715

+-	 * internally-queued requests, does determine also the actual

11716

+-	 * throughput distribution among these processes. But the

11717

+-	 * drive typically has no notion or concern about per-process

11718

+-	 * throughput distribution, and makes its decisions only on a

11719

+-	 * per-request basis. Therefore, the service distribution

11720

+-	 * enforced by the drive's internal scheduler is likely to

11721

+-	 * coincide with the desired device-throughput distribution

11722

+-	 * only in a completely symmetric scenario where:

11723

++	 * To introduce this case, we can note that allowing the drive

11724

++	 * to enqueue more than one request at a time, and hence

11725

++	 * delegating de facto final scheduling decisions to the

11726

++	 * drive's internal scheduler, entails loss of control on the

11727

++	 * actual request service order. In particular, the critical

11728

++	 * situation is when requests from different processes happen

11729

++	 * to be present, at the same time, in the internal queue(s)

11730

++	 * of the drive. In such a situation, the drive, by deciding

11731

++	 * the service order of the internally-queued requests, does

11732

++	 * determine also the actual throughput distribution among

11733

++	 * these processes. But the drive typically has no notion or

11734

++	 * concern about per-process throughput distribution, and

11735

++	 * makes its decisions only on a per-request basis. Therefore,

11736

++	 * the service distribution enforced by the drive's internal

11737

++	 * scheduler is likely to coincide with the desired

11738

++	 * device-throughput distribution only in a completely

11739

++	 * symmetric scenario where:

11740

+ 	 * (i)  each of these processes must get the same throughput as

11741

+ 	 *      the others;

11742

+ 	 * (ii) all these processes have the same I/O pattern

11743

+@@ -2552,26 +3049,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

11744

+ 	 * words, only if sub-condition (i) holds, then idling is

11745

+ 	 * allowed, and the device tends to be prevented from queueing

11746

+ 	 * many requests, possibly of several processes. The reason

11747

+-	 * for not controlling also sub-condition (ii) is that, first,

11748

+-	 * in the case of an HDD, the asymmetry in terms of types of

11749

+-	 * I/O patterns is already taken in to account in the above

11750

+-	 * sentinel variable

11751

+-	 * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a

11752

+-	 * flash-based device, we prefer however to privilege

11753

+-	 * throughput (and idling lowers throughput for this type of

11754

+-	 * devices), for the following reasons:

11755

+-	 * 1) differently from HDDs, the service time of random

11756

+-	 *    requests is not orders of magnitudes lower than the service

11757

+-	 *    time of sequential requests; thus, even if processes doing

11758

+-	 *    sequential I/O get a preferential treatment with respect to

11759

+-	 *    others doing random I/O, the consequences are not as

11760

+-	 *    dramatic as with HDDs;

11761

+-	 * 2) if a process doing random I/O does need strong

11762

+-	 *    throughput guarantees, it is hopefully already being

11763

+-	 *    weight-raised, or the user is likely to have assigned it a

11764

+-	 *    higher weight than the other processes (and thus

11765

+-	 *    sub-condition (i) is likely to be false, which triggers

11766

+-	 *    idling).

11767

++	 * for not controlling also sub-condition (ii) is that we

11768

++	 * exploit preemption to preserve guarantees in case of

11769

++	 * symmetric scenarios, even if (ii) does not hold, as

11770

++	 * explained in the next two paragraphs.

11771

++	 *

11772

++	 * Even if a queue, say Q, is expired when it remains idle, Q

11773

++	 * can still preempt the new in-service queue if the next

11774

++	 * request of Q arrives soon (see the comments on

11775

++	 * bfq_bfqq_update_budg_for_activation). If all queues and

11776

++	 * groups have the same weight, this form of preemption,

11777

++	 * combined with the hole-recovery heuristic described in the

11778

++	 * comments on function bfq_bfqq_update_budg_for_activation,

11779

++	 * is enough to preserve a correct bandwidth distribution in

11780

++	 * the mid term, even without idling. In fact, even if not

11781

++	 * idling allows the internal queues of the device to contain

11782

++	 * many requests, and thus to reorder requests, we can rather

11783

++	 * safely assume that the internal scheduler still preserves a

11784

++	 * minimum of mid-term fairness. The motivation for using

11785

++	 * preemption instead of idling is that, by not idling,

11786

++	 * service guarantees are preserved without minimally

11787

++	 * sacrificing throughput. In other words, both a high

11788

++	 * throughput and its desired distribution are obtained.

11789

++	 *

11790

++	 * More precisely, this preemption-based, idleless approach

11791

++	 * provides fairness in terms of IOPS, and not sectors per

11792

++	 * second. This can be seen with a simple example. Suppose

11793

++	 * that there are two queues with the same weight, but that

11794

++	 * the first queue receives requests of 8 sectors, while the

11795

++	 * second queue receives requests of 1024 sectors. In

11796

++	 * addition, suppose that each of the two queues contains at

11797

++	 * most one request at a time, which implies that each queue

11798

++	 * always remains idle after it is served. Finally, after

11799

++	 * remaining idle, each queue receives very quickly a new

11800

++	 * request. It follows that the two queues are served

11801

++	 * alternately, preempting each other if needed. This

11802

++	 * implies that, although both queues have the same weight,

11803

++	 * the queue with large requests receives a service that is

11804

++	 * 1024/8 times as high as the service received by the other

11805

++	 * queue.

11806

++	 *

11807

++	 * On the other hand, device idling is performed, and thus

11808

++	 * pure sector-domain guarantees are provided, for the

11809

++	 * following queues, which are likely to need stronger

11810

++	 * throughput guarantees: weight-raised queues, and queues

11811

++	 * with a higher weight than other queues. When such queues

11812

++	 * are active, sub-condition (i) is false, which triggers

11813

++	 * device idling.

11814

+ 	 *

11815

+ 	 * According to the above considerations, the next variable is

11816

+ 	 * true (only) if sub-condition (i) holds. To compute the

11817

+@@ -2579,7 +3103,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

11818

+ 	 * the function bfq_symmetric_scenario(), but also check

11819

+ 	 * whether bfqq is being weight-raised, because

11820

+ 	 * bfq_symmetric_scenario() does not take into account also

11821

+-	 * weight-raised queues (see comments to

11822

++	 * weight-raised queues (see comments on

11823

+ 	 * bfq_weights_tree_add()).

11824

+ 	 *

11825

+ 	 * As a side note, it is worth considering that the above

11826

+@@ -2601,17 +3125,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

11827

+ 	 * bfqq. Such a case is when bfqq became active in a burst of

11828

+ 	 * queue activations. Queues that became active during a large

11829

+ 	 * burst benefit only from throughput, as discussed in the

11830

+-	 * comments to bfq_handle_burst. Thus, if bfqq became active

11831

++	 * comments on bfq_handle_burst. Thus, if bfqq became active

11832

+ 	 * in a burst and not idling the device maximizes throughput,

11833

+ 	 * then the device must no be idled, because not idling the

11834

+ 	 * device provides bfqq and all other queues in the burst with

11835

+-	 * maximum benefit. Combining this and the two cases above, we

11836

+-	 * can now establish when idling is actually needed to

11837

+-	 * preserve service guarantees.

11838

++	 * maximum benefit. Combining this and the above case, we can

11839

++	 * now establish when idling is actually needed to preserve

11840

++	 * service guarantees.

11841

+ 	 */

11842

+ 	idling_needed_for_service_guarantees =

11843

+-		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&

11844

+-		!bfq_bfqq_in_large_burst(bfqq);

11845

++		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);

11846

+

11847

+ 	/*

11848

+ 	 * We have now all the components we need to compute the return

11849

+@@ -2621,6 +3144,14 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

11850

+ 	 * 2) idling either boosts the throughput (without issues), or

11851

+ 	 *    is necessary to preserve service guarantees.

11852

+ 	 */

11853

++	bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d "

11854

++		     "wr_busy %d boosts %d IO-bound %d guar %d",

11855

++		     bfq_bfqq_sync(bfqq), idling_boosts_thr,

11856

++		     bfqd->wr_busy_queues,

11857

++		     idling_boosts_thr_without_issues,

11858

++		     bfq_bfqq_IO_bound(bfqq),

11859

++		     idling_needed_for_service_guarantees);

11860

++

11861

+ 	return bfq_bfqq_sync(bfqq) &&

11862

+ 		(idling_boosts_thr_without_issues ||

11863

+ 		 idling_needed_for_service_guarantees);

11864

+@@ -2632,7 +3163,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

11865

+  * 1) the queue must remain in service and cannot be expired, and

11866

+  * 2) the device must be idled to wait for the possible arrival of a new

11867

+  *    request for the queue.

11868

+- * See the comments to the function bfq_bfqq_may_idle for the reasons

11869

++ * See the comments on the function bfq_bfqq_may_idle for the reasons

11870

+  * why performing device idling is the best choice to boost the throughput

11871

+  * and preserve service guarantees when bfq_bfqq_may_idle itself

11872

+  * returns true.

11873

+@@ -2698,9 +3229,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

11874

+ 				 */

11875

+ 				bfq_clear_bfqq_wait_request(bfqq);

11876

+ 				del_timer(&bfqd->idle_slice_timer);

11877

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

11878

+ 				bfqg_stats_update_idle_time(bfqq_group(bfqq));

11879

+-#endif

11880

+ 			}

11881

+ 			goto keep_queue;

11882

+ 		}

11883

+@@ -2745,14 +3274,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)

11884

+ 			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

11885

+

11886

+ 		/*

11887

+-		 * If the queue was activated in a burst, or

11888

+-		 * too much time has elapsed from the beginning

11889

+-		 * of this weight-raising period, or the queue has

11890

+-		 * exceeded the acceptable number of cooperations,

11891

+-		 * then end weight raising.

11892

++		 * If the queue was activated in a burst, or too much

11893

++		 * time has elapsed from the beginning of this

11894

++		 * weight-raising period, then end weight raising.

11895

+ 		 */

11896

+ 		if (bfq_bfqq_in_large_burst(bfqq) ||

11897

+-		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||

11898

+ 		    time_is_before_jiffies(bfqq->last_wr_start_finish +

11899

+ 					   bfqq->wr_cur_max_time)) {

11900

+ 			bfqq->last_wr_start_finish = jiffies;

11901

+@@ -2814,10 +3340,25 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,

11902

+ 		goto expire;

11903

+ 	}

11904

+

11905

++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);

11906

+ 	/* Finally, insert request into driver dispatch list. */

11907

+ 	bfq_bfqq_served(bfqq, service_to_charge);

11908

++

11909

++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);

11910

++

11911

+ 	bfq_dispatch_insert(bfqd->queue, rq);

11912

+

11913

++	/*

11914

++	 * If weight raising has to terminate for bfqq, then next

11915

++	 * function causes an immediate update of bfqq's weight,

11916

++	 * without waiting for next activation. As a consequence, on

11917

++	 * expiration, bfqq will be timestamped as if it has never been

11918

++	 * weight-raised during this service slot, even if it has

11919

++	 * received part or even most of the service as a

11920

++	 * weight-raised queue. This inflates bfqq's timestamps, which

11921

++	 * is beneficial, as bfqq is then more willing to leave the

11922

++	 * device immediately to possible other weight-raised queues.

11923

++	 */

11924

+ 	bfq_update_wr_data(bfqd, bfqq);

11925

+

11926

+ 	bfq_log_bfqq(bfqd, bfqq,

11927

+@@ -2833,9 +3374,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,

11928

+ 		bfqd->in_service_bic = RQ_BIC(rq);

11929

+ 	}

11930

+

11931

+-	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&

11932

+-	    dispatched >= bfqd->bfq_max_budget_async_rq) ||

11933

+-	    bfq_class_idle(bfqq)))

11934

++	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))

11935

+ 		goto expire;

11936

+

11937

+ 	return dispatched;

11938

+@@ -2881,8 +3420,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd)

11939

+ 		st = bfq_entity_service_tree(&bfqq->entity);

11940

+

11941

+ 		dispatched += __bfq_forced_dispatch_bfqq(bfqq);

11942

+-		bfqq->max_budget = bfq_max_budget(bfqd);

11943

+

11944

++		bfqq->max_budget = bfq_max_budget(bfqd);

11945

+ 		bfq_forget_idle(st);

11946

+ 	}

11947

+

11948

+@@ -2895,9 +3434,9 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)

11949

+ {

11950

+ 	struct bfq_data *bfqd = q->elevator->elevator_data;

11951

+ 	struct bfq_queue *bfqq;

11952

+-	int max_dispatch;

11953

+

11954

+ 	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

11955

++

11956

+ 	if (bfqd->busy_queues == 0)

11957

+ 		return 0;

11958

+

11959

+@@ -2908,21 +3447,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)

11960

+ 	if (!bfqq)

11961

+ 		return 0;

11962

+

11963

+-	if (bfq_class_idle(bfqq))

11964

+-		max_dispatch = 1;

11965

+-

11966

+-	if (!bfq_bfqq_sync(bfqq))

11967

+-		max_dispatch = bfqd->bfq_max_budget_async_rq;

11968

+-

11969

+-	if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {

11970

+-		if (bfqd->busy_queues > 1)

11971

+-			return 0;

11972

+-		if (bfqq->dispatched >= 4 * max_dispatch)

11973

+-			return 0;

11974

+-	}

11975

+-

11976

+-	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))

11977

+-		return 0;

11978

++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);

11979

+

11980

+ 	bfq_clear_bfqq_wait_request(bfqq);

11981

+ 	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

11982

+@@ -2933,6 +3458,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)

11983

+ 	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",

11984

+ 			bfq_bfqq_sync(bfqq) ? "sync" : "async");

11985

+

11986

++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);

11987

+ 	return 1;

11988

+ }

11989

+

11990

+@@ -2949,11 +3475,11 @@ static void bfq_put_queue(struct bfq_queue *bfqq)

11991

+ 	struct bfq_group *bfqg = bfqq_group(bfqq);

11992

+ #endif

11993

+

11994

+-	BUG_ON(atomic_read(&bfqq->ref) <= 0);

11995

++	BUG_ON(bfqq->ref <= 0);

11996

+

11997

+-	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,

11998

+-		     atomic_read(&bfqq->ref));

11999

+-	if (!atomic_dec_and_test(&bfqq->ref))

12000

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);

12001

++	bfqq->ref--;

12002

++	if (bfqq->ref)

12003

+ 		return;

12004

+

12005

+ 	BUG_ON(rb_first(&bfqq->sort_list));

12006

+@@ -3007,8 +3533,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

12007

+ 		bfq_schedule_dispatch(bfqd);

12008

+ 	}

12009

+

12010

+-	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,

12011

+-		     atomic_read(&bfqq->ref));

12012

++	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

12013

+

12014

+ 	bfq_put_cooperator(bfqq);

12015

+

12016

+@@ -3019,26 +3544,7 @@ static void bfq_init_icq(struct io_cq *icq)

12017

+ {

12018

+ 	struct bfq_io_cq *bic = icq_to_bic(icq);

12019

+

12020

+-	bic->ttime.last_end_request = jiffies;

12021

+-	/*

12022

+-	 * A newly created bic indicates that the process has just

12023

+-	 * started doing I/O, and is probably mapping into memory its

12024

+-	 * executable and libraries: it definitely needs weight raising.

12025

+-	 * There is however the possibility that the process performs,

12026

+-	 * for a while, I/O close to some other process. EQM intercepts

12027

+-	 * this behavior and may merge the queue corresponding to the

12028

+-	 * process  with some other queue, BEFORE the weight of the queue

12029

+-	 * is raised. Merged queues are not weight-raised (they are assumed

12030

+-	 * to belong to processes that benefit only from high throughput).

12031

+-	 * If the merge is basically the consequence of an accident, then

12032

+-	 * the queue will be split soon and will get back its old weight.

12033

+-	 * It is then important to write down somewhere that this queue

12034

+-	 * does need weight raising, even if it did not make it to get its

12035

+-	 * weight raised before being merged. To this purpose, we overload

12036

+-	 * the field raising_time_left and assign 1 to it, to mark the queue

12037

+-	 * as needing weight raising.

12038

+-	 */

12039

+-	bic->wr_time_left = 1;

12040

++	bic->ttime.last_end_request = bfq_smallest_from_now();

12041

+ }

12042

+

12043

+ static void bfq_exit_icq(struct io_cq *icq)

12044

+@@ -3046,21 +3552,21 @@ static void bfq_exit_icq(struct io_cq *icq)

12045

+ 	struct bfq_io_cq *bic = icq_to_bic(icq);

12046

+ 	struct bfq_data *bfqd = bic_to_bfqd(bic);

12047

+

12048

+-	if (bic->bfqq[BLK_RW_ASYNC]) {

12049

+-		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);

12050

+-		bic->bfqq[BLK_RW_ASYNC] = NULL;

12051

++	if (bic_to_bfqq(bic, false)) {

12052

++		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));

12053

++		bic_set_bfqq(bic, NULL, false);

12054

+ 	}

12055

+

12056

+-	if (bic->bfqq[BLK_RW_SYNC]) {

12057

++	if (bic_to_bfqq(bic, true)) {

12058

+ 		/*

12059

+ 		 * If the bic is using a shared queue, put the reference

12060

+ 		 * taken on the io_context when the bic started using a

12061

+ 		 * shared bfq_queue.

12062

+ 		 */

12063

+-		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))

12064

++		if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))

12065

+ 			put_io_context(icq->ioc);

12066

+-		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

12067

+-		bic->bfqq[BLK_RW_SYNC] = NULL;

12068

++		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));

12069

++		bic_set_bfqq(bic, NULL, true);

12070

+ 	}

12071

+ }

12072

+

12073

+@@ -3068,7 +3574,8 @@ static void bfq_exit_icq(struct io_cq *icq)

12074

+  * Update the entity prio values; note that the new values will not

12075

+  * be used until the next (re)activation.

12076

+  */

12077

+-static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

12078

++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,

12079

++				     struct bfq_io_cq *bic)

12080

+ {

12081

+ 	struct task_struct *tsk = current;

12082

+ 	int ioprio_class;

12083

+@@ -3100,7 +3607,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b

12084

+ 		break;

12085

+ 	}

12086

+

12087

+-	if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) {

12088

++	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {

12089

+ 		printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",

12090

+ 				 bfqq->new_ioprio);

12091

+ 		BUG();

12092

+@@ -3108,45 +3615,40 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b

12093

+

12094

+ 	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);

12095

+ 	bfqq->entity.prio_changed = 1;

12096

++	bfq_log_bfqq(bfqq->bfqd, bfqq,

12097

++		     "set_next_ioprio_data: bic_class %d prio %d class %d",

12098

++		     ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class);

12099

+ }

12100

+

12101

+ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)

12102

+ {

12103

+-	struct bfq_data *bfqd;

12104

+-	struct bfq_queue *bfqq, *new_bfqq;

12105

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

12106

++	struct bfq_queue *bfqq;

12107

+ 	unsigned long uninitialized_var(flags);

12108

+ 	int ioprio = bic->icq.ioc->ioprio;

12109

+

12110

+-	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

12111

+-				   &flags);

12112

+ 	/*

12113

+ 	 * This condition may trigger on a newly created bic, be sure to

12114

+ 	 * drop the lock before returning.

12115

+ 	 */

12116

+ 	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))

12117

+-		goto out;

12118

++		return;

12119

+

12120

+ 	bic->ioprio = ioprio;

12121

+

12122

+-	bfqq = bic->bfqq[BLK_RW_ASYNC];

12123

++	bfqq = bic_to_bfqq(bic, false);

12124

+ 	if (bfqq) {

12125

+-		new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic,

12126

+-					 GFP_ATOMIC);

12127

+-		if (new_bfqq) {

12128

+-			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;

12129

+-			bfq_log_bfqq(bfqd, bfqq,

12130

+-				     "check_ioprio_change: bfqq %p %d",

12131

+-				     bfqq, atomic_read(&bfqq->ref));

12132

+-			bfq_put_queue(bfqq);

12133

+-		}

12134

++		bfq_put_queue(bfqq);

12135

++		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);

12136

++		bic_set_bfqq(bic, bfqq, false);

12137

++		bfq_log_bfqq(bfqd, bfqq,

12138

++			     "check_ioprio_change: bfqq %p %d",

12139

++			     bfqq, bfqq->ref);

12140

+ 	}

12141

+

12142

+-	bfqq = bic->bfqq[BLK_RW_SYNC];

12143

++	bfqq = bic_to_bfqq(bic, true);

12144

+ 	if (bfqq)

12145

+ 		bfq_set_next_ioprio_data(bfqq, bic);

12146

+-

12147

+-out:

12148

+-	bfq_put_bfqd_unlock(bfqd, &flags);

12149

+ }

12150

+

12151

+ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

12152

+@@ -3155,8 +3657,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

12153

+ 	RB_CLEAR_NODE(&bfqq->entity.rb_node);

12154

+ 	INIT_LIST_HEAD(&bfqq->fifo);

12155

+ 	INIT_HLIST_NODE(&bfqq->burst_list_node);

12156

++	BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));

12157

+

12158

+-	atomic_set(&bfqq->ref, 0);

12159

++	bfqq->ref = 0;

12160

+ 	bfqq->bfqd = bfqd;

12161

+

12162

+ 	if (bic)

12163

+@@ -3166,6 +3669,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

12164

+ 		if (!bfq_class_idle(bfqq))

12165

+ 			bfq_mark_bfqq_idle_window(bfqq);

12166

+ 		bfq_mark_bfqq_sync(bfqq);

12167

++		bfq_mark_bfqq_just_created(bfqq);

12168

+ 	} else

12169

+ 		bfq_clear_bfqq_sync(bfqq);

12170

+ 	bfq_mark_bfqq_IO_bound(bfqq);

12171

+@@ -3175,72 +3679,17 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

12172

+ 	bfqq->pid = pid;

12173

+

12174

+ 	bfqq->wr_coeff = 1;

12175

+-	bfqq->last_wr_start_finish = 0;

12176

++	bfqq->last_wr_start_finish = bfq_smallest_from_now();

12177

++	bfqq->budget_timeout = bfq_smallest_from_now();

12178

++	bfqq->split_time = bfq_smallest_from_now();

12179

+ 	/*

12180

+ 	 * Set to the value for which bfqq will not be deemed as

12181

+ 	 * soft rt when it becomes backlogged.

12182

+ 	 */

12183

+-	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);

12184

+-}

12185

+-

12186

+-static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,

12187

+-					      struct bio *bio, int is_sync,

12188

+-					      struct bfq_io_cq *bic,

12189

+-					      gfp_t gfp_mask)

12190

+-{

12191

+-	struct bfq_group *bfqg;

12192

+-	struct bfq_queue *bfqq, *new_bfqq = NULL;

12193

+-	struct blkcg *blkcg;

12194

+-

12195

+-retry:

12196

+-	rcu_read_lock();

12197

+-

12198

+-	blkcg = bio_blkcg(bio);

12199

+-	bfqg = bfq_find_alloc_group(bfqd, blkcg);

12200

+-	/* bic always exists here */

12201

+-	bfqq = bic_to_bfqq(bic, is_sync);

12202

+-

12203

+-	/*

12204

+-	 * Always try a new alloc if we fall back to the OOM bfqq

12205

+-	 * originally, since it should just be a temporary situation.

12206

+-	 */

12207

+-	if (!bfqq || bfqq == &bfqd->oom_bfqq) {

12208

+-		bfqq = NULL;

12209

+-		if (new_bfqq) {

12210

+-			bfqq = new_bfqq;

12211

+-			new_bfqq = NULL;

12212

+-		} else if (gfpflags_allow_blocking(gfp_mask)) {

12213

+-			rcu_read_unlock();

12214

+-			spin_unlock_irq(bfqd->queue->queue_lock);

12215

+-			new_bfqq = kmem_cache_alloc_node(bfq_pool,

12216

+-					gfp_mask | __GFP_ZERO,

12217

+-					bfqd->queue->node);

12218

+-			spin_lock_irq(bfqd->queue->queue_lock);

12219

+-			if (new_bfqq)

12220

+-				goto retry;

12221

+-		} else {

12222

+-			bfqq = kmem_cache_alloc_node(bfq_pool,

12223

+-					gfp_mask | __GFP_ZERO,

12224

+-					bfqd->queue->node);

12225

+-		}

12226

+-

12227

+-		if (bfqq) {

12228

+-			bfq_init_bfqq(bfqd, bfqq, bic, current->pid,

12229

+-                                      is_sync);

12230

+-			bfq_init_entity(&bfqq->entity, bfqg);

12231

+-			bfq_log_bfqq(bfqd, bfqq, "allocated");

12232

+-		} else {

12233

+-			bfqq = &bfqd->oom_bfqq;

12234

+-			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");

12235

+-		}

12236

+-	}

12237

+-

12238

+-	if (new_bfqq)

12239

+-		kmem_cache_free(bfq_pool, new_bfqq);

12240

++	bfqq->soft_rt_next_start = bfq_greatest_from_now();

12241

+

12242

+-	rcu_read_unlock();

12243

+-

12244

+-	return bfqq;

12245

++	/* first request is almost certainly seeky */

12246

++	bfqq->seek_history = 1;

12247

+ }

12248

+

12249

+ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,

12250

+@@ -3263,44 +3712,56 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,

12251

+ }

12252

+

12253

+ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

12254

+-				       struct bio *bio, int is_sync,

12255

+-				       struct bfq_io_cq *bic, gfp_t gfp_mask)

12256

++				       struct bio *bio, bool is_sync,

12257

++				       struct bfq_io_cq *bic)

12258

+ {

12259

+ 	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

12260

+ 	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

12261

+ 	struct bfq_queue **async_bfqq = NULL;

12262

+-	struct bfq_queue *bfqq = NULL;

12263

++	struct bfq_queue *bfqq;

12264

++	struct bfq_group *bfqg;

12265

+

12266

+-	if (!is_sync) {

12267

+-		struct blkcg *blkcg;

12268

+-		struct bfq_group *bfqg;

12269

++	rcu_read_lock();

12270

+

12271

+-		rcu_read_lock();

12272

+-		blkcg = bio_blkcg(bio);

12273

+-		rcu_read_unlock();

12274

+-		bfqg = bfq_find_alloc_group(bfqd, blkcg);

12275

++	bfqg = bfq_find_alloc_group(bfqd,bio_blkcg(bio));

12276

++

12277

++	if (!is_sync) {

12278

+ 		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,

12279

+ 						  ioprio);

12280

+ 		bfqq = *async_bfqq;

12281

++		if (bfqq)

12282

++			goto out;

12283

+ 	}

12284

+

12285

+-	if (!bfqq)

12286

+-		bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask);

12287

++	bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,

12288

++				     bfqd->queue->node);

12289

++

12290

++	if (bfqq) {

12291

++		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,

12292

++			      is_sync);

12293

++		bfq_init_entity(&bfqq->entity, bfqg);

12294

++		bfq_log_bfqq(bfqd, bfqq, "allocated");

12295

++	} else {

12296

++		bfqq = &bfqd->oom_bfqq;

12297

++		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");

12298

++		goto out;

12299

++	}

12300

+

12301

+ 	/*

12302

+ 	 * Pin the queue now that it's allocated, scheduler exit will

12303

+ 	 * prune it.

12304

+ 	 */

12305

+-	if (!is_sync && !(*async_bfqq)) {

12306

+-		atomic_inc(&bfqq->ref);

12307

++	if (async_bfqq) {

12308

++		bfqq->ref++;

12309

+ 		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",

12310

+-			     bfqq, atomic_read(&bfqq->ref));

12311

++			     bfqq, bfqq->ref);

12312

+ 		*async_bfqq = bfqq;

12313

+ 	}

12314

+

12315

+-	atomic_inc(&bfqq->ref);

12316

+-	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,

12317

+-		     atomic_read(&bfqq->ref));

12318

++out:

12319

++	bfqq->ref++;

12320

++	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);

12321

++	rcu_read_unlock();

12322

+ 	return bfqq;

12323

+ }

12324

+

12325

+@@ -3316,37 +3777,21 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,

12326

+ 				bic->ttime.ttime_samples;

12327

+ }

12328

+

12329

+-static void bfq_update_io_seektime(struct bfq_data *bfqd,

12330

+-				   struct bfq_queue *bfqq,

12331

+-				   struct request *rq)

12332

+-{

12333

+-	sector_t sdist;

12334

+-	u64 total;

12335

+-

12336

+-	if (bfqq->last_request_pos < blk_rq_pos(rq))

12337

+-		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;

12338

+-	else

12339

+-		sdist = bfqq->last_request_pos - blk_rq_pos(rq);

12340

+-

12341

+-	/*

12342

+-	 * Don't allow the seek distance to get too large from the

12343

+-	 * odd fragment, pagein, etc.

12344

+-	 */

12345

+-	if (bfqq->seek_samples == 0) /* first request, not really a seek */

12346

+-		sdist = 0;

12347

+-	else if (bfqq->seek_samples <= 60) /* second & third seek */

12348

+-		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);

12349

+-	else

12350

+-		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);

12351

+

12352

+-	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;

12353

+-	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;

12354

+-	total = bfqq->seek_total + (bfqq->seek_samples/2);

12355

+-	do_div(total, bfqq->seek_samples);

12356

+-	bfqq->seek_mean = (sector_t)total;

12357

++static void

12358

++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,

12359

++		       struct request *rq)

12360

++{

12361

++	sector_t sdist = 0;

12362

++	if (bfqq->last_request_pos) {

12363

++		if (bfqq->last_request_pos < blk_rq_pos(rq))

12364

++			sdist = blk_rq_pos(rq) - bfqq->last_request_pos;

12365

++		else

12366

++			sdist = bfqq->last_request_pos - blk_rq_pos(rq);

12367

++	}

12368

+

12369

+-	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,

12370

+-			(u64)bfqq->seek_mean);

12371

++	bfqq->seek_history <<= 1;

12372

++	bfqq->seek_history |= (sdist > BFQQ_SEEK_THR);

12373

+ }

12374

+

12375

+ /*

12376

+@@ -3364,7 +3809,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,

12377

+ 		return;

12378

+

12379

+ 	/* Idle window just restored, statistics are meaningless. */

12380

+-	if (bfq_bfqq_just_split(bfqq))

12381

++	if (time_is_after_eq_jiffies(bfqq->split_time +

12382

++				     bfqd->bfq_wr_min_idle_time))

12383

+ 		return;

12384

+

12385

+ 	enable_idle = bfq_bfqq_idle_window(bfqq);

12386

+@@ -3404,22 +3850,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

12387

+

12388

+ 	bfq_update_io_thinktime(bfqd, bic);

12389

+ 	bfq_update_io_seektime(bfqd, bfqq, rq);

12390

+-	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {

12391

+-		bfq_clear_bfqq_constantly_seeky(bfqq);

12392

+-		if (!blk_queue_nonrot(bfqd->queue)) {

12393

+-			BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);

12394

+-			bfqd->const_seeky_busy_in_flight_queues--;

12395

+-		}

12396

+-	}

12397

+ 	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

12398

+ 	    !BFQQ_SEEKY(bfqq))

12399

+ 		bfq_update_idle_window(bfqd, bfqq, bic);

12400

+-	bfq_clear_bfqq_just_split(bfqq);

12401

+

12402

+ 	bfq_log_bfqq(bfqd, bfqq,

12403

+-		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

12404

+-		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),

12405

+-		     (long long unsigned)bfqq->seek_mean);

12406

++		     "rq_enqueued: idle_window=%d (seeky %d)",

12407

++		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

12408

+

12409

+ 	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

12410

+

12411

+@@ -3433,14 +3870,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

12412

+ 		 * is small and the queue is not to be expired, then

12413

+ 		 * just exit.

12414

+ 		 *

12415

+-		 * In this way, if the disk is being idled to wait for

12416

+-		 * a new request from the in-service queue, we avoid

12417

+-		 * unplugging the device and committing the disk to serve

12418

+-		 * just a small request. On the contrary, we wait for

12419

+-		 * the block layer to decide when to unplug the device:

12420

+-		 * hopefully, new requests will be merged to this one

12421

+-		 * quickly, then the device will be unplugged and

12422

+-		 * larger requests will be dispatched.

12423

++		 * In this way, if the device is being idled to wait

12424

++		 * for a new request from the in-service queue, we

12425

++		 * avoid unplugging the device and committing the

12426

++		 * device to serve just a small request. On the

12427

++		 * contrary, we wait for the block layer to decide

12428

++		 * when to unplug the device: hopefully, new requests

12429

++		 * will be merged to this one quickly, then the device

12430

++		 * will be unplugged and larger requests will be

12431

++		 * dispatched.

12432

+ 		 */

12433

+ 		if (small_req && !budget_timeout)

12434

+ 			return;

12435

+@@ -3453,9 +3891,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

12436

+ 		 */

12437

+ 		bfq_clear_bfqq_wait_request(bfqq);

12438

+ 		del_timer(&bfqd->idle_slice_timer);

12439

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

12440

+ 		bfqg_stats_update_idle_time(bfqq_group(bfqq));

12441

+-#endif

12442

+

12443

+ 		/*

12444

+ 		 * The queue is not empty, because a new request just

12445

+@@ -3499,27 +3935,19 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)

12446

+ 			 */

12447

+ 			new_bfqq->allocated[rq_data_dir(rq)]++;

12448

+ 			bfqq->allocated[rq_data_dir(rq)]--;

12449

+-			atomic_inc(&new_bfqq->ref);

12450

++			new_bfqq->ref++;

12451

++			bfq_clear_bfqq_just_created(bfqq);

12452

+ 			bfq_put_queue(bfqq);

12453

+ 			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)

12454

+ 				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),

12455

+ 						bfqq, new_bfqq);

12456

+ 			rq->elv.priv[1] = new_bfqq;

12457

+ 			bfqq = new_bfqq;

12458

+-		} else

12459

+-			bfq_bfqq_increase_failed_cooperations(bfqq);

12460

++		}

12461

+ 	}

12462

+

12463

+ 	bfq_add_request(rq);

12464

+

12465

+-	/*

12466

+-	 * Here a newly-created bfq_queue has already started a weight-raising

12467

+-	 * period: clear raising_time_left to prevent bfq_bfqq_save_state()

12468

+-	 * from assigning it a full weight-raising period. See the detailed

12469

+-	 * comments about this field in bfq_init_icq().

12470

+-	 */

12471

+-	if (bfqq->bic)

12472

+-		bfqq->bic->wr_time_left = 0;

12473

+ 	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

12474

+ 	list_add_tail(&rq->queuelist, &bfqq->fifo);

12475

+

12476

+@@ -3528,8 +3956,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)

12477

+

12478

+ static void bfq_update_hw_tag(struct bfq_data *bfqd)

12479

+ {

12480

+-	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,

12481

+-				     bfqd->rq_in_driver);

12482

++	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,

12483

++				       bfqd->rq_in_driver);

12484

+

12485

+ 	if (bfqd->hw_tag == 1)

12486

+ 		return;

12487

+@@ -3560,43 +3988,41 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)

12488

+ 	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",

12489

+ 		     blk_rq_sectors(rq), sync);

12490

+

12491

++	assert_spin_locked(bfqd->queue->queue_lock);

12492

+ 	bfq_update_hw_tag(bfqd);

12493

+

12494

+ 	BUG_ON(!bfqd->rq_in_driver);

12495

+ 	BUG_ON(!bfqq->dispatched);

12496

+ 	bfqd->rq_in_driver--;

12497

+ 	bfqq->dispatched--;

12498

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

12499

+ 	bfqg_stats_update_completion(bfqq_group(bfqq),

12500

+ 				     rq_start_time_ns(rq),

12501

+ 				     rq_io_start_time_ns(rq), rq->cmd_flags);

12502

+-#endif

12503

+

12504

+ 	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {

12505

++		BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

12506

++		/*

12507

++		 * Set budget_timeout (which we overload to store the

12508

++		 * time at which the queue remains with no backlog and

12509

++		 * no outstanding request; used by the weight-raising

12510

++		 * mechanism).

12511

++		 */

12512

++		bfqq->budget_timeout = jiffies;

12513

++

12514

+ 		bfq_weights_tree_remove(bfqd, &bfqq->entity,

12515

+ 					&bfqd->queue_weights_tree);

12516

+-		if (!blk_queue_nonrot(bfqd->queue)) {

12517

+-			BUG_ON(!bfqd->busy_in_flight_queues);

12518

+-			bfqd->busy_in_flight_queues--;

12519

+-			if (bfq_bfqq_constantly_seeky(bfqq)) {

12520

+-				BUG_ON(!bfqd->

12521

+-					const_seeky_busy_in_flight_queues);

12522

+-				bfqd->const_seeky_busy_in_flight_queues--;

12523

+-			}

12524

+-		}

12525

+ 	}

12526

+

12527

+-	if (sync) {

12528

+-		bfqd->sync_flight--;

12529

+-		RQ_BIC(rq)->ttime.last_end_request = jiffies;

12530

+-	}

12531

++	RQ_BIC(rq)->ttime.last_end_request = jiffies;

12532

+

12533

+ 	/*

12534

+-	 * If we are waiting to discover whether the request pattern of the

12535

+-	 * task associated with the queue is actually isochronous, and

12536

+-	 * both requisites for this condition to hold are satisfied, then

12537

+-	 * compute soft_rt_next_start (see the comments to the function

12538

+-	 * bfq_bfqq_softrt_next_start()).

12539

++	 * If we are waiting to discover whether the request pattern

12540

++	 * of the task associated with the queue is actually

12541

++	 * isochronous, and both requisites for this condition to hold

12542

++	 * are now satisfied, then compute soft_rt_next_start (see the

12543

++	 * comments on the function bfq_bfqq_softrt_next_start()). We

12544

++	 * schedule this delayed check when bfqq expires, if it still

12545

++	 * has in-flight requests.

12546

+ 	 */

12547

+ 	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&

12548

+ 	    RB_EMPTY_ROOT(&bfqq->sort_list))

12549

+@@ -3608,10 +4034,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)

12550

+ 	 * or if we want to idle in case it has no pending requests.

12551

+ 	 */

12552

+ 	if (bfqd->in_service_queue == bfqq) {

12553

+-		if (bfq_bfqq_budget_new(bfqq))

12554

+-			bfq_set_budget_timeout(bfqd);

12555

+-

12556

+-		if (bfq_bfqq_must_idle(bfqq)) {

12557

++		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {

12558

+ 			bfq_arm_slice_timer(bfqd);

12559

+ 			goto out;

12560

+ 		} else if (bfq_may_expire_for_budg_timeout(bfqq))

12561

+@@ -3682,14 +4105,14 @@ static void bfq_put_request(struct request *rq)

12562

+ 		rq->elv.priv[1] = NULL;

12563

+

12564

+ 		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",

12565

+-			     bfqq, atomic_read(&bfqq->ref));

12566

++			     bfqq, bfqq->ref);

12567

+ 		bfq_put_queue(bfqq);

12568

+ 	}

12569

+ }

12570

+

12571

+ /*

12572

+  * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

12573

+- * was the last process referring to said bfqq.

12574

++ * was the last process referring to that bfqq.

12575

+  */

12576

+ static struct bfq_queue *

12577

+ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

12578

+@@ -3727,11 +4150,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

12579

+ 	unsigned long flags;

12580

+ 	bool split = false;

12581

+

12582

+-	might_sleep_if(gfpflags_allow_blocking(gfp_mask));

12583

+-

12584

+-	bfq_check_ioprio_change(bic, bio);

12585

+-

12586

+ 	spin_lock_irqsave(q->queue_lock, flags);

12587

++	bfq_check_ioprio_change(bic, bio);

12588

+

12589

+ 	if (!bic)

12590

+ 		goto queue_fail;

12591

+@@ -3741,23 +4161,47 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

12592

+ new_queue:

12593

+ 	bfqq = bic_to_bfqq(bic, is_sync);

12594

+ 	if (!bfqq || bfqq == &bfqd->oom_bfqq) {

12595

+-		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);

12596

++		if (bfqq)

12597

++			bfq_put_queue(bfqq);

12598

++		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);

12599

++		BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));

12600

++

12601

+ 		bic_set_bfqq(bic, bfqq, is_sync);

12602

+ 		if (split && is_sync) {

12603

++			bfq_log_bfqq(bfqd, bfqq,

12604

++				     "set_request: was_in_list %d "

12605

++				     "was_in_large_burst %d "

12606

++				     "large burst in progress %d",

12607

++				     bic->was_in_burst_list,

12608

++				     bic->saved_in_large_burst,

12609

++				     bfqd->large_burst);

12610

++

12611

+ 			if ((bic->was_in_burst_list && bfqd->large_burst) ||

12612

+-			    bic->saved_in_large_burst)

12613

++			    bic->saved_in_large_burst) {

12614

++				bfq_log_bfqq(bfqd, bfqq,

12615

++					     "set_request: marking in "

12616

++					     "large burst");

12617

+ 				bfq_mark_bfqq_in_large_burst(bfqq);

12618

+-			else {

12619

+-			    bfq_clear_bfqq_in_large_burst(bfqq);

12620

+-			    if (bic->was_in_burst_list)

12621

+-			       hlist_add_head(&bfqq->burst_list_node,

12622

+-				              &bfqd->burst_list);

12623

++			} else {

12624

++				bfq_log_bfqq(bfqd, bfqq,

12625

++					     "set_request: clearing in "

12626

++					     "large burst");

12627

++				bfq_clear_bfqq_in_large_burst(bfqq);

12628

++				if (bic->was_in_burst_list)

12629

++					hlist_add_head(&bfqq->burst_list_node,

12630

++						       &bfqd->burst_list);

12631

+ 			}

12632

++			bfqq->split_time = jiffies;

12633

+ 		}

12634

+ 	} else {

12635

+ 		/* If the queue was seeky for too long, break it apart. */

12636

+ 		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

12637

+ 			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

12638

++

12639

++			/* Update bic before losing reference to bfqq */

12640

++			if (bfq_bfqq_in_large_burst(bfqq))

12641

++				bic->saved_in_large_burst = true;

12642

++

12643

+ 			bfqq = bfq_split_bfqq(bic, bfqq);

12644

+ 			split = true;

12645

+ 			if (!bfqq)

12646

+@@ -3766,9 +4210,8 @@ new_queue:

12647

+ 	}

12648

+

12649

+ 	bfqq->allocated[rw]++;

12650

+-	atomic_inc(&bfqq->ref);

12651

+-	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,

12652

+-		     atomic_read(&bfqq->ref));

12653

++	bfqq->ref++;

12654

++	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);

12655

+

12656

+ 	rq->elv.priv[0] = bic;

12657

+ 	rq->elv.priv[1] = bfqq;

12658

+@@ -3783,7 +4226,6 @@ new_queue:

12659

+ 	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {

12660

+ 		bfqq->bic = bic;

12661

+ 		if (split) {

12662

+-			bfq_mark_bfqq_just_split(bfqq);

12663

+ 			/*

12664

+ 			 * If the queue has just been split from a shared

12665

+ 			 * queue, restore the idle window and the possible

12666

+@@ -3793,6 +4235,9 @@ new_queue:

12667

+ 		}

12668

+ 	}

12669

+

12670

++	if (unlikely(bfq_bfqq_just_created(bfqq)))

12671

++		bfq_handle_burst(bfqd, bfqq);

12672

++

12673

+ 	spin_unlock_irqrestore(q->queue_lock, flags);

12674

+

12675

+ 	return 0;

12676

+@@ -3872,6 +4317,7 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)

12677

+ 	cancel_work_sync(&bfqd->unplug_work);

12678

+ }

12679

+

12680

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

12681

+ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,

12682

+ 					struct bfq_queue **bfqq_ptr)

12683

+ {

12684

+@@ -3880,9 +4326,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,

12685

+

12686

+ 	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);

12687

+ 	if (bfqq) {

12688

+-		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);

12689

++		bfq_bfqq_move(bfqd, bfqq, root_group);

12690

+ 		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",

12691

+-			     bfqq, atomic_read(&bfqq->ref));

12692

++			     bfqq, bfqq->ref);

12693

+ 		bfq_put_queue(bfqq);

12694

+ 		*bfqq_ptr = NULL;

12695

+ 	}

12696

+@@ -3904,6 +4350,7 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)

12697

+

12698

+ 	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);

12699

+ }

12700

++#endif

12701

+

12702

+ static void bfq_exit_queue(struct elevator_queue *e)

12703

+ {

12704

+@@ -3923,8 +4370,6 @@ static void bfq_exit_queue(struct elevator_queue *e)

12705

+

12706

+ 	bfq_shutdown_timer_wq(bfqd);

12707

+

12708

+-	synchronize_rcu();

12709

+-

12710

+ 	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

12711

+

12712

+ #ifdef CONFIG_BFQ_GROUP_IOSCHED

12713

+@@ -3973,11 +4418,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

12714

+ 	 * will not attempt to free it.

12715

+ 	 */

12716

+ 	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);

12717

+-	atomic_inc(&bfqd->oom_bfqq.ref);

12718

++	bfqd->oom_bfqq.ref++;

12719

+ 	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;

12720

+ 	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;

12721

+ 	bfqd->oom_bfqq.entity.new_weight =

12722

+ 		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);

12723

++

12724

++	/* oom_bfqq does not participate in bursts */

12725

++	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);

12726

+ 	/*

12727

+ 	 * Trigger weight initialization, according to ioprio, at the

12728

+ 	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio

12729

+@@ -3996,9 +4444,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

12730

+ 		goto out_free;

12731

+ 	bfq_init_root_group(bfqd->root_group, bfqd);

12732

+ 	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

12733

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

12734

+-	bfqd->active_numerous_groups = 0;

12735

+-#endif

12736

+

12737

+ 	init_timer(&bfqd->idle_slice_timer);

12738

+ 	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

12739

+@@ -4023,20 +4468,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

12740

+ 	bfqd->bfq_back_penalty = bfq_back_penalty;

12741

+ 	bfqd->bfq_slice_idle = bfq_slice_idle;

12742

+ 	bfqd->bfq_class_idle_last_service = 0;

12743

+-	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;

12744

+-	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;

12745

+-	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

12746

++	bfqd->bfq_timeout = bfq_timeout;

12747

+

12748

+-	bfqd->bfq_coop_thresh = 2;

12749

+-	bfqd->bfq_failed_cooperations = 7000;

12750

+ 	bfqd->bfq_requests_within_timer = 120;

12751

+

12752

+-	bfqd->bfq_large_burst_thresh = 11;

12753

+-	bfqd->bfq_burst_interval = msecs_to_jiffies(500);

12754

++	bfqd->bfq_large_burst_thresh = 8;

12755

++	bfqd->bfq_burst_interval = msecs_to_jiffies(180);

12756

+

12757

+ 	bfqd->low_latency = true;

12758

+

12759

+-	bfqd->bfq_wr_coeff = 20;

12760

++	/*

12761

++	 * Trade-off between responsiveness and fairness.

12762

++	 */

12763

++	bfqd->bfq_wr_coeff = 30;

12764

+ 	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);

12765

+ 	bfqd->bfq_wr_max_time = 0;

12766

+ 	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);

12767

+@@ -4048,16 +4492,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

12768

+ 					      * video.

12769

+ 					      */

12770

+ 	bfqd->wr_busy_queues = 0;

12771

+-	bfqd->busy_in_flight_queues = 0;

12772

+-	bfqd->const_seeky_busy_in_flight_queues = 0;

12773

+

12774

+ 	/*

12775

+-	 * Begin by assuming, optimistically, that the device peak rate is

12776

+-	 * equal to the highest reference rate.

12777

++	 * Begin by assuming, optimistically, that the device is a

12778

++	 * high-speed one, and that its peak rate is equal to 2/3 of

12779

++	 * the highest reference rate.

12780

+ 	 */

12781

+ 	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *

12782

+ 			T_fast[blk_queue_nonrot(bfqd->queue)];

12783

+-	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];

12784

++	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;

12785

+ 	bfqd->device_speed = BFQ_BFQD_FAST;

12786

+

12787

+ 	return 0;

12788

+@@ -4161,10 +4604,8 @@ SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);

12789

+ SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);

12790

+ SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);

12791

+ SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);

12792

+-SHOW_FUNCTION(bfq_max_budget_async_rq_show,

12793

+-	      bfqd->bfq_max_budget_async_rq, 0);

12794

+-SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);

12795

+-SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);

12796

++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);

12797

++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);

12798

+ SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);

12799

+ SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);

12800

+ SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);

12801

+@@ -4199,10 +4640,6 @@ STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);

12802

+ STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,

12803

+ 		INT_MAX, 0);

12804

+ STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);

12805

+-STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,

12806

+-		1, INT_MAX, 0);

12807

+-STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,

12808

+-		INT_MAX, 1);

12809

+ STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);

12810

+ STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);

12811

+ STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,

12812

+@@ -4224,10 +4661,8 @@ static ssize_t bfq_weights_store(struct elevator_queue *e,

12813

+

12814

+ static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)

12815

+ {

12816

+-	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

12817

+-

12818

+ 	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)

12819

+-		return bfq_calc_max_budget(bfqd->peak_rate, timeout);

12820

++		return bfq_calc_max_budget(bfqd);

12821

+ 	else

12822

+ 		return bfq_default_max_budget;

12823

+ }

12824

+@@ -4252,6 +4687,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e,

12825

+ 	return ret;

12826

+ }

12827

+

12828

++/* 

12829

++ * Leaving this name to preserve name compatibility with cfq

12830

++ * parameters, but this timeout is used for both sync and async.

12831

++ */

12832

+ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,

12833

+ 				      const char *page, size_t count)

12834

+ {

12835

+@@ -4264,13 +4703,31 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,

12836

+ 	else if (__data > INT_MAX)

12837

+ 		__data = INT_MAX;

12838

+

12839

+-	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);

12840

++	bfqd->bfq_timeout = msecs_to_jiffies(__data);

12841

+ 	if (bfqd->bfq_user_max_budget == 0)

12842

+ 		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

12843

+

12844

+ 	return ret;

12845

+ }

12846

+

12847

++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,

12848

++				     const char *page, size_t count)

12849

++{

12850

++	struct bfq_data *bfqd = e->elevator_data;

12851

++	unsigned long uninitialized_var(__data);

12852

++	int ret = bfq_var_store(&__data, (page), count);

12853

++

12854

++	if (__data > 1)

12855

++		__data = 1;

12856

++	if (!bfqd->strict_guarantees && __data == 1

12857

++	    && bfqd->bfq_slice_idle < msecs_to_jiffies(8))

12858

++		bfqd->bfq_slice_idle = msecs_to_jiffies(8);

12859

++

12860

++	bfqd->strict_guarantees = __data;

12861

++

12862

++	return ret;

12863

++}

12864

++

12865

+ static ssize_t bfq_low_latency_store(struct elevator_queue *e,

12866

+ 				     const char *page, size_t count)

12867

+ {

12868

+@@ -4297,9 +4754,8 @@ static struct elv_fs_entry bfq_attrs[] = {

12869

+ 	BFQ_ATTR(back_seek_penalty),

12870

+ 	BFQ_ATTR(slice_idle),

12871

+ 	BFQ_ATTR(max_budget),

12872

+-	BFQ_ATTR(max_budget_async_rq),

12873

+ 	BFQ_ATTR(timeout_sync),

12874

+-	BFQ_ATTR(timeout_async),

12875

++	BFQ_ATTR(strict_guarantees),

12876

+ 	BFQ_ATTR(low_latency),

12877

+ 	BFQ_ATTR(wr_coeff),

12878

+ 	BFQ_ATTR(wr_max_time),

12879

+@@ -4342,9 +4798,28 @@ static struct elevator_type iosched_bfq = {

12880

+ 	.elevator_owner =	THIS_MODULE,

12881

+ };

12882

+

12883

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

12884

++static struct blkcg_policy blkcg_policy_bfq = {

12885

++	.dfl_cftypes		= bfq_blkg_files,

12886

++	.legacy_cftypes		= bfq_blkcg_legacy_files,

12887

++

12888

++	.cpd_alloc_fn		= bfq_cpd_alloc,

12889

++	.cpd_init_fn		= bfq_cpd_init,

12890

++	.cpd_bind_fn	        = bfq_cpd_init,

12891

++	.cpd_free_fn		= bfq_cpd_free,

12892

++

12893

++	.pd_alloc_fn		= bfq_pd_alloc,

12894

++	.pd_init_fn		= bfq_pd_init,

12895

++	.pd_offline_fn		= bfq_pd_offline,

12896

++	.pd_free_fn		= bfq_pd_free,

12897

++	.pd_reset_stats_fn	= bfq_pd_reset_stats,

12898

++};

12899

++#endif

12900

++

12901

+ static int __init bfq_init(void)

12902

+ {

12903

+ 	int ret;

12904

++	char msg[50] = "BFQ I/O-scheduler: v8";

12905

+

12906

+ 	/*

12907

+ 	 * Can be 0 on HZ < 1000 setups.

12908

+@@ -4352,9 +4827,6 @@ static int __init bfq_init(void)

12909

+ 	if (bfq_slice_idle == 0)

12910

+ 		bfq_slice_idle = 1;

12911

+

12912

+-	if (bfq_timeout_async == 0)

12913

+-		bfq_timeout_async = 1;

12914

+-

12915

+ #ifdef CONFIG_BFQ_GROUP_IOSCHED

12916

+ 	ret = blkcg_policy_register(&blkcg_policy_bfq);

12917

+ 	if (ret)

12918

+@@ -4370,23 +4842,34 @@ static int __init bfq_init(void)

12919

+ 	 * installed on the reference devices (see the comments before the

12920

+ 	 * definitions of the two arrays).

12921

+ 	 */

12922

+-	T_slow[0] = msecs_to_jiffies(2600);

12923

+-	T_slow[1] = msecs_to_jiffies(1000);

12924

+-	T_fast[0] = msecs_to_jiffies(5500);

12925

+-	T_fast[1] = msecs_to_jiffies(2000);

12926

++	T_slow[0] = msecs_to_jiffies(3500);

12927

++	T_slow[1] = msecs_to_jiffies(1500);

12928

++	T_fast[0] = msecs_to_jiffies(8000);

12929

++	T_fast[1] = msecs_to_jiffies(3000);

12930

+

12931

+ 	/*

12932

+-	 * Thresholds that determine the switch between speed classes (see

12933

+-	 * the comments before the definition of the array).

12934

++	 * Thresholds that determine the switch between speed classes

12935

++	 * (see the comments before the definition of the array

12936

++	 * device_speed_thresh). These thresholds are biased towards

12937

++	 * transitions to the fast class. This is safer than the

12938

++	 * opposite bias. In fact, a wrong transition to the slow

12939

++	 * class results in short weight-raising periods, because the

12940

++	 * speed of the device then tends to be higher than the

12941

++	 * reference peak rate. On the opposite end, a wrong

12942

++	 * transition to the fast class tends to increase

12943

++	 * weight-raising periods, because of the opposite reason.

12944

+ 	 */

12945

+-	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;

12946

+-	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;

12947

++	device_speed_thresh[0] = (4 * R_slow[0]) / 3;

12948

++	device_speed_thresh[1] = (4 * R_slow[1]) / 3;

12949

+

12950

+ 	ret = elv_register(&iosched_bfq);

12951

+ 	if (ret)

12952

+ 		goto err_pol_unreg;

12953

+

12954

+-	pr_info("BFQ I/O-scheduler: v7r11");

12955

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

12956

++	strcat(msg, " (with cgroups support)");

12957

++#endif

12958

++	pr_info("%s", msg);

12959

+

12960

+ 	return 0;

12961

+

12962

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

12963

+index a64fec1..e54b149 100644

12964

+--- a/block/bfq-sched.c

12965

++++ b/block/bfq-sched.c

12966

+@@ -7,9 +7,11 @@

12967

+  * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

12968

+  *		      Paolo Valente <paolo.valente@×××××××.it>

12969

+  *

12970

+- * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

12971

++ * Copyright (C) 2016 Paolo Valente <paolo.valente@×××××××.it>

12972

+  */

12973

+

12974

++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

12975

++

12976

+ #ifdef CONFIG_BFQ_GROUP_IOSCHED

12977

+ #define for_each_entity(entity)	\

12978

+ 	for (; entity ; entity = entity->parent)

12979

+@@ -22,8 +24,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

12980

+ 						 int extract,

12981

+ 						 struct bfq_data *bfqd);

12982

+

12983

+-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

12984

+-

12985

+ static void bfq_update_budget(struct bfq_entity *next_in_service)

12986

+ {

12987

+ 	struct bfq_entity *bfqg_entity;

12988

+@@ -48,6 +48,7 @@ static void bfq_update_budget(struct bfq_entity *next_in_service)

12989

+ static int bfq_update_next_in_service(struct bfq_sched_data *sd)

12990

+ {

12991

+ 	struct bfq_entity *next_in_service;

12992

++	struct bfq_queue *bfqq;

12993

+

12994

+ 	if (sd->in_service_entity)

12995

+ 		/* will update/requeue at the end of service */

12996

+@@ -65,14 +66,29 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd)

12997

+

12998

+ 	if (next_in_service)

12999

+ 		bfq_update_budget(next_in_service);

13000

++	else

13001

++		goto exit;

13002

+

13003

++	bfqq = bfq_entity_to_bfqq(next_in_service);

13004

++	if (bfqq)

13005

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

13006

++			     "update_next_in_service: chosen this queue");

13007

++	else {

13008

++		struct bfq_group *bfqg =

13009

++			container_of(next_in_service,

13010

++				     struct bfq_group, entity);

13011

++

13012

++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,

13013

++			     "update_next_in_service: chosen this entity");

13014

++	}

13015

++exit:

13016

+ 	return 1;

13017

+ }

13018

+

13019

+ static void bfq_check_next_in_service(struct bfq_sched_data *sd,

13020

+ 				      struct bfq_entity *entity)

13021

+ {

13022

+-	BUG_ON(sd->next_in_service != entity);

13023

++	WARN_ON(sd->next_in_service != entity);

13024

+ }

13025

+ #else

13026

+ #define for_each_entity(entity)	\

13027

+@@ -151,20 +167,35 @@ static u64 bfq_delta(unsigned long service, unsigned long weight)

13028

+ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)

13029

+ {

13030

+ 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

13031

+-

13032

++	unsigned long long start, finish, delta ;

13033

+ 	BUG_ON(entity->weight == 0);

13034

+

13035

+ 	entity->finish = entity->start +

13036

+ 		bfq_delta(service, entity->weight);

13037

+

13038

++	start = ((entity->start>>10)*1000)>>12;

13039

++	finish = ((entity->finish>>10)*1000)>>12;

13040

++	delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12;

13041

++

13042

+ 	if (bfqq) {

13043

+ 		bfq_log_bfqq(bfqq->bfqd, bfqq,

13044

+ 			"calc_finish: serv %lu, w %d",

13045

+ 			service, entity->weight);

13046

+ 		bfq_log_bfqq(bfqq->bfqd, bfqq,

13047

+ 			"calc_finish: start %llu, finish %llu, delta %llu",

13048

+-			entity->start, entity->finish,

13049

+-			bfq_delta(service, entity->weight));

13050

++			start, finish, delta);

13051

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13052

++	} else {

13053

++		struct bfq_group *bfqg =

13054

++			container_of(entity, struct bfq_group, entity);

13055

++

13056

++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,

13057

++			"calc_finish group: serv %lu, w %d",

13058

++			     service, entity->weight);

13059

++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,

13060

++			"calc_finish group: start %llu, finish %llu, delta %llu",

13061

++			start, finish, delta);

13062

++#endif

13063

+ 	}

13064

+ }

13065

+

13066

+@@ -386,8 +417,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,

13067

+ 		BUG_ON(!bfqg);

13068

+ 		BUG_ON(!bfqd);

13069

+ 		bfqg->active_entities++;

13070

+-		if (bfqg->active_entities == 2)

13071

+-			bfqd->active_numerous_groups++;

13072

+ 	}

13073

+ #endif

13074

+ }

13075

+@@ -399,7 +428,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,

13076

+ static unsigned short bfq_ioprio_to_weight(int ioprio)

13077

+ {

13078

+ 	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);

13079

+-	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio;

13080

++	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF ;

13081

+ }

13082

+

13083

+ /**

13084

+@@ -422,9 +451,9 @@ static void bfq_get_entity(struct bfq_entity *entity)

13085

+ 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

13086

+

13087

+ 	if (bfqq) {

13088

+-		atomic_inc(&bfqq->ref);

13089

++		bfqq->ref++;

13090

+ 		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",

13091

+-			     bfqq, atomic_read(&bfqq->ref));

13092

++			     bfqq, bfqq->ref);

13093

+ 	}

13094

+ }

13095

+

13096

+@@ -499,10 +528,6 @@ static void bfq_active_extract(struct bfq_service_tree *st,

13097

+ 		BUG_ON(!bfqd);

13098

+ 		BUG_ON(!bfqg->active_entities);

13099

+ 		bfqg->active_entities--;

13100

+-		if (bfqg->active_entities == 1) {

13101

+-			BUG_ON(!bfqd->active_numerous_groups);

13102

+-			bfqd->active_numerous_groups--;

13103

+-		}

13104

+ 	}

13105

+ #endif

13106

+ }

13107

+@@ -552,7 +577,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st,

13108

+ 	if (bfqq) {

13109

+ 		sd = entity->sched_data;

13110

+ 		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",

13111

+-			     bfqq, atomic_read(&bfqq->ref));

13112

++			     bfqq, bfqq->ref);

13113

+ 		bfq_put_queue(bfqq);

13114

+ 	}

13115

+ }

13116

+@@ -628,12 +653,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,

13117

+ 		if (entity->new_weight != entity->orig_weight) {

13118

+ 			if (entity->new_weight < BFQ_MIN_WEIGHT ||

13119

+ 			    entity->new_weight > BFQ_MAX_WEIGHT) {

13120

+-				printk(KERN_CRIT "update_weight_prio: "

13121

+-						 "new_weight %d\n",

13122

++				pr_crit("update_weight_prio: new_weight %d\n",

13123

+ 					entity->new_weight);

13124

+-				BUG();

13125

++				if (entity->new_weight < BFQ_MIN_WEIGHT)

13126

++					entity->new_weight = BFQ_MIN_WEIGHT;

13127

++				else

13128

++					entity->new_weight = BFQ_MAX_WEIGHT;

13129

+ 			}

13130

+-			entity->orig_weight = entity->new_weight;

13131

++		       	entity->orig_weight = entity->new_weight;

13132

+ 			if (bfqq)

13133

+ 				bfqq->ioprio =

13134

+ 				  bfq_weight_to_ioprio(entity->orig_weight);

13135

+@@ -708,7 +735,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)

13136

+ 		st = bfq_entity_service_tree(entity);

13137

+

13138

+ 		entity->service += served;

13139

+-		BUG_ON(entity->service > entity->budget);

13140

++

13141

+ 		BUG_ON(st->wsum == 0);

13142

+

13143

+ 		st->vtime += bfq_delta(served, st->wsum);

13144

+@@ -717,31 +744,69 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)

13145

+ #ifdef CONFIG_BFQ_GROUP_IOSCHED

13146

+ 	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));

13147

+ #endif

13148

+-	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);

13149

++	st = bfq_entity_service_tree(&bfqq->entity);

13150

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p",

13151

++		     served,  ((st->vtime>>10)*1000)>>12, st);

13152

+ }

13153

+

13154

+ /**

13155

+- * bfq_bfqq_charge_full_budget - set the service to the entity budget.

13156

++ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length

13157

++ *			  of the time interval during which bfqq has been in

13158

++ *			  service.

13159

++ * @bfqd: the device

13160

+  * @bfqq: the queue that needs a service update.

13161

++ * @time_ms: the amount of time during which the queue has received service

13162

++ *

13163

++ * If a queue does not consume its budget fast enough, then providing

13164

++ * the queue with service fairness may impair throughput, more or less

13165

++ * severely. For this reason, queues that consume their budget slowly

13166

++ * are provided with time fairness instead of service fairness. This

13167

++ * goal is achieved through the BFQ scheduling engine, even if such an

13168

++ * engine works in the service, and not in the time domain. The trick

13169

++ * is charging these queues with an inflated amount of service, equal

13170

++ * to the amount of service that they would have received during their

13171

++ * service slot if they had been fast, i.e., if their requests had

13172

++ * been dispatched at a rate equal to the estimated peak rate.

13173

+  *

13174

+- * When it's not possible to be fair in the service domain, because

13175

+- * a queue is not consuming its budget fast enough (the meaning of

13176

+- * fast depends on the timeout parameter), we charge it a full

13177

+- * budget.  In this way we should obtain a sort of time-domain

13178

+- * fairness among all the seeky/slow queues.

13179

++ * It is worth noting that time fairness can cause important

13180

++ * distortions in terms of bandwidth distribution, on devices with

13181

++ * internal queueing. The reason is that I/O requests dispatched

13182

++ * during the service slot of a queue may be served after that service

13183

++ * slot is finished, and may have a total processing time loosely

13184

++ * correlated with the duration of the service slot. This is

13185

++ * especially true for short service slots.

13186

+  */

13187

+-static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)

13188

++static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,

13189

++				 unsigned long time_ms)

13190

+ {

13191

+ 	struct bfq_entity *entity = &bfqq->entity;

13192

++	int tot_serv_to_charge = entity->service;

13193

++	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);

13194

++

13195

++	if (time_ms > 0 && time_ms < timeout_ms)

13196

++		tot_serv_to_charge =

13197

++			(bfqd->bfq_max_budget * time_ms) / timeout_ms;

13198

++

13199

++	if (tot_serv_to_charge < entity->service)

13200

++		tot_serv_to_charge = entity->service;

13201

++

13202

++	bfq_log_bfqq(bfqq->bfqd, bfqq,

13203

++		     "charge_time: %lu/%u ms, %d/%d/%d sectors",

13204

++		     time_ms, timeout_ms, entity->service,

13205

++		     tot_serv_to_charge, entity->budget);

13206

+

13207

+-	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");

13208

++	/* Increase budget to avoid inconsistencies */

13209

++	if (tot_serv_to_charge > entity->budget)

13210

++		entity->budget = tot_serv_to_charge;

13211

+

13212

+-	bfq_bfqq_served(bfqq, entity->budget - entity->service);

13213

++	bfq_bfqq_served(bfqq,

13214

++			max_t(int, 0, tot_serv_to_charge - entity->service));

13215

+ }

13216

+

13217

+ /**

13218

+  * __bfq_activate_entity - activate an entity.

13219

+  * @entity: the entity being activated.

13220

++ * @non_blocking_wait_rq: true if this entity was waiting for a request

13221

+  *

13222

+  * Called whenever an entity is activated, i.e., it is not active and one

13223

+  * of its children receives a new request, or has to be reactivated due to

13224

+@@ -749,11 +814,16 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)

13225

+  * service received if @entity is active) of the queue to calculate its

13226

+  * timestamps.

13227

+  */

13228

+-static void __bfq_activate_entity(struct bfq_entity *entity)

13229

++static void __bfq_activate_entity(struct bfq_entity *entity,

13230

++				  bool non_blocking_wait_rq)

13231

+ {

13232

+ 	struct bfq_sched_data *sd = entity->sched_data;

13233

+ 	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

13234

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

13235

++	bool backshifted = false;

13236

+

13237

++	BUG_ON(!sd);

13238

++	BUG_ON(!st);

13239

+ 	if (entity == sd->in_service_entity) {

13240

+ 		BUG_ON(entity->tree);

13241

+ 		/*

13242

+@@ -771,45 +841,133 @@ static void __bfq_activate_entity(struct bfq_entity *entity)

13243

+ 		 * old start time.

13244

+ 		 */

13245

+ 		bfq_active_extract(st, entity);

13246

+-	} else if (entity->tree == &st->idle) {

13247

+-		/*

13248

+-		 * Must be on the idle tree, bfq_idle_extract() will

13249

+-		 * check for that.

13250

+-		 */

13251

+-		bfq_idle_extract(st, entity);

13252

+-		entity->start = bfq_gt(st->vtime, entity->finish) ?

13253

+-				       st->vtime : entity->finish;

13254

+ 	} else {

13255

+-		/*

13256

+-		 * The finish time of the entity may be invalid, and

13257

+-		 * it is in the past for sure, otherwise the queue

13258

+-		 * would have been on the idle tree.

13259

+-		 */

13260

+-		entity->start = st->vtime;

13261

+-		st->wsum += entity->weight;

13262

+-		bfq_get_entity(entity);

13263

++		unsigned long long min_vstart;

13264

++

13265

++		/* See comments on bfq_bfqq_update_budg_for_activation */

13266

++		if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {

13267

++			backshifted = true;

13268

++			min_vstart = entity->finish;

13269

++		} else

13270

++			min_vstart = st->vtime;

13271

+

13272

+-		BUG_ON(entity->on_st);

13273

+-		entity->on_st = 1;

13274

++		if (entity->tree == &st->idle) {

13275

++			/*

13276

++			 * Must be on the idle tree, bfq_idle_extract() will

13277

++			 * check for that.

13278

++			 */

13279

++			bfq_idle_extract(st, entity);

13280

++			entity->start = bfq_gt(min_vstart, entity->finish) ?

13281

++				min_vstart : entity->finish;

13282

++		} else {

13283

++			/*

13284

++			 * The finish time of the entity may be invalid, and

13285

++			 * it is in the past for sure, otherwise the queue

13286

++			 * would have been on the idle tree.

13287

++			 */

13288

++			entity->start = min_vstart;

13289

++			st->wsum += entity->weight;

13290

++			bfq_get_entity(entity);

13291

++

13292

++			BUG_ON(entity->on_st);

13293

++			entity->on_st = 1;

13294

++		}

13295

+ 	}

13296

+

13297

+ 	st = __bfq_entity_update_weight_prio(st, entity);

13298

+ 	bfq_calc_finish(entity, entity->budget);

13299

++

13300

++	/*

13301

++	 * If some queues enjoy backshifting for a while, then their

13302

++	 * (virtual) finish timestamps may happen to become lower and

13303

++	 * lower than the system virtual time.  In particular, if

13304

++	 * these queues often happen to be idle for short time

13305

++	 * periods, and during such time periods other queues with

13306

++	 * higher timestamps happen to be busy, then the backshifted

13307

++	 * timestamps of the former queues can become much lower than

13308

++	 * the system virtual time. In fact, to serve the queues with

13309

++	 * higher timestamps while the ones with lower timestamps are

13310

++	 * idle, the system virtual time may be pushed-up to much

13311

++	 * higher values than the finish timestamps of the idle

13312

++	 * queues. As a consequence, the finish timestamps of all new

13313

++	 * or newly activated queues may end up being much larger than

13314

++	 * those of lucky queues with backshifted timestamps. The

13315

++	 * latter queues may then monopolize the device for a lot of

13316

++	 * time. This would simply break service guarantees.

13317

++	 *

13318

++	 * To reduce this problem, push up a little bit the

13319

++	 * backshifted timestamps of the queue associated with this

13320

++	 * entity (only a queue can happen to have the backshifted

13321

++	 * flag set): just enough to let the finish timestamp of the

13322

++	 * queue be equal to the current value of the system virtual

13323

++	 * time. This may introduce a little unfairness among queues

13324

++	 * with backshifted timestamps, but it does not break

13325

++	 * worst-case fairness guarantees.

13326

++	 *

13327

++	 * As a special case, if bfqq is weight-raised, push up

13328

++	 * timestamps much less, to keep very low the probability that

13329

++	 * this push up causes the backshifted finish timestamps of

13330

++	 * weight-raised queues to become higher than the backshifted

13331

++	 * finish timestamps of non weight-raised queues.

13332

++	 */

13333

++	if (backshifted && bfq_gt(st->vtime, entity->finish)) {

13334

++		unsigned long delta = st->vtime - entity->finish;

13335

++

13336

++		if (bfqq)

13337

++			delta /= bfqq->wr_coeff;

13338

++

13339

++		entity->start += delta;

13340

++		entity->finish += delta;

13341

++

13342

++		if (bfqq) {

13343

++			bfq_log_bfqq(bfqq->bfqd, bfqq,

13344

++				     "__activate_entity: new queue finish %llu",

13345

++				     ((entity->finish>>10)*1000)>>12);

13346

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13347

++		} else {

13348

++			struct bfq_group *bfqg =

13349

++				container_of(entity, struct bfq_group, entity);

13350

++

13351

++			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,

13352

++				     "__activate_entity: new group finish %llu",

13353

++				     ((entity->finish>>10)*1000)>>12);

13354

++#endif

13355

++		}

13356

++	}

13357

++

13358

+ 	bfq_active_insert(st, entity);

13359

++

13360

++	if (bfqq) {

13361

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

13362

++			"__activate_entity: queue %seligible in st %p",

13363

++			     entity->start <= st->vtime ? "" : "non ", st);

13364

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13365

++	} else {

13366

++		struct bfq_group *bfqg =

13367

++			container_of(entity, struct bfq_group, entity);

13368

++

13369

++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,

13370

++			"__activate_entity: group %seligible in st %p",

13371

++			     entity->start <= st->vtime ? "" : "non ", st);

13372

++#endif

13373

++	}

13374

+ }

13375

+

13376

+ /**

13377

+  * bfq_activate_entity - activate an entity and its ancestors if necessary.

13378

+  * @entity: the entity to activate.

13379

++ * @non_blocking_wait_rq: true if this entity was waiting for a request

13380

+  *

13381

+  * Activate @entity and all the entities on the path from it to the root.

13382

+  */

13383

+-static void bfq_activate_entity(struct bfq_entity *entity)

13384

++static void bfq_activate_entity(struct bfq_entity *entity,

13385

++				bool non_blocking_wait_rq)

13386

+ {

13387

+ 	struct bfq_sched_data *sd;

13388

+

13389

+ 	for_each_entity(entity) {

13390

+-		__bfq_activate_entity(entity);

13391

++		BUG_ON(!entity);

13392

++		__bfq_activate_entity(entity, non_blocking_wait_rq);

13393

+

13394

+ 		sd = entity->sched_data;

13395

+ 		if (!bfq_update_next_in_service(sd))

13396

+@@ -890,23 +1048,24 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

13397

+

13398

+ 		if (!__bfq_deactivate_entity(entity, requeue))

13399

+ 			/*

13400

+-			 * The parent entity is still backlogged, and

13401

+-			 * we don't need to update it as it is still

13402

+-			 * in service.

13403

++			 * next_in_service has not been changed, so

13404

++			 * no upwards update is needed

13405

+ 			 */

13406

+ 			break;

13407

+

13408

+ 		if (sd->next_in_service)

13409

+ 			/*

13410

+-			 * The parent entity is still backlogged and

13411

+-			 * the budgets on the path towards the root

13412

+-			 * need to be updated.

13413

++			 * The parent entity is still backlogged,

13414

++			 * because next_in_service is not NULL, and

13415

++			 * next_in_service has been updated (see

13416

++			 * comment on the body of the above if):

13417

++			 * upwards update of the schedule is needed.

13418

+ 			 */

13419

+ 			goto update;

13420

+

13421

+ 		/*

13422

+-		 * If we reach there the parent is no more backlogged and

13423

+-		 * we want to propagate the dequeue upwards.

13424

++		 * If we get here, then the parent is no longer backlogged and

13425

++		 * we want to propagate the deactivation upwards.

13426

+ 		 */

13427

+ 		requeue = 1;

13428

+ 	}

13429

+@@ -916,9 +1075,23 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

13430

+ update:

13431

+ 	entity = parent;

13432

+ 	for_each_entity(entity) {

13433

+-		__bfq_activate_entity(entity);

13434

++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

13435

++		__bfq_activate_entity(entity, false);

13436

+

13437

+ 		sd = entity->sched_data;

13438

++		if (bfqq)

13439

++			bfq_log_bfqq(bfqq->bfqd, bfqq,

13440

++				     "invoking udpdate_next for this queue");

13441

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13442

++		else {

13443

++			struct bfq_group *bfqg =

13444

++				container_of(entity,

13445

++					     struct bfq_group, entity);

13446

++

13447

++			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,

13448

++				     "invoking udpdate_next for this entity");

13449

++		}

13450

++#endif

13451

+ 		if (!bfq_update_next_in_service(sd))

13452

+ 			break;

13453

+ 	}

13454

+@@ -997,10 +1170,11 @@ left:

13455

+  * Update the virtual time in @st and return the first eligible entity

13456

+  * it contains.

13457

+  */

13458

+-static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,

13459

+-						   bool force)

13460

++static struct bfq_entity *

13461

++__bfq_lookup_next_entity(struct bfq_service_tree *st, bool force)

13462

+ {

13463

+ 	struct bfq_entity *entity, *new_next_in_service = NULL;

13464

++	struct bfq_queue *bfqq;

13465

+

13466

+ 	if (RB_EMPTY_ROOT(&st->active))

13467

+ 		return NULL;

13468

+@@ -1009,6 +1183,24 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,

13469

+ 	entity = bfq_first_active_entity(st);

13470

+ 	BUG_ON(bfq_gt(entity->start, st->vtime));

13471

+

13472

++	bfqq = bfq_entity_to_bfqq(entity);

13473

++	if (bfqq)

13474

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

13475

++			     "__lookup_next: start %llu vtime %llu st %p",

13476

++			     ((entity->start>>10)*1000)>>12,

13477

++			     ((st->vtime>>10)*1000)>>12, st);

13478

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13479

++	else {

13480

++		struct bfq_group *bfqg =

13481

++			container_of(entity, struct bfq_group, entity);

13482

++

13483

++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,

13484

++			     "__lookup_next: start %llu vtime %llu st %p",

13485

++			     ((entity->start>>10)*1000)>>12,

13486

++			     ((st->vtime>>10)*1000)>>12, st);

13487

++	}

13488

++#endif

13489

++

13490

+ 	/*

13491

+ 	 * If the chosen entity does not match with the sched_data's

13492

+ 	 * next_in_service and we are forcedly serving the IDLE priority

13493

+@@ -1045,10 +1237,28 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

13494

+ 	BUG_ON(sd->in_service_entity);

13495

+

13496

+ 	if (bfqd &&

13497

+-	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {

13498

++	    jiffies - bfqd->bfq_class_idle_last_service >

13499

++	    BFQ_CL_IDLE_TIMEOUT) {

13500

+ 		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,

13501

+ 						  true);

13502

+ 		if (entity) {

13503

++			struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

13504

++			if (bfqq)

13505

++				bfq_log_bfqq(bfqd, bfqq,

13506

++					     "idle chosen from st %p %d",

13507

++					     st + BFQ_IOPRIO_CLASSES - 1,

13508

++					BFQ_IOPRIO_CLASSES - 1) ;

13509

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13510

++			else {

13511

++				struct bfq_group *bfqg =

13512

++				container_of(entity, struct bfq_group, entity);

13513

++

13514

++				bfq_log_bfqg(bfqd, bfqg,

13515

++					     "idle chosen from st %p %d",

13516

++					     st + BFQ_IOPRIO_CLASSES - 1,

13517

++					BFQ_IOPRIO_CLASSES - 1) ;

13518

++			}

13519

++#endif

13520

+ 			i = BFQ_IOPRIO_CLASSES - 1;

13521

+ 			bfqd->bfq_class_idle_last_service = jiffies;

13522

+ 			sd->next_in_service = entity;

13523

+@@ -1057,6 +1267,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

13524

+ 	for (; i < BFQ_IOPRIO_CLASSES; i++) {

13525

+ 		entity = __bfq_lookup_next_entity(st + i, false);

13526

+ 		if (entity) {

13527

++			if (bfqd != NULL) {

13528

++			struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

13529

++			if (bfqq)

13530

++				bfq_log_bfqq(bfqd, bfqq,

13531

++					     "chosen from st %p %d",

13532

++					     st + i, i) ;

13533

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13534

++			else {

13535

++				struct bfq_group *bfqg =

13536

++				container_of(entity, struct bfq_group, entity);

13537

++

13538

++				bfq_log_bfqg(bfqd, bfqg,

13539

++					     "chosen from st %p %d",

13540

++					     st + i, i) ;

13541

++			}

13542

++#endif

13543

++			}

13544

++

13545

+ 			if (extract) {

13546

+ 				bfq_check_next_in_service(sd, entity);

13547

+ 				bfq_active_extract(st + i, entity);

13548

+@@ -1070,6 +1298,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

13549

+ 	return entity;

13550

+ }

13551

+

13552

++static bool next_queue_may_preempt(struct bfq_data *bfqd)

13553

++{

13554

++	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;

13555

++

13556

++	return sd->next_in_service != sd->in_service_entity;

13557

++}

13558

++

13559

+ /*

13560

+  * Get next queue for service.

13561

+  */

13562

+@@ -1086,7 +1321,36 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

13563

+

13564

+ 	sd = &bfqd->root_group->sched_data;

13565

+ 	for (; sd ; sd = entity->my_sched_data) {

13566

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13567

++		if (entity) {

13568

++			struct bfq_group *bfqg =

13569

++				container_of(entity, struct bfq_group, entity);

13570

++

13571

++			bfq_log_bfqg(bfqd, bfqg,

13572

++				     "get_next_queue: lookup in this group");

13573

++		} else

13574

++			bfq_log_bfqg(bfqd, bfqd->root_group,

13575

++				     "get_next_queue: lookup in root group");

13576

++#endif

13577

++

13578

+ 		entity = bfq_lookup_next_entity(sd, 1, bfqd);

13579

++

13580

++		bfqq = bfq_entity_to_bfqq(entity);

13581

++		if (bfqq)

13582

++			bfq_log_bfqq(bfqd, bfqq,

13583

++			     "get_next_queue: this queue, finish %llu",

13584

++				(((entity->finish>>10)*1000)>>10)>>2);

13585

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

13586

++		else {

13587

++			struct bfq_group *bfqg =

13588

++				container_of(entity, struct bfq_group, entity);

13589

++

13590

++			bfq_log_bfqg(bfqd, bfqg,

13591

++			     "get_next_queue: this entity, finish %llu",

13592

++				(((entity->finish>>10)*1000)>>10)>>2);

13593

++		}

13594

++#endif

13595

++

13596

+ 		BUG_ON(!entity);

13597

+ 		entity->service = 0;

13598

+ 	}

13599

+@@ -1113,9 +1377,7 @@ static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

13600

+ {

13601

+ 	struct bfq_entity *entity = &bfqq->entity;

13602

+

13603

+-	if (bfqq == bfqd->in_service_queue)

13604

+-		__bfq_bfqd_reset_in_service(bfqd);

13605

+-

13606

++	BUG_ON(bfqq == bfqd->in_service_queue);

13607

+ 	bfq_deactivate_entity(entity, requeue);

13608

+ }

13609

+

13610

+@@ -1123,12 +1385,11 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

13611

+ {

13612

+ 	struct bfq_entity *entity = &bfqq->entity;

13613

+

13614

+-	bfq_activate_entity(entity);

13615

++	bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq));

13616

++	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);

13617

+ }

13618

+

13619

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

13620

+ static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

13621

+-#endif

13622

+

13623

+ /*

13624

+  * Called when the bfqq no longer has requests pending, remove it from

13625

+@@ -1139,6 +1400,7 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,

13626

+ {

13627

+ 	BUG_ON(!bfq_bfqq_busy(bfqq));

13628

+ 	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

13629

++	BUG_ON(bfqq == bfqd->in_service_queue);

13630

+

13631

+ 	bfq_log_bfqq(bfqd, bfqq, "del from busy");

13632

+

13633

+@@ -1147,27 +1409,20 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,

13634

+ 	BUG_ON(bfqd->busy_queues == 0);

13635

+ 	bfqd->busy_queues--;

13636

+

13637

+-	if (!bfqq->dispatched) {

13638

++	if (!bfqq->dispatched)

13639

+ 		bfq_weights_tree_remove(bfqd, &bfqq->entity,

13640

+ 					&bfqd->queue_weights_tree);

13641

+-		if (!blk_queue_nonrot(bfqd->queue)) {

13642

+-			BUG_ON(!bfqd->busy_in_flight_queues);

13643

+-			bfqd->busy_in_flight_queues--;

13644

+-			if (bfq_bfqq_constantly_seeky(bfqq)) {

13645

+-				BUG_ON(!bfqd->

13646

+-					const_seeky_busy_in_flight_queues);

13647

+-				bfqd->const_seeky_busy_in_flight_queues--;

13648

+-			}

13649

+-		}

13650

+-	}

13651

++

13652

+ 	if (bfqq->wr_coeff > 1)

13653

+ 		bfqd->wr_busy_queues--;

13654

+

13655

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

13656

+ 	bfqg_stats_update_dequeue(bfqq_group(bfqq));

13657

+-#endif

13658

+

13659

++	BUG_ON(bfqq->entity.budget < 0);

13660

++

13661

+ 	bfq_deactivate_bfqq(bfqd, bfqq, requeue);

13662

++

13663

++	BUG_ON(bfqq->entity.budget < 0);

13664

+ }

13665

+

13666

+ /*

13667

+@@ -1185,16 +1440,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)

13668

+ 	bfq_mark_bfqq_busy(bfqq);

13669

+ 	bfqd->busy_queues++;

13670

+

13671

+-	if (!bfqq->dispatched) {

13672

++	if (!bfqq->dispatched)

13673

+ 		if (bfqq->wr_coeff == 1)

13674

+ 			bfq_weights_tree_add(bfqd, &bfqq->entity,

13675

+ 					     &bfqd->queue_weights_tree);

13676

+-		if (!blk_queue_nonrot(bfqd->queue)) {

13677

+-			bfqd->busy_in_flight_queues++;

13678

+-			if (bfq_bfqq_constantly_seeky(bfqq))

13679

+-				bfqd->const_seeky_busy_in_flight_queues++;

13680

+-		}

13681

+-	}

13682

++

13683

+ 	if (bfqq->wr_coeff > 1)

13684

+ 		bfqd->wr_busy_queues++;

13685

+ }

13686

+diff --git a/block/bfq.h b/block/bfq.h

13687

+index f73c942..b8ad02a 100644

13688

+--- a/block/bfq.h

13689

++++ b/block/bfq.h

13690

+@@ -1,5 +1,5 @@

13691

+ /*

13692

+- * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes.

13693

++ * BFQ-v8 for 4.7.0: data structures and common functions prototypes.

13694

+  *

13695

+  * Based on ideas and code from CFQ:

13696

+  * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

13697

+@@ -28,7 +28,7 @@

13698

+

13699

+ #define BFQ_DEFAULT_QUEUE_IOPRIO	4

13700

+

13701

+-#define BFQ_DEFAULT_GRP_WEIGHT	10

13702

++#define BFQ_WEIGHT_LEGACY_DFL	100

13703

+ #define BFQ_DEFAULT_GRP_IOPRIO	0

13704

+ #define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

13705

+

13706

+@@ -36,12 +36,6 @@ struct bfq_entity;

13707

+

13708

+ /**

13709

+  * struct bfq_service_tree - per ioprio_class service tree.

13710

+- * @active: tree for active entities (i.e., those backlogged).

13711

+- * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).

13712

+- * @first_idle: idle entity with minimum F_i.

13713

+- * @last_idle: idle entity with maximum F_i.

13714

+- * @vtime: scheduler virtual time.

13715

+- * @wsum: scheduler weight sum; active and idle entities contribute to it.

13716

+  *

13717

+  * Each service tree represents a B-WF2Q+ scheduler on its own.  Each

13718

+  * ioprio_class has its own independent scheduler, and so its own

13719

+@@ -49,27 +43,28 @@ struct bfq_entity;

13720

+  * of the containing bfqd.

13721

+  */

13722

+ struct bfq_service_tree {

13723

++	/* tree for active entities (i.e., those backlogged) */

13724

+ 	struct rb_root active;

13725

++	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */

13726

+ 	struct rb_root idle;

13727

+

13728

+-	struct bfq_entity *first_idle;

13729

+-	struct bfq_entity *last_idle;

13730

++	struct bfq_entity *first_idle; 	/* idle entity with minimum F_i */

13731

++	struct bfq_entity *last_idle; 	/* idle entity with maximum F_i */

13732

+

13733

+-	u64 vtime;

13734

++	u64 vtime; /* scheduler virtual time */

13735

++	/* scheduler weight sum; active and idle entities contribute to it */

13736

+ 	unsigned long wsum;

13737

+ };

13738

+

13739

+ /**

13740

+  * struct bfq_sched_data - multi-class scheduler.

13741

+- * @in_service_entity: entity in service.

13742

+- * @next_in_service: head-of-the-line entity in the scheduler.

13743

+- * @service_tree: array of service trees, one per ioprio_class.

13744

+  *

13745

+  * bfq_sched_data is the basic scheduler queue.  It supports three

13746

+- * ioprio_classes, and can be used either as a toplevel queue or as

13747

+- * an intermediate queue on a hierarchical setup.

13748

+- * @next_in_service points to the active entity of the sched_data

13749

+- * service trees that will be scheduled next.

13750

++ * ioprio_classes, and can be used either as a toplevel queue or as an

13751

++ * intermediate queue on a hierarchical setup.  @next_in_service

13752

++ * points to the active entity of the sched_data service trees that

13753

++ * will be scheduled next. It is used to reduce the number of steps

13754

++ * needed for each hierarchical-schedule update.

13755

+  *

13756

+  * The supported ioprio_classes are the same as in CFQ, in descending

13757

+  * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.

13758

+@@ -79,48 +74,29 @@ struct bfq_service_tree {

13759

+  * All the fields are protected by the queue lock of the containing bfqd.

13760

+  */

13761

+ struct bfq_sched_data {

13762

+-	struct bfq_entity *in_service_entity;

13763

++	struct bfq_entity *in_service_entity;  /* entity in service */

13764

++	/* head-of-the-line entity in the scheduler (see comments above) */

13765

+ 	struct bfq_entity *next_in_service;

13766

++	/* array of service trees, one per ioprio_class */

13767

+ 	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];

13768

+ };

13769

+

13770

+ /**

13771

+  * struct bfq_weight_counter - counter of the number of all active entities

13772

+  *                             with a given weight.

13773

+- * @weight: weight of the entities that this counter refers to.

13774

+- * @num_active: number of active entities with this weight.

13775

+- * @weights_node: weights tree member (see bfq_data's @queue_weights_tree

13776

+- *                and @group_weights_tree).

13777

+  */

13778

+ struct bfq_weight_counter {

13779

+-	short int weight;

13780

+-	unsigned int num_active;

13781

++	short int weight; /* weight of the entities this counter refers to */

13782

++	unsigned int num_active; /* nr of active entities with this weight */

13783

++	/*

13784

++	 * Weights tree member (see bfq_data's @queue_weights_tree and

13785

++	 * @group_weights_tree)

13786

++	 */

13787

+ 	struct rb_node weights_node;

13788

+ };

13789

+

13790

+ /**

13791

+  * struct bfq_entity - schedulable entity.

13792

+- * @rb_node: service_tree member.

13793

+- * @weight_counter: pointer to the weight counter associated with this entity.

13794

+- * @on_st: flag, true if the entity is on a tree (either the active or

13795

+- *         the idle one of its service_tree).

13796

+- * @finish: B-WF2Q+ finish timestamp (aka F_i).

13797

+- * @start: B-WF2Q+ start timestamp (aka S_i).

13798

+- * @tree: tree the entity is enqueued into; %NULL if not on a tree.

13799

+- * @min_start: minimum start time of the (active) subtree rooted at

13800

+- *             this entity; used for O(log N) lookups into active trees.

13801

+- * @service: service received during the last round of service.

13802

+- * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.

13803

+- * @weight: weight of the queue

13804

+- * @parent: parent entity, for hierarchical scheduling.

13805

+- * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the

13806

+- *                 associated scheduler queue, %NULL on leaf nodes.

13807

+- * @sched_data: the scheduler queue this entity belongs to.

13808

+- * @ioprio: the ioprio in use.

13809

+- * @new_weight: when a weight change is requested, the new weight value.

13810

+- * @orig_weight: original weight, used to implement weight boosting

13811

+- * @prio_changed: flag, true when the user requested a weight, ioprio or

13812

+- *		  ioprio_class change.

13813

+  *

13814

+  * A bfq_entity is used to represent either a bfq_queue (leaf node in the

13815

+  * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each

13816

+@@ -147,27 +123,52 @@ struct bfq_weight_counter {

13817

+  * containing bfqd.

13818

+  */

13819

+ struct bfq_entity {

13820

+-	struct rb_node rb_node;

13821

++	struct rb_node rb_node; /* service_tree member */

13822

++	/* pointer to the weight counter associated with this entity */

13823

+ 	struct bfq_weight_counter *weight_counter;

13824

+

13825

++	/*

13826

++	 * flag, true if the entity is on a tree (either the active or

13827

++	 * the idle one of its service_tree).

13828

++	 */

13829

+ 	int on_st;

13830

+

13831

+-	u64 finish;

13832

+-	u64 start;

13833

++	u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */

13834

++	u64 start;  /* B-WF2Q+ start timestamp (aka S_i) */

13835

+

13836

++	/* tree the entity is enqueued into; %NULL if not on a tree */

13837

+ 	struct rb_root *tree;

13838

+

13839

++	/*

13840

++	 * minimum start time of the (active) subtree rooted at this

13841

++	 * entity; used for O(log N) lookups into active trees

13842

++	 */

13843

+ 	u64 min_start;

13844

+

13845

+-	int service, budget;

13846

+-	unsigned short weight, new_weight;

13847

++	/* amount of service received during the last service slot */

13848

++	int service;

13849

++

13850

++	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */

13851

++	int budget;

13852

++

13853

++	unsigned short weight; 	/* weight of the queue */

13854

++	unsigned short new_weight; /* next weight if a change is in progress */

13855

++

13856

++	/* original weight, used to implement weight boosting */

13857

+ 	unsigned short orig_weight;

13858

+

13859

++	/* parent entity, for hierarchical scheduling */

13860

+ 	struct bfq_entity *parent;

13861

+

13862

++	/*

13863

++	 * For non-leaf nodes in the hierarchy, the associated

13864

++	 * scheduler queue, %NULL on leaf nodes.

13865

++	 */

13866

+ 	struct bfq_sched_data *my_sched_data;

13867

++	/* the scheduler queue this entity belongs to */

13868

+ 	struct bfq_sched_data *sched_data;

13869

+

13870

++	/* flag, set to request a weight, ioprio or ioprio_class change  */

13871

+ 	int prio_changed;

13872

+ };

13873

+

13874

+@@ -175,56 +176,6 @@ struct bfq_group;

13875

+

13876

+ /**

13877

+  * struct bfq_queue - leaf schedulable entity.

13878

+- * @ref: reference counter.

13879

+- * @bfqd: parent bfq_data.

13880

+- * @new_ioprio: when an ioprio change is requested, the new ioprio value.

13881

+- * @ioprio_class: the ioprio_class in use.

13882

+- * @new_ioprio_class: when an ioprio_class change is requested, the new

13883

+- *                    ioprio_class value.

13884

+- * @new_bfqq: shared bfq_queue if queue is cooperating with

13885

+- *           one or more other queues.

13886

+- * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree).

13887

+- * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree).

13888

+- * @sort_list: sorted list of pending requests.

13889

+- * @next_rq: if fifo isn't expired, next request to serve.

13890

+- * @queued: nr of requests queued in @sort_list.

13891

+- * @allocated: currently allocated requests.

13892

+- * @meta_pending: pending metadata requests.

13893

+- * @fifo: fifo list of requests in sort_list.

13894

+- * @entity: entity representing this queue in the scheduler.

13895

+- * @max_budget: maximum budget allowed from the feedback mechanism.

13896

+- * @budget_timeout: budget expiration (in jiffies).

13897

+- * @dispatched: number of requests on the dispatch list or inside driver.

13898

+- * @flags: status flags.

13899

+- * @bfqq_list: node for active/idle bfqq list inside our bfqd.

13900

+- * @burst_list_node: node for the device's burst list.

13901

+- * @seek_samples: number of seeks sampled

13902

+- * @seek_total: sum of the distances of the seeks sampled

13903

+- * @seek_mean: mean seek distance

13904

+- * @last_request_pos: position of the last request enqueued

13905

+- * @requests_within_timer: number of consecutive pairs of request completion

13906

+- *                         and arrival, such that the queue becomes idle

13907

+- *                         after the completion, but the next request arrives

13908

+- *                         within an idle time slice; used only if the queue's

13909

+- *                         IO_bound has been cleared.

13910

+- * @pid: pid of the process owning the queue, used for logging purposes.

13911

+- * @last_wr_start_finish: start time of the current weight-raising period if

13912

+- *                        the @bfq-queue is being weight-raised, otherwise

13913

+- *                        finish time of the last weight-raising period

13914

+- * @wr_cur_max_time: current max raising time for this queue

13915

+- * @soft_rt_next_start: minimum time instant such that, only if a new

13916

+- *                      request is enqueued after this time instant in an

13917

+- *                      idle @bfq_queue with no outstanding requests, then

13918

+- *                      the task associated with the queue it is deemed as

13919

+- *                      soft real-time (see the comments to the function

13920

+- *                      bfq_bfqq_softrt_next_start())

13921

+- * @last_idle_bklogged: time of the last transition of the @bfq_queue from

13922

+- *                      idle to backlogged

13923

+- * @service_from_backlogged: cumulative service received from the @bfq_queue

13924

+- *                           since the last transition from idle to

13925

+- *                           backlogged

13926

+- * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the

13927

+- *	 queue is shared

13928

+  *

13929

+  * A bfq_queue is a leaf request queue; it can be associated with an

13930

+  * io_context or more, if it  is  async or shared  between  cooperating

13931

+@@ -235,117 +186,163 @@ struct bfq_group;

13932

+  * All the fields are protected by the queue lock of the containing bfqd.

13933

+  */

13934

+ struct bfq_queue {

13935

+-	atomic_t ref;

13936

++	/* reference counter */

13937

++	int ref;

13938

++	/* parent bfq_data */

13939

+ 	struct bfq_data *bfqd;

13940

+

13941

+-	unsigned short ioprio, new_ioprio;

13942

+-	unsigned short ioprio_class, new_ioprio_class;

13943

++	/* current ioprio and ioprio class */

13944

++	unsigned short ioprio, ioprio_class;

13945

++	/* next ioprio and ioprio class if a change is in progress */

13946

++	unsigned short new_ioprio, new_ioprio_class;

13947

+

13948

+-	/* fields for cooperating queues handling */

13949

++	/*

13950

++	 * Shared bfq_queue if queue is cooperating with one or more

13951

++	 * other queues.

13952

++	 */

13953

+ 	struct bfq_queue *new_bfqq;

13954

++	/* request-position tree member (see bfq_group's @rq_pos_tree) */

13955

+ 	struct rb_node pos_node;

13956

++	/* request-position tree root (see bfq_group's @rq_pos_tree) */

13957

+ 	struct rb_root *pos_root;

13958

+

13959

++	/* sorted list of pending requests */

13960

+ 	struct rb_root sort_list;

13961

++	/* if fifo isn't expired, next request to serve */

13962

+ 	struct request *next_rq;

13963

++	/* number of sync and async requests queued */

13964

+ 	int queued[2];

13965

++	/* number of sync and async requests currently allocated */

13966

+ 	int allocated[2];

13967

++	/* number of pending metadata requests */

13968

+ 	int meta_pending;

13969

++	/* fifo list of requests in sort_list */

13970

+ 	struct list_head fifo;

13971

+

13972

++	/* entity representing this queue in the scheduler */

13973

+ 	struct bfq_entity entity;

13974

+

13975

++	/* maximum budget allowed from the feedback mechanism */

13976

+ 	int max_budget;

13977

++	/* budget expiration (in jiffies) */

13978

+ 	unsigned long budget_timeout;

13979

+

13980

++	/* number of requests on the dispatch list or inside driver */

13981

+ 	int dispatched;

13982

+

13983

+-	unsigned int flags;

13984

++	unsigned int flags; /* status flags */

13985

+

13986

++	/* node for active/idle bfqq list inside parent bfqd */

13987

+ 	struct list_head bfqq_list;

13988

+

13989

++	/* bit vector: a 1 for each seeky request in history */

13990

++	u32 seek_history;

13991

++

13992

++	/* node for the device's burst list */

13993

+ 	struct hlist_node burst_list_node;

13994

+

13995

+-	unsigned int seek_samples;

13996

+-	u64 seek_total;

13997

+-	sector_t seek_mean;

13998

++	/* position of the last request enqueued */

13999

+ 	sector_t last_request_pos;

14000

+

14001

++	/* Number of consecutive pairs of request completion and

14002

++	 * arrival, such that the queue becomes idle after the

14003

++	 * completion, but the next request arrives within an idle

14004

++	 * time slice; used only if the queue's IO_bound flag has been

14005

++	 * cleared.

14006

++	 */

14007

+ 	unsigned int requests_within_timer;

14008

+

14009

++	/* pid of the process owning the queue, used for logging purposes */

14010

+ 	pid_t pid;

14011

++

14012

++	/*

14013

++	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL

14014

++	 * if the queue is shared.

14015

++	 */

14016

+ 	struct bfq_io_cq *bic;

14017

+

14018

+-	/* weight-raising fields */

14019

++	/* current maximum weight-raising time for this queue */

14020

+ 	unsigned long wr_cur_max_time;

14021

++	/*

14022

++	 * Minimum time instant such that, only if a new request is

14023

++	 * enqueued after this time instant in an idle @bfq_queue with

14024

++	 * no outstanding requests, then the task associated with the

14025

++	 * queue is deemed as soft real-time (see the comments on

14026

++	 * the function bfq_bfqq_softrt_next_start())

14027

++	 */

14028

+ 	unsigned long soft_rt_next_start;

14029

++	/*

14030

++	 * Start time of the current weight-raising period if

14031

++	 * the @bfq_queue is being weight-raised, otherwise

14032

++	 * finish time of the last weight-raising period.

14033

++	 */

14034

+ 	unsigned long last_wr_start_finish;

14035

++	/* factor by which the weight of this queue is multiplied */

14036

+ 	unsigned int wr_coeff;

14037

++	/*

14038

++	 * Time of the last transition of the @bfq_queue from idle to

14039

++	 * backlogged.

14040

++	 */

14041

+ 	unsigned long last_idle_bklogged;

14042

++	/*

14043

++	 * Cumulative service received from the @bfq_queue since the

14044

++	 * last transition from idle to backlogged.

14045

++	 */

14046

+ 	unsigned long service_from_backlogged;

14047

++

14048

++	unsigned long split_time; /* time of last split */

14049

+ };

14050

+

14051

+ /**

14052

+  * struct bfq_ttime - per process thinktime stats.

14053

+- * @ttime_total: total process thinktime

14054

+- * @ttime_samples: number of thinktime samples

14055

+- * @ttime_mean: average process thinktime

14056

+  */

14057

+ struct bfq_ttime {

14058

+-	unsigned long last_end_request;

14059

++	unsigned long last_end_request; /* completion time of last request */

14060

++

14061

++	unsigned long ttime_total; /* total process thinktime */

14062

++	unsigned long ttime_samples; /* number of thinktime samples */

14063

++	unsigned long ttime_mean; /* average process thinktime */

14064

+

14065

+-	unsigned long ttime_total;

14066

+-	unsigned long ttime_samples;

14067

+-	unsigned long ttime_mean;

14068

+ };

14069

+

14070

+ /**

14071

+  * struct bfq_io_cq - per (request_queue, io_context) structure.

14072

+- * @icq: associated io_cq structure

14073

+- * @bfqq: array of two process queues, the sync and the async

14074

+- * @ttime: associated @bfq_ttime struct

14075

+- * @ioprio: per (request_queue, blkcg) ioprio.

14076

+- * @blkcg_id: id of the blkcg the related io_cq belongs to.

14077

+- * @wr_time_left: snapshot of the time left before weight raising ends

14078

+- *                for the sync queue associated to this process; this

14079

+- *		  snapshot is taken to remember this value while the weight

14080

+- *		  raising is suspended because the queue is merged with a

14081

+- *		  shared queue, and is used to set @raising_cur_max_time

14082

+- *		  when the queue is split from the shared queue and its

14083

+- *		  weight is raised again

14084

+- * @saved_idle_window: same purpose as the previous field for the idle

14085

+- *                     window

14086

+- * @saved_IO_bound: same purpose as the previous two fields for the I/O

14087

+- *                  bound classification of a queue

14088

+- * @saved_in_large_burst: same purpose as the previous fields for the

14089

+- *                        value of the field keeping the queue's belonging

14090

+- *                        to a large burst

14091

+- * @was_in_burst_list: true if the queue belonged to a burst list

14092

+- *                     before its merge with another cooperating queue

14093

+- * @cooperations: counter of consecutive successful queue merges underwent

14094

+- *                by any of the process' @bfq_queues

14095

+- * @failed_cooperations: counter of consecutive failed queue merges of any

14096

+- *                       of the process' @bfq_queues

14097

+  */

14098

+ struct bfq_io_cq {

14099

++	/* associated io_cq structure */

14100

+ 	struct io_cq icq; /* must be the first member */

14101

++	/* array of two process queues, the sync and the async */

14102

+ 	struct bfq_queue *bfqq[2];

14103

++	/* associated @bfq_ttime struct */

14104

+ 	struct bfq_ttime ttime;

14105

++	/* per (request_queue, blkcg) ioprio */

14106

+ 	int ioprio;

14107

+-

14108

+ #ifdef CONFIG_BFQ_GROUP_IOSCHED

14109

+-	uint64_t blkcg_id; /* the current blkcg ID */

14110

++	uint64_t blkcg_serial_nr; /* the current blkcg serial */

14111

+ #endif

14112

+

14113

+-	unsigned int wr_time_left;

14114

++	/*

14115

++	 * Snapshot of the idle window before merging; taken to

14116

++	 * remember this value while the queue is merged, so as to be

14117

++	 * able to restore it in case of split.

14118

++	 */

14119

+ 	bool saved_idle_window;

14120

++	/*

14121

++	 * Same purpose as the previous field for the I/O bound

14122

++	 * classification of a queue.

14123

++	 */

14124

+ 	bool saved_IO_bound;

14125

+

14126

++	/*

14127

++	 * Same purpose as the previous fields for the value of the

14128

++	 * field recording whether the queue belongs to a large burst

14129

++	 */

14130

+ 	bool saved_in_large_burst;

14131

++	/*

14132

++	 * True if the queue belonged to a burst list before its merge

14133

++	 * with another cooperating queue.

14134

++	 */

14135

+ 	bool was_in_burst_list;

14136

+-

14137

+-	unsigned int cooperations;

14138

+-	unsigned int failed_cooperations;

14139

+ };

14140

+

14141

+ enum bfq_device_speed {

14142

+@@ -354,224 +351,216 @@ enum bfq_device_speed {

14143

+ };

14144

+

14145

+ /**

14146

+- * struct bfq_data - per device data structure.

14147

+- * @queue: request queue for the managed device.

14148

+- * @root_group: root bfq_group for the device.

14149

+- * @active_numerous_groups: number of bfq_groups containing more than one

14150

+- *                          active @bfq_entity.

14151

+- * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by

14152

+- *                      weight. Used to keep track of whether all @bfq_queues

14153

+- *                     have the same weight. The tree contains one counter

14154

+- *                     for each distinct weight associated to some active

14155

+- *                     and not weight-raised @bfq_queue (see the comments to

14156

+- *                      the functions bfq_weights_tree_[add|remove] for

14157

+- *                     further details).

14158

+- * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted

14159

+- *                      by weight. Used to keep track of whether all

14160

+- *                     @bfq_groups have the same weight. The tree contains

14161

+- *                     one counter for each distinct weight associated to

14162

+- *                     some active @bfq_group (see the comments to the

14163

+- *                     functions bfq_weights_tree_[add|remove] for further

14164

+- *                     details).

14165

+- * @busy_queues: number of bfq_queues containing requests (including the

14166

+- *		 queue in service, even if it is idling).

14167

+- * @busy_in_flight_queues: number of @bfq_queues containing pending or

14168

+- *                         in-flight requests, plus the @bfq_queue in

14169

+- *                         service, even if idle but waiting for the

14170

+- *                         possible arrival of its next sync request. This

14171

+- *                         field is updated only if the device is rotational,

14172

+- *                         but used only if the device is also NCQ-capable.

14173

+- *                         The reason why the field is updated also for non-

14174

+- *                         NCQ-capable rotational devices is related to the

14175

+- *                         fact that the value of @hw_tag may be set also

14176

+- *                         later than when busy_in_flight_queues may need to

14177

+- *                         be incremented for the first time(s). Taking also

14178

+- *                         this possibility into account, to avoid unbalanced

14179

+- *                         increments/decrements, would imply more overhead

14180

+- *                         than just updating busy_in_flight_queues

14181

+- *                         regardless of the value of @hw_tag.

14182

+- * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues

14183

+- *                                     (that is, seeky queues that expired

14184

+- *                                     for budget timeout at least once)

14185

+- *                                     containing pending or in-flight

14186

+- *                                     requests, including the in-service

14187

+- *                                     @bfq_queue if constantly seeky. This

14188

+- *                                     field is updated only if the device

14189

+- *                                     is rotational, but used only if the

14190

+- *                                     device is also NCQ-capable (see the

14191

+- *                                     comments to @busy_in_flight_queues).

14192

+- * @wr_busy_queues: number of weight-raised busy @bfq_queues.

14193

+- * @queued: number of queued requests.

14194

+- * @rq_in_driver: number of requests dispatched and waiting for completion.

14195

+- * @sync_flight: number of sync requests in the driver.

14196

+- * @max_rq_in_driver: max number of reqs in driver in the last

14197

+- *                    @hw_tag_samples completed requests.

14198

+- * @hw_tag_samples: nr of samples used to calculate hw_tag.

14199

+- * @hw_tag: flag set to one if the driver is showing a queueing behavior.

14200

+- * @budgets_assigned: number of budgets assigned.

14201

+- * @idle_slice_timer: timer set when idling for the next sequential request

14202

+- *                    from the queue in service.

14203

+- * @unplug_work: delayed work to restart dispatching on the request queue.

14204

+- * @in_service_queue: bfq_queue in service.

14205

+- * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.

14206

+- * @last_position: on-disk position of the last served request.

14207

+- * @last_budget_start: beginning of the last budget.

14208

+- * @last_idling_start: beginning of the last idle slice.

14209

+- * @peak_rate: peak transfer rate observed for a budget.

14210

+- * @peak_rate_samples: number of samples used to calculate @peak_rate.

14211

+- * @bfq_max_budget: maximum budget allotted to a bfq_queue before

14212

+- *                  rescheduling.

14213

+- * @active_list: list of all the bfq_queues active on the device.

14214

+- * @idle_list: list of all the bfq_queues idle on the device.

14215

+- * @bfq_fifo_expire: timeout for async/sync requests; when it expires

14216

+- *                   requests are served in fifo order.

14217

+- * @bfq_back_penalty: weight of backward seeks wrt forward ones.

14218

+- * @bfq_back_max: maximum allowed backward seek.

14219

+- * @bfq_slice_idle: maximum idling time.

14220

+- * @bfq_user_max_budget: user-configured max budget value

14221

+- *                       (0 for auto-tuning).

14222

+- * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to

14223

+- *                           async queues.

14224

+- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to

14225

+- *               to prevent seeky queues to impose long latencies to well

14226

+- *               behaved ones (this also implies that seeky queues cannot

14227

+- *               receive guarantees in the service domain; after a timeout

14228

+- *               they are charged for the whole allocated budget, to try

14229

+- *               to preserve a behavior reasonably fair among them, but

14230

+- *               without service-domain guarantees).

14231

+- * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is

14232

+- *                   no more granted any weight-raising.

14233

+- * @bfq_failed_cooperations: number of consecutive failed cooperation

14234

+- *                           chances after which weight-raising is restored

14235

+- *                           to a queue subject to more than bfq_coop_thresh

14236

+- *                           queue merges.

14237

+- * @bfq_requests_within_timer: number of consecutive requests that must be

14238

+- *                             issued within the idle time slice to set

14239

+- *                             again idling to a queue which was marked as

14240

+- *                             non-I/O-bound (see the definition of the

14241

+- *                             IO_bound flag for further details).

14242

+- * @last_ins_in_burst: last time at which a queue entered the current

14243

+- *                     burst of queues being activated shortly after

14244

+- *                     each other; for more details about this and the

14245

+- *                     following parameters related to a burst of

14246

+- *                     activations, see the comments to the function

14247

+- *                     @bfq_handle_burst.

14248

+- * @bfq_burst_interval: reference time interval used to decide whether a

14249

+- *                      queue has been activated shortly after

14250

+- *                      @last_ins_in_burst.

14251

+- * @burst_size: number of queues in the current burst of queue activations.

14252

+- * @bfq_large_burst_thresh: maximum burst size above which the current

14253

+- * 			    queue-activation burst is deemed as 'large'.

14254

+- * @large_burst: true if a large queue-activation burst is in progress.

14255

+- * @burst_list: head of the burst list (as for the above fields, more details

14256

+- * 		in the comments to the function bfq_handle_burst).

14257

+- * @low_latency: if set to true, low-latency heuristics are enabled.

14258

+- * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised

14259

+- *                queue is multiplied.

14260

+- * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).

14261

+- * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.

14262

+- * @bfq_wr_min_idle_time: minimum idle period after which weight-raising

14263

+- *			  may be reactivated for a queue (in jiffies).

14264

+- * @bfq_wr_min_inter_arr_async: minimum period between request arrivals

14265

+- *				after which weight-raising may be

14266

+- *				reactivated for an already busy queue

14267

+- *				(in jiffies).

14268

+- * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,

14269

+- *			    sectors per seconds.

14270

+- * @RT_prod: cached value of the product R*T used for computing the maximum

14271

+- *	     duration of the weight raising automatically.

14272

+- * @device_speed: device-speed class for the low-latency heuristic.

14273

+- * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.

14274

++ * struct bfq_data - per-device data structure.

14275

+  *

14276

+  * All the fields are protected by the @queue lock.

14277

+  */

14278

+ struct bfq_data {

14279

++	/* request queue for the device */

14280

+ 	struct request_queue *queue;

14281

+

14282

++	/* root bfq_group for the device */

14283

+ 	struct bfq_group *root_group;

14284

+

14285

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

14286

+-	int active_numerous_groups;

14287

+-#endif

14288

+-

14289

++	/*

14290

++	 * rbtree of weight counters of @bfq_queues, sorted by

14291

++	 * weight. Used to keep track of whether all @bfq_queues have

14292

++	 * the same weight. The tree contains one counter for each

14293

++	 * distinct weight associated to some active and not

14294

++	 * weight-raised @bfq_queue (see the comments to the functions

14295

++	 * bfq_weights_tree_[add|remove] for further details).

14296

++	 */

14297

+ 	struct rb_root queue_weights_tree;

14298

++	/*

14299

++	 * rbtree of non-queue @bfq_entity weight counters, sorted by

14300

++	 * weight. Used to keep track of whether all @bfq_groups have

14301

++	 * the same weight. The tree contains one counter for each

14302

++	 * distinct weight associated to some active @bfq_group (see

14303

++	 * the comments to the functions bfq_weights_tree_[add|remove]

14304

++	 * for further details).

14305

++	 */

14306

+ 	struct rb_root group_weights_tree;

14307

+

14308

++	/*

14309

++	 * Number of bfq_queues containing requests (including the

14310

++	 * queue in service, even if it is idling).

14311

++	 */

14312

+ 	int busy_queues;

14313

+-	int busy_in_flight_queues;

14314

+-	int const_seeky_busy_in_flight_queues;

14315

++	/* number of weight-raised busy @bfq_queues */

14316

+ 	int wr_busy_queues;

14317

++	/* number of queued requests */

14318

+ 	int queued;

14319

++	/* number of requests dispatched and waiting for completion */

14320

+ 	int rq_in_driver;

14321

+-	int sync_flight;

14322

+

14323

++	/*

14324

++	 * Maximum number of requests in driver in the last

14325

++	 * @hw_tag_samples completed requests.

14326

++	 */

14327

+ 	int max_rq_in_driver;

14328

++	/* number of samples used to calculate hw_tag */

14329

+ 	int hw_tag_samples;

14330

++	/* flag set to one if the driver is showing a queueing behavior */

14331

+ 	int hw_tag;

14332

+

14333

++	/* number of budgets assigned */

14334

+ 	int budgets_assigned;

14335

+

14336

++	/*

14337

++	 * Timer set when idling (waiting) for the next request from

14338

++	 * the queue in service.

14339

++	 */

14340

+ 	struct timer_list idle_slice_timer;

14341

++	/* delayed work to restart dispatching on the request queue */

14342

+ 	struct work_struct unplug_work;

14343

+

14344

++	/* bfq_queue in service */

14345

+ 	struct bfq_queue *in_service_queue;

14346

++	/* bfq_io_cq (bic) associated with the @in_service_queue */

14347

+ 	struct bfq_io_cq *in_service_bic;

14348

+

14349

++	/* on-disk position of the last served request */

14350

+ 	sector_t last_position;

14351

+

14352

++	/* beginning of the last budget */

14353

+ 	ktime_t last_budget_start;

14354

++	/* beginning of the last idle slice */

14355

+ 	ktime_t last_idling_start;

14356

++	/* number of samples used to calculate @peak_rate */

14357

+ 	int peak_rate_samples;

14358

++	/* peak transfer rate observed for a budget */

14359

+ 	u64 peak_rate;

14360

++	/* maximum budget allotted to a bfq_queue before rescheduling */

14361

+ 	int bfq_max_budget;

14362

+

14363

++	/* list of all the bfq_queues active on the device */

14364

+ 	struct list_head active_list;

14365

++	/* list of all the bfq_queues idle on the device */

14366

+ 	struct list_head idle_list;

14367

+

14368

++	/*

14369

++	 * Timeout for async/sync requests; when it fires, requests

14370

++	 * are served in fifo order.

14371

++	 */

14372

+ 	unsigned int bfq_fifo_expire[2];

14373

++	/* weight of backward seeks wrt forward ones */

14374

+ 	unsigned int bfq_back_penalty;

14375

++	/* maximum allowed backward seek */

14376

+ 	unsigned int bfq_back_max;

14377

++	/* maximum idling time */

14378

+ 	unsigned int bfq_slice_idle;

14379

++	/* last time CLASS_IDLE was served */

14380

+ 	u64 bfq_class_idle_last_service;

14381

+

14382

++	/* user-configured max budget value (0 for auto-tuning) */

14383

+ 	int bfq_user_max_budget;

14384

+-	int bfq_max_budget_async_rq;

14385

+-	unsigned int bfq_timeout[2];

14386

+-

14387

+-	unsigned int bfq_coop_thresh;

14388

+-	unsigned int bfq_failed_cooperations;

14389

++	/*

14390

++	 * Timeout for bfq_queues to consume their budget; used to

14391

++	 * prevent seeky queues from imposing long latencies to

14392

++	 * sequential or quasi-sequential ones (this also implies that

14393

++	 * seeky queues cannot receive guarantees in the service

14394

++	 * domain; after a timeout they are charged for the time they

14395

++	 * have been in service, to preserve fairness among them, but

14396

++	 * without service-domain guarantees).

14397

++	 */

14398

++	unsigned int bfq_timeout;

14399

++

14400

++	/*

14401

++	 * Number of consecutive requests that must be issued within

14402

++	 * the idle time slice to set again idling to a queue which

14403

++	 * was marked as non-I/O-bound (see the definition of the

14404

++	 * IO_bound flag for further details).

14405

++	 */

14406

+ 	unsigned int bfq_requests_within_timer;

14407

+

14408

++	/*

14409

++	 * Force device idling whenever needed to provide accurate

14410

++	 * service guarantees, without caring about throughput

14411

++	 * issues. CAVEAT: this may even increase latencies, in case

14412

++	 * of useless idling for processes that did stop doing I/O.

14413

++	 */

14414

++	bool strict_guarantees;

14415

++

14416

++	/*

14417

++	 * Last time at which a queue entered the current burst of

14418

++	 * queues being activated shortly after each other; for more

14419

++	 * details about this and the following parameters related to

14420

++	 * a burst of activations, see the comments on the function

14421

++	 * bfq_handle_burst.

14422

++	 */

14423

+ 	unsigned long last_ins_in_burst;

14424

++	/*

14425

++	 * Reference time interval used to decide whether a queue has

14426

++	 * been activated shortly after @last_ins_in_burst.

14427

++	 */

14428

+ 	unsigned long bfq_burst_interval;

14429

++	/* number of queues in the current burst of queue activations */

14430

+ 	int burst_size;

14431

++

14432

++	/* common parent entity for the queues in the burst */

14433

++	struct bfq_entity *burst_parent_entity;

14434

++	/* Maximum burst size above which the current queue-activation

14435

++	 * burst is deemed as 'large'.

14436

++	 */

14437

+ 	unsigned long bfq_large_burst_thresh;

14438

++	/* true if a large queue-activation burst is in progress */

14439

+ 	bool large_burst;

14440

++	/*

14441

++	 * Head of the burst list (as for the above fields, more

14442

++	 * details in the comments on the function bfq_handle_burst).

14443

++	 */

14444

+ 	struct hlist_head burst_list;

14445

+

14446

++	/* if set to true, low-latency heuristics are enabled */

14447

+ 	bool low_latency;

14448

+-

14449

+-	/* parameters of the low_latency heuristics */

14450

++	/*

14451

++	 * Maximum factor by which the weight of a weight-raised queue

14452

++	 * is multiplied.

14453

++	 */

14454

+ 	unsigned int bfq_wr_coeff;

14455

++	/* maximum duration of a weight-raising period (jiffies) */

14456

+ 	unsigned int bfq_wr_max_time;

14457

++

14458

++	/* Maximum weight-raising duration for soft real-time processes */

14459

+ 	unsigned int bfq_wr_rt_max_time;

14460

++	/*

14461

++	 * Minimum idle period after which weight-raising may be

14462

++	 * reactivated for a queue (in jiffies).

14463

++	 */

14464

+ 	unsigned int bfq_wr_min_idle_time;

14465

++	/*

14466

++	 * Minimum period between request arrivals after which

14467

++	 * weight-raising may be reactivated for an already busy async

14468

++	 * queue (in jiffies).

14469

++	 */

14470

+ 	unsigned long bfq_wr_min_inter_arr_async;

14471

++

14472

++	/* Max service-rate for a soft real-time queue, in sectors/sec */

14473

+ 	unsigned int bfq_wr_max_softrt_rate;

14474

++	/*

14475

++	 * Cached value of the product R*T, used for computing the

14476

++	 * maximum duration of weight raising automatically.

14477

++	 */

14478

+ 	u64 RT_prod;

14479

++	/* device-speed class for the low-latency heuristic */

14480

+ 	enum bfq_device_speed device_speed;

14481

+

14482

++	/* fallback dummy bfqq for extreme OOM conditions */

14483

+ 	struct bfq_queue oom_bfqq;

14484

+ };

14485

+

14486

+ enum bfqq_state_flags {

14487

+-	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */

14488

++	BFQ_BFQQ_FLAG_just_created = 0,	/* queue just allocated */

14489

++	BFQ_BFQQ_FLAG_busy,		/* has requests or is in service */

14490

+ 	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */

14491

++	BFQ_BFQQ_FLAG_non_blocking_wait_rq, /*

14492

++					     * waiting for a request

14493

++					     * without idling the device

14494

++					     */

14495

+ 	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */

14496

+ 	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */

14497

+ 	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */

14498

+ 	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

14499

+-	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

14500

+ 	BFQ_BFQQ_FLAG_IO_bound,		/*

14501

+ 					 * bfqq has timed-out at least once

14502

+ 					 * having consumed at most 2/10 of

14503

+@@ -581,17 +570,12 @@ enum bfqq_state_flags {

14504

+ 					 * bfqq activated in a large burst,

14505

+ 					 * see comments to bfq_handle_burst.

14506

+ 					 */

14507

+-	BFQ_BFQQ_FLAG_constantly_seeky,	/*

14508

+-					 * bfqq has proved to be slow and

14509

+-					 * seeky until budget timeout

14510

+-					 */

14511

+ 	BFQ_BFQQ_FLAG_softrt_update,	/*

14512

+ 					 * may need softrt-next-start

14513

+ 					 * update

14514

+ 					 */

14515

+ 	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

14516

+-	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */

14517

+-	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */

14518

++	BFQ_BFQQ_FLAG_split_coop	/* shared bfqq will be split */

14519

+ };

14520

+

14521

+ #define BFQ_BFQQ_FNS(name)						\

14522

+@@ -608,25 +592,53 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\

14523

+ 	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\

14524

+ }

14525

+

14526

++BFQ_BFQQ_FNS(just_created);

14527

+ BFQ_BFQQ_FNS(busy);

14528

+ BFQ_BFQQ_FNS(wait_request);

14529

++BFQ_BFQQ_FNS(non_blocking_wait_rq);

14530

+ BFQ_BFQQ_FNS(must_alloc);

14531

+ BFQ_BFQQ_FNS(fifo_expire);

14532

+ BFQ_BFQQ_FNS(idle_window);

14533

+ BFQ_BFQQ_FNS(sync);

14534

+-BFQ_BFQQ_FNS(budget_new);

14535

+ BFQ_BFQQ_FNS(IO_bound);

14536

+ BFQ_BFQQ_FNS(in_large_burst);

14537

+-BFQ_BFQQ_FNS(constantly_seeky);

14538

+ BFQ_BFQQ_FNS(coop);

14539

+ BFQ_BFQQ_FNS(split_coop);

14540

+-BFQ_BFQQ_FNS(just_split);

14541

+ BFQ_BFQQ_FNS(softrt_update);

14542

+ #undef BFQ_BFQQ_FNS

14543

+

14544

+ /* Logging facilities. */

14545

+-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \

14546

+-	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)

14547

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

14548

++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

14549

++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

14550

++

14551

++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\

14552

++	char __pbuf[128];						\

14553

++									\

14554

++	assert_spin_locked((bfqd)->queue->queue_lock);			\

14555

++	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \

14556

++	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \

14557

++			  (bfqq)->pid,			  \

14558

++			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',	\

14559

++			  __pbuf, ##args);				\

14560

++} while (0)

14561

++

14562

++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\

14563

++	char __pbuf[128];						\

14564

++									\

14565

++	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\

14566

++	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\

14567

++} while (0)

14568

++

14569

++#else /* CONFIG_BFQ_GROUP_IOSCHED */

14570

++

14571

++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\

14572

++	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\

14573

++			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\

14574

++				##args)

14575

++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

14576

++

14577

++#endif /* CONFIG_BFQ_GROUP_IOSCHED */

14578

+

14579

+ #define bfq_log(bfqd, fmt, args...) \

14580

+ 	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

14581

+@@ -640,15 +652,12 @@ enum bfqq_expiration {

14582

+ 	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */

14583

+ 	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */

14584

+ 	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */

14585

++	BFQ_BFQQ_PREEMPTED		/* preemption in progress */

14586

+ };

14587

+

14588

+-#ifdef CONFIG_BFQ_GROUP_IOSCHED

14589

+

14590

+ struct bfqg_stats {

14591

+-	/* total bytes transferred */

14592

+-	struct blkg_rwstat		service_bytes;

14593

+-	/* total IOs serviced, post merge */

14594

+-	struct blkg_rwstat		serviced;

14595

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

14596

+ 	/* number of ios merged */

14597

+ 	struct blkg_rwstat		merged;

14598

+ 	/* total time spent on device in ns, may not be accurate w/ queueing */

14599

+@@ -657,12 +666,8 @@ struct bfqg_stats {

14600

+ 	struct blkg_rwstat		wait_time;

14601

+ 	/* number of IOs queued up */

14602

+ 	struct blkg_rwstat		queued;

14603

+-	/* total sectors transferred */

14604

+-	struct blkg_stat		sectors;

14605

+ 	/* total disk time and nr sectors dispatched by this group */

14606

+ 	struct blkg_stat		time;

14607

+-	/* time not charged to this cgroup */

14608

+-	struct blkg_stat		unaccounted_time;

14609

+ 	/* sum of number of ios queued across all samples */

14610

+ 	struct blkg_stat		avg_queue_size_sum;

14611

+ 	/* count of samples taken for average */

14612

+@@ -680,8 +685,10 @@ struct bfqg_stats {

14613

+ 	uint64_t			start_idle_time;

14614

+ 	uint64_t			start_empty_time;

14615

+ 	uint16_t			flags;

14616

++#endif

14617

+ };

14618

+

14619

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

14620

+ /*

14621

+  * struct bfq_group_data - per-blkcg storage for the blkio subsystem.

14622

+  *

14623

+@@ -712,7 +719,7 @@ struct bfq_group_data {

14624

+  *                   unused for the root group. Used to know whether there

14625

+  *                   are groups with more than one active @bfq_entity

14626

+  *                   (see the comments to the function

14627

+- *                   bfq_bfqq_must_not_expire()).

14628

++ *                   bfq_bfqq_may_idle()).

14629

+  * @rq_pos_tree: rbtree sorted by next_request position, used when

14630

+  *               determining if two or more queues have interleaving

14631

+  *               requests (see bfq_find_close_cooperator()).

14632

+@@ -745,7 +752,6 @@ struct bfq_group {

14633

+ 	struct rb_root rq_pos_tree;

14634

+

14635

+ 	struct bfqg_stats stats;

14636

+-	struct bfqg_stats dead_stats;	/* stats pushed from dead children */

14637

+ };

14638

+

14639

+ #else

14640

+@@ -767,11 +773,25 @@ bfq_entity_service_tree(struct bfq_entity *entity)

14641

+ 	struct bfq_sched_data *sched_data = entity->sched_data;

14642

+ 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

14643

+ 	unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :

14644

+-				  BFQ_DEFAULT_GRP_CLASS;

14645

++				  BFQ_DEFAULT_GRP_CLASS - 1;

14646

+

14647

+ 	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);

14648

+ 	BUG_ON(sched_data == NULL);

14649

+

14650

++	if (bfqq)

14651

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

14652

++			     "entity_service_tree %p %d",

14653

++			     sched_data->service_tree + idx, idx) ;

14654

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

14655

++	else {

14656

++		struct bfq_group *bfqg =

14657

++			container_of(entity, struct bfq_group, entity);

14658

++

14659

++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,

14660

++			     "entity_service_tree %p %d",

14661

++			     sched_data->service_tree + idx, idx) ;

14662

++	}

14663

++#endif

14664

+ 	return sched_data->service_tree + idx;

14665

+ }

14666

+

14667

+@@ -791,47 +811,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)

14668

+ 	return bic->icq.q->elevator->elevator_data;

14669

+ }

14670

+

14671

+-/**

14672

+- * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.

14673

+- * @ptr: a pointer to a bfqd.

14674

+- * @flags: storage for the flags to be saved.

14675

+- *

14676

+- * This function allows bfqg->bfqd to be protected by the

14677

+- * queue lock of the bfqd they reference; the pointer is dereferenced

14678

+- * under RCU, so the storage for bfqd is assured to be safe as long

14679

+- * as the RCU read side critical section does not end.  After the

14680

+- * bfqd->queue->queue_lock is taken the pointer is rechecked, to be

14681

+- * sure that no other writer accessed it.  If we raced with a writer,

14682

+- * the function returns NULL, with the queue unlocked, otherwise it

14683

+- * returns the dereferenced pointer, with the queue locked.

14684

+- */

14685

+-static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags)

14686

+-{

14687

+-	struct bfq_data *bfqd;

14688

+-

14689

+-	rcu_read_lock();

14690

+-	bfqd = rcu_dereference(*(struct bfq_data **)ptr);

14691

+-

14692

+-	if (bfqd != NULL) {

14693

+-		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);

14694

+-		if (ptr == NULL)

14695

+-			printk(KERN_CRIT "get_bfqd_locked pointer NULL\n");

14696

+-		else if (*ptr == bfqd)

14697

+-			goto out;

14698

+-		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

14699

+-	}

14700

+-

14701

+-	bfqd = NULL;

14702

+-out:

14703

+-	rcu_read_unlock();

14704

+-	return bfqd;

14705

+-}

14706

+-

14707

+-static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)

14708

+-{

14709

+-	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

14710

+-}

14711

+-

14712

+ #ifdef CONFIG_BFQ_GROUP_IOSCHED

14713

+

14714

+ static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)

14715

+@@ -857,11 +836,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);

14716

+ static void bfq_put_queue(struct bfq_queue *bfqq);

14717

+ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);

14718

+ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

14719

+-				       struct bio *bio, int is_sync,

14720

+-				       struct bfq_io_cq *bic, gfp_t gfp_mask);

14721

++				       struct bio *bio, bool is_sync,

14722

++				       struct bfq_io_cq *bic);

14723

+ static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

14724

+ 				    struct bfq_group *bfqg);

14725

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

14726

+ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);

14727

++#endif

14728

+ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);

14729

+

14730

+ #endif /* _BFQ_H */

14731

+--

14732

+1.9.1

14733

+

Gentoo Archives: gentoo-commits