Gentoo Archives: gentoo-commits

From: "Tom Wijsman (tomwij)" <tomwij@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2661 - genpatches-2.6/trunk/3.14
Date: Thu, 30 Jan 2014 16:49:54
Message-Id: 20140130164948.29F352004C@flycatcher.gentoo.org
1 Author: tomwij
2 Date: 2014-01-30 16:49:47 +0000 (Thu, 30 Jan 2014)
3 New Revision: 2661
4
5 Added:
6 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
7 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
8 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
9 Removed:
10 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
11 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
12 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
13 genpatches-2.6/trunk/3.14/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
14 Modified:
15 genpatches-2.6/trunk/3.14/0000_README
16 Log:
17 BFQ v7 3.13.
18
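Once a kernel carrying these three patches is built with CONFIG_IOSCHED_BFQ, the new "bfq" elevator is listed next to noop, deadline and cfq in the standard sysfs scheduler attribute. A minimal user-space sketch that checks for it; the device name sda and the presence of a patched kernel are assumptions for illustration, and the sysfs path is the stock block-layer interface rather than something these patches add:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

        if (f == NULL) {
                perror("open scheduler attribute");
                return 1;
        }
        if (fgets(line, sizeof(line), f) == NULL) {
                fclose(f);
                return 1;
        }
        fclose(f);

        /* The active elevator is shown in brackets, e.g. "noop deadline cfq [bfq]". */
        printf("available schedulers: %s", line);
        printf("bfq is %slisted\n", strstr(line, "bfq") ? "" : "not ");
        return 0;
}
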
19 Modified: genpatches-2.6/trunk/3.14/0000_README
20 ===================================================================
21 --- genpatches-2.6/trunk/3.14/0000_README 2014-01-29 14:41:45 UTC (rev 2660)
22 +++ genpatches-2.6/trunk/3.14/0000_README 2014-01-30 16:49:47 UTC (rev 2661)
23 @@ -83,18 +83,14 @@
24 From: Tom Wijsman <TomWij@g.o>
25 Desc: Add Gentoo Linux support config settings and defaults.
26
27 -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
28 +Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
29 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
30 -Desc: BFQ v6r2 patch 1 for 3.11: Build, cgroups and kconfig bits
31 +Desc: BFQ v7 patch 1 for 3.13: Build, cgroups and kconfig bits
32
33 -Patch: 5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
34 +Patch: 5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
35 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
36 -Desc: BFQ v6r2 patch 2 for 3.10: BFQ Scheduler
37 +Desc: BFQ v7 patch 2 for 3.13: BFQ Scheduler
38
39 -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
40 +Patch: 5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
41 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
42 -Desc: BFQ v6r2 patch 3 for 3.10: Early Queue Merge (EQM)
43 -
44 -Patch: 5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
45 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
46 -Desc: BFQ v6r2 for 3.11.0 to BFQ v6r2 for 3.12.0.
47 +Desc: BFQ v7 patch 3 for 3.13: Early Queue Merge (EQM)
48 \ No newline at end of file
49
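The first BFQ patch below registers a new "bfqio" controller in the cgroup subsystem list, and the scheduler patch that follows exposes per-group weight, ioprio and ioprio_class attributes through it. A rough user-space sketch of tuning a group's weight; the mount point /sys/fs/cgroup/bfqio and the group name "background" are made-up examples, and the bfqio hierarchy must already be mounted for the path to exist:

#include <stdio.h>
#include <stdlib.h>

/* Write a new value into a group's bfqio.weight file.  Valid weights are
 * bounded by BFQ_MIN_WEIGHT/BFQ_MAX_WEIGHT; out-of-range values are
 * rejected with EINVAL by the store functions in bfq-cgroup.c. */
static int set_group_weight(const char *group, unsigned int weight)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/fs/cgroup/bfqio/%s/bfqio.weight", group);
        f = fopen(path, "w");
        if (f == NULL)
                return -1;
        fprintf(f, "%u\n", weight);
        return fclose(f);
}

int main(void)
{
        if (set_group_weight("background", 100) != 0) {
                perror("bfqio.weight");
                return EXIT_FAILURE;
        }
        return EXIT_SUCCESS;
}

The bfqio.ioprio and bfqio.ioprio_class files declared in the same bfqio_files[] table are written the same way.
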
50 Added: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
51 ===================================================================
52 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch (rev 0)
53 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-01-30 16:49:47 UTC (rev 2661)
54 @@ -0,0 +1,104 @@
55 +From 7f029ed2a02bea57b791c032d6242129c3372a84 Mon Sep 17 00:00:00 2001
56 +From: Paolo Valente <paolo.valente@×××××××.it>
57 +Date: Tue, 3 Sep 2013 16:50:42 +0200
58 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7-3.13
59 +
60 +Update Kconfig.iosched and do the related Makefile changes to include
61 +kernel configuration options for BFQ. Also add the bfqio controller
62 +to the cgroups subsystem.
63 +
64 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
65 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
66 +---
67 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
68 + block/Makefile | 1 +
69 + include/linux/cgroup_subsys.h | 4 ++++
70 + 3 files changed, 37 insertions(+)
71 +
72 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
73 +index 421bef9..8f552ba 100644
74 +--- a/block/Kconfig.iosched
75 ++++ b/block/Kconfig.iosched
76 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
77 + ---help---
78 + Enable group IO scheduling in CFQ.
79 +
80 ++config IOSCHED_BFQ
81 ++ tristate "BFQ I/O scheduler"
82 ++ default n
83 ++ ---help---
84 ++ The BFQ I/O scheduler tries to distribute bandwidth among
85 ++ all processes according to their weights.
86 ++ It aims at distributing the bandwidth as desired, independently of
87 ++ the disk parameters and with any workload. It also tries to
88 ++ guarantee low latency to interactive and soft real-time
89 ++ applications. If compiled built-in (saying Y here), BFQ can
90 ++ be configured to support hierarchical scheduling.
91 ++
92 ++config CGROUP_BFQIO
93 ++ bool "BFQ hierarchical scheduling support"
94 ++ depends on CGROUPS && IOSCHED_BFQ=y
95 ++ default n
96 ++ ---help---
97 ++ Enable hierarchical scheduling in BFQ, using the cgroups
98 ++ filesystem interface. The name of the subsystem will be
99 ++ bfqio.
100 ++
101 + choice
102 + prompt "Default I/O scheduler"
103 + default DEFAULT_CFQ
104 +@@ -52,6 +73,16 @@ choice
105 + config DEFAULT_CFQ
106 + bool "CFQ" if IOSCHED_CFQ=y
107 +
108 ++ config DEFAULT_BFQ
109 ++ bool "BFQ" if IOSCHED_BFQ=y
110 ++ help
111 ++ Selects BFQ as the default I/O scheduler which will be
112 ++ used by default for all block devices.
113 ++ The BFQ I/O scheduler aims at distributing the bandwidth
114 ++ as desired, independently of the disk parameters and with
115 ++ any workload. It also tries to guarantee low latency to
116 ++ interactive and soft real-time applications.
117 ++
118 + config DEFAULT_NOOP
119 + bool "No-op"
120 +
121 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
122 + string
123 + default "deadline" if DEFAULT_DEADLINE
124 + default "cfq" if DEFAULT_CFQ
125 ++ default "bfq" if DEFAULT_BFQ
126 + default "noop" if DEFAULT_NOOP
127 +
128 + endmenu
129 +diff --git a/block/Makefile b/block/Makefile
130 +index 20645e8..cbd83fb 100644
131 +--- a/block/Makefile
132 ++++ b/block/Makefile
133 +@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
134 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
135 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
136 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
137 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
138 +
139 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
140 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
141 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
142 +index b613ffd..43c5dc9 100644
143 +--- a/include/linux/cgroup_subsys.h
144 ++++ b/include/linux/cgroup_subsys.h
145 +@@ -39,6 +39,10 @@ SUBSYS(net_cls)
146 + SUBSYS(blkio)
147 + #endif
148 +
149 ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
150 ++SUBSYS(bfqio)
151 ++#endif
152 ++
153 + #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
154 + SUBSYS(perf)
155 + #endif
156 +--
157 +1.8.5.2
158 +
159
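The DEFAULT_BFQ choice added to Kconfig.iosched above only selects the boot-time default (the string "bfq" in CONFIG_DEFAULT_IOSCHED); per device, the elevator can still be switched at runtime through the usual sysfs attribute. A small sketch, again assuming a kernel that carries these patches and a hypothetical device sdb:

#include <stdio.h>

/* Ask the block layer to use the "bfq" elevator for one device.  The kernel
 * looks the name up among the registered schedulers and returns an error if
 * it is unknown, so the write only succeeds on a patched kernel. */
int main(void)
{
        FILE *f = fopen("/sys/block/sdb/queue/scheduler", "w");

        if (f == NULL) {
                perror("open scheduler attribute");
                return 1;
        }
        if (fputs("bfq\n", f) == EOF || fclose(f) != 0) {
                perror("switch to bfq");
                return 1;
        }
        return 0;
}
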
160 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
161 ===================================================================
162 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2014-01-29 14:41:45 UTC (rev 2660)
163 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2014-01-30 16:49:47 UTC (rev 2661)
164 @@ -1,97 +0,0 @@
165 -From 3728677b4d3cd39d83be87f9939328201b871c48 Mon Sep 17 00:00:00 2001
166 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
167 -Date: Tue, 3 Sep 2013 16:50:42 +0200
168 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.11
169 -
170 -Update Kconfig.iosched and do the related Makefile changes to include
171 -kernel configuration options for BFQ. Also add the bfqio controller
172 -to the cgroups subsystem.
173 -
174 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
175 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
176 ----
177 - block/Kconfig.iosched | 25 +++++++++++++++++++++++++
178 - block/Makefile | 1 +
179 - include/linux/cgroup_subsys.h | 4 ++++
180 - 3 files changed, 30 insertions(+)
181 -
182 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
183 -index 421bef9..695e064 100644
184 ---- a/block/Kconfig.iosched
185 -+++ b/block/Kconfig.iosched
186 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
187 - ---help---
188 - Enable group IO scheduling in CFQ.
189 -
190 -+config IOSCHED_BFQ
191 -+ tristate "BFQ I/O scheduler"
192 -+ default n
193 -+ ---help---
194 -+ The BFQ I/O scheduler tries to distribute bandwidth among
195 -+ all processes according to their weights.
196 -+ It aims at distributing the bandwidth as desired, independently of
197 -+ the disk parameters and with any workload. It also tries to
198 -+ guarantee low latency to interactive and soft real-time
199 -+ applications. If compiled built-in (saying Y here), BFQ can
200 -+ be configured to support hierarchical scheduling.
201 -+
202 -+config CGROUP_BFQIO
203 -+ bool "BFQ hierarchical scheduling support"
204 -+ depends on CGROUPS && IOSCHED_BFQ=y
205 -+ default n
206 -+ ---help---
207 -+ Enable hierarchical scheduling in BFQ, using the cgroups
208 -+ filesystem interface. The name of the subsystem will be
209 -+ bfqio.
210 -+
211 - choice
212 - prompt "Default I/O scheduler"
213 - default DEFAULT_CFQ
214 -@@ -52,6 +73,9 @@ choice
215 - config DEFAULT_CFQ
216 - bool "CFQ" if IOSCHED_CFQ=y
217 -
218 -+ config DEFAULT_BFQ
219 -+ bool "BFQ" if IOSCHED_BFQ=y
220 -+
221 - config DEFAULT_NOOP
222 - bool "No-op"
223 -
224 -@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED
225 - string
226 - default "deadline" if DEFAULT_DEADLINE
227 - default "cfq" if DEFAULT_CFQ
228 -+ default "bfq" if DEFAULT_BFQ
229 - default "noop" if DEFAULT_NOOP
230 -
231 - endmenu
232 -diff --git a/block/Makefile b/block/Makefile
233 -index 39b76ba..c0d20fa 100644
234 ---- a/block/Makefile
235 -+++ b/block/Makefile
236 -@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
237 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
238 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
239 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
240 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
241 -
242 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
243 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
244 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
245 -index b613ffd..43c5dc9 100644
246 ---- a/include/linux/cgroup_subsys.h
247 -+++ b/include/linux/cgroup_subsys.h
248 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
249 - SUBSYS(blkio)
250 - #endif
251 -
252 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
253 -+SUBSYS(bfqio)
254 -+#endif
255 -+
256 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
257 - SUBSYS(perf)
258 - #endif
259 ---
260 -1.8.1.4
261 -
262
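The commit message of the scheduler patch below notes that, when the cgroups interface is not used, per-process ioprio values are mapped to weights with the relation weight = IOPRIO_BE_NR - ioprio. A worked example of that mapping; IOPRIO_BE_NR is 8 in the mainline headers, so the eight best-effort levels translate as follows:

#include <stdio.h>

#define IOPRIO_BE_NR 8  /* number of best-effort ioprio levels in mainline */

int main(void)
{
        int ioprio;

        /* ioprio 0 (highest priority) -> weight 8, ..., ioprio 7 -> weight 1,
         * i.e. bandwidth shares in the ratio 8:7:...:1. */
        for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                printf("ioprio %d -> weight %d\n",
                       ioprio, IOPRIO_BE_NR - ioprio);
        return 0;
}

When a group weight has been set through the bfqio controller, that weight takes priority over this ioprio-derived value, as the comment in bfq_group_init_entity() in bfq-cgroup.c explains.
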
263 Added: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
264 ===================================================================
265 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 (rev 0)
266 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-01-30 16:49:47 UTC (rev 2661)
267 @@ -0,0 +1,6008 @@
268 +From 3747f129106ce58fbad1b8f05cc836a6addd8588 Mon Sep 17 00:00:00 2001
269 +From: Paolo Valente <paolo.valente@×××××××.it>
270 +Date: Thu, 9 May 2013 19:10:02 +0200
271 +Subject: [PATCH 2/3] block: introduce the BFQ-v7 I/O sched for 3.13
272 +
273 +Add the BFQ-v7 I/O scheduler to 3.13.
274 +The general structure is borrowed from CFQ, as much of the code for
275 +handling I/O contexts. Over time, several useful features have been
276 +ported from CFQ as well (details in the changelog in README.BFQ). A
277 +(bfq_)queue is associated to each task doing I/O on a device, and each
278 +time a scheduling decision has to be made a queue is selected and served
279 +until it expires.
280 +
281 + - Slices are given in the service domain: tasks are assigned
282 + budgets, measured in number of sectors. Once granted the disk, a task
283 + must however consume its assigned budget within a configurable
284 + maximum time (by default, the maximum possible value of the
285 + budgets is automatically computed to comply with this timeout).
286 + This allows the desired latency vs "throughput boosting" tradeoff
287 + to be set.
288 +
289 + - Budgets are scheduled according to a variant of WF2Q+, implemented
290 + using an augmented rb-tree to take eligibility into account while
291 + preserving an O(log N) overall complexity.
292 +
293 + - A low-latency tunable is provided; if enabled, both interactive
294 + and soft real-time applications are guaranteed a very low latency.
295 +
296 + - Latency guarantees are preserved also in the presence of NCQ.
297 +
298 + - Also with flash-based devices, a high throughput is achieved
299 + while still preserving latency guarantees.
300 +
301 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
302 + cooperating-queue-merging and the preemption mechanisms present
303 + in CFQ. EQM is in fact a unified mechanism that tries to get a
304 + sequential read pattern, and hence a high throughput, with any
305 + set of processes performing interleaved I/O over a contiguous
306 + sequence of sectors.
307 +
308 + - BFQ supports full hierarchical scheduling, exporting a cgroups
309 + interface. Since each node has a full scheduler, each group can
310 + be assigned its own weight.
311 +
312 + - If the cgroups interface is not used, only I/O priorities can be
313 + assigned to processes, with ioprio values mapped to weights
314 + with the relation weight = IOPRIO_BE_NR - ioprio.
315 +
316 + - ioprio classes are served in strict priority order, i.e., lower
317 + priority queues are not served as long as there are higher
318 + priority queues. Among queues in the same class the bandwidth is
319 + distributed in proportion to the weight of each queue. A very
320 + thin extra bandwidth is however guaranteed to the Idle class, to
321 + prevent it from starving.
322 +
323 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
324 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
325 +---
326 + block/bfq-cgroup.c | 910 ++++++++++++++
327 + block/bfq-ioc.c | 36 +
328 + block/bfq-iosched.c | 3268 +++++++++++++++++++++++++++++++++++++++++++++++++++
329 + block/bfq-sched.c | 1077 +++++++++++++++++
330 + block/bfq.h | 614 ++++++++++
331 + 5 files changed, 5905 insertions(+)
332 + create mode 100644 block/bfq-cgroup.c
333 + create mode 100644 block/bfq-ioc.c
334 + create mode 100644 block/bfq-iosched.c
335 + create mode 100644 block/bfq-sched.c
336 + create mode 100644 block/bfq.h
337 +
338 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
339 +new file mode 100644
340 +index 0000000..b889acf
341 +--- /dev/null
342 ++++ b/block/bfq-cgroup.c
343 +@@ -0,0 +1,910 @@
344 ++/*
345 ++ * BFQ: CGROUPS support.
346 ++ *
347 ++ * Based on ideas and code from CFQ:
348 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
349 ++ *
350 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
351 ++ * Paolo Valente <paolo.valente@×××××××.it>
352 ++ *
353 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
354 ++ *
355 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
356 ++ */
357 ++
358 ++#ifdef CONFIG_CGROUP_BFQIO
359 ++
360 ++static DEFINE_MUTEX(bfqio_mutex);
361 ++
362 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
363 ++{
364 ++ return bgrp ? !bgrp->online : false;
365 ++}
366 ++
367 ++static struct bfqio_cgroup bfqio_root_cgroup = {
368 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
369 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
370 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
371 ++};
372 ++
373 ++static inline void bfq_init_entity(struct bfq_entity *entity,
374 ++ struct bfq_group *bfqg)
375 ++{
376 ++ entity->weight = entity->new_weight;
377 ++ entity->orig_weight = entity->new_weight;
378 ++ entity->ioprio = entity->new_ioprio;
379 ++ entity->ioprio_class = entity->new_ioprio_class;
380 ++ entity->parent = bfqg->my_entity;
381 ++ entity->sched_data = &bfqg->sched_data;
382 ++}
383 ++
384 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
385 ++{
386 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
387 ++}
388 ++
389 ++/*
390 ++ * Search the hash table (for now only a list) of bgrp for the bfq_group
391 ++ * associated with bfqd. Must be called under rcu_read_lock().
392 ++ */
393 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
394 ++ struct bfq_data *bfqd)
395 ++{
396 ++ struct bfq_group *bfqg;
397 ++ void *key;
398 ++
399 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
400 ++ key = rcu_dereference(bfqg->bfqd);
401 ++ if (key == bfqd)
402 ++ return bfqg;
403 ++ }
404 ++
405 ++ return NULL;
406 ++}
407 ++
408 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
409 ++ struct bfq_group *bfqg)
410 ++{
411 ++ struct bfq_entity *entity = &bfqg->entity;
412 ++
413 ++ /*
414 ++ * If the weight of the entity has never been set via the sysfs
415 ++ * interface, then bgrp->weight == 0. In this case we initialize
416 ++ * the weight from the current ioprio value. Otherwise, the group
417 ++ * weight, if set, has priority over the ioprio value.
418 ++ */
419 ++ if (bgrp->weight == 0) {
420 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
421 ++ entity->new_ioprio = bgrp->ioprio;
422 ++ } else {
423 ++ entity->new_weight = bgrp->weight;
424 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
425 ++ }
426 ++ entity->orig_weight = entity->weight = entity->new_weight;
427 ++ entity->ioprio = entity->new_ioprio;
428 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
429 ++ entity->my_sched_data = &bfqg->sched_data;
430 ++}
431 ++
432 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
433 ++ struct bfq_group *parent)
434 ++{
435 ++ struct bfq_entity *entity;
436 ++
437 ++ BUG_ON(parent == NULL);
438 ++ BUG_ON(bfqg == NULL);
439 ++
440 ++ entity = &bfqg->entity;
441 ++ entity->parent = parent->my_entity;
442 ++ entity->sched_data = &parent->sched_data;
443 ++}
444 ++
445 ++/**
446 ++ * bfq_group_chain_alloc - allocate a chain of groups.
447 ++ * @bfqd: queue descriptor.
448 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
449 ++ *
450 ++ * Allocate a chain of groups starting from the one belonging to
451 ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
452 ++ * to the root already has an allocated group on @bfqd.
453 ++ */
454 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
455 ++ struct cgroup_subsys_state *css)
456 ++{
457 ++ struct bfqio_cgroup *bgrp;
458 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
459 ++
460 ++ for (; css != NULL; css = css->parent) {
461 ++ bgrp = css_to_bfqio(css);
462 ++
463 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
464 ++ if (bfqg != NULL) {
465 ++ /*
466 ++ * All the cgroups in the path from there to the
467 ++ * root must have a bfq_group for bfqd, so we don't
468 ++ * need any more allocations.
469 ++ */
470 ++ break;
471 ++ }
472 ++
473 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
474 ++ if (bfqg == NULL)
475 ++ goto cleanup;
476 ++
477 ++ bfq_group_init_entity(bgrp, bfqg);
478 ++ bfqg->my_entity = &bfqg->entity;
479 ++
480 ++ if (leaf == NULL) {
481 ++ leaf = bfqg;
482 ++ prev = leaf;
483 ++ } else {
484 ++ bfq_group_set_parent(prev, bfqg);
485 ++ /*
486 ++ * Build a list of allocated nodes using the bfqd
487 ++ * field, which is still unused and will be initialized
488 ++ * only after the node is connected.
489 ++ */
490 ++ prev->bfqd = bfqg;
491 ++ prev = bfqg;
492 ++ }
493 ++ }
494 ++
495 ++ return leaf;
496 ++
497 ++cleanup:
498 ++ while (leaf != NULL) {
499 ++ prev = leaf;
500 ++ leaf = leaf->bfqd;
501 ++ kfree(prev);
502 ++ }
503 ++
504 ++ return NULL;
505 ++}
506 ++
507 ++/**
508 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
509 ++ * @bfqd: the queue descriptor.
510 ++ * @css: the leaf cgroup_subsys_state to start from.
511 ++ * @leaf: the leaf group (to be associated to @cgroup).
512 ++ *
513 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
514 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
515 ++ * hierarchy that already as a group associated to @bfqd all the nodes
516 ++ * in the path to the root cgroup have one too.
517 ++ *
518 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
519 ++ * per device) while the bfqio_cgroup lock protects the list of groups
520 ++ * belonging to the same cgroup.
521 ++ */
522 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
523 ++ struct cgroup_subsys_state *css,
524 ++ struct bfq_group *leaf)
525 ++{
526 ++ struct bfqio_cgroup *bgrp;
527 ++ struct bfq_group *bfqg, *next, *prev = NULL;
528 ++ unsigned long flags;
529 ++
530 ++ assert_spin_locked(bfqd->queue->queue_lock);
531 ++
532 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
533 ++ bgrp = css_to_bfqio(css);
534 ++ next = leaf->bfqd;
535 ++
536 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
537 ++ BUG_ON(bfqg != NULL);
538 ++
539 ++ spin_lock_irqsave(&bgrp->lock, flags);
540 ++
541 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
542 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
543 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
544 ++
545 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
546 ++
547 ++ prev = leaf;
548 ++ leaf = next;
549 ++ }
550 ++
551 ++ BUG_ON(css == NULL && leaf != NULL);
552 ++ if (css != NULL && prev != NULL) {
553 ++ bgrp = css_to_bfqio(css);
554 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
555 ++ bfq_group_set_parent(prev, bfqg);
556 ++ }
557 ++}
558 ++
559 ++/**
560 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
561 ++ * @bfqd: queue descriptor.
562 ++ * @cgroup: cgroup being searched for.
563 ++ *
564 ++ * Return a group associated to @bfqd in @cgroup, allocating one if
565 ++ * necessary. When a group is returned all the cgroups in the path
566 ++ * to the root have a group associated to @bfqd.
567 ++ *
568 ++ * If the allocation fails, return the root group: this breaks guarantees
568 ++ * but is a safe fallback. If this loss becomes a problem it can be
570 ++ * mitigated using the equivalent weight (given by the product of the
571 ++ * weights of the groups in the path from @group to the root) in the
572 ++ * root scheduler.
573 ++ *
574 ++ * We allocate all the missing nodes in the path from the leaf cgroup
575 ++ * to the root and we connect the nodes only after all the allocations
576 ++ * have been successful.
577 ++ */
578 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
579 ++ struct cgroup_subsys_state *css)
580 ++{
581 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
582 ++ struct bfq_group *bfqg;
583 ++
584 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
585 ++ if (bfqg != NULL)
586 ++ return bfqg;
587 ++
588 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
589 ++ if (bfqg != NULL)
590 ++ bfq_group_chain_link(bfqd, css, bfqg);
591 ++ else
592 ++ bfqg = bfqd->root_group;
593 ++
594 ++ return bfqg;
595 ++}
596 ++
597 ++/**
598 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
599 ++ * @bfqd: queue descriptor.
600 ++ * @bfqq: the queue to move.
601 ++ * @entity: @bfqq's entity.
602 ++ * @bfqg: the group to move to.
603 ++ *
604 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
605 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
606 ++ *
607 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
608 ++ * not disappear (by now this just means that we are called under
609 ++ * rcu_read_lock()).
610 ++ */
611 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
612 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
613 ++{
614 ++ int busy, resume;
615 ++
616 ++ busy = bfq_bfqq_busy(bfqq);
617 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
618 ++
619 ++ BUG_ON(resume && !entity->on_st);
620 ++ BUG_ON(busy && !resume && entity->on_st &&
621 ++ bfqq != bfqd->in_service_queue);
622 ++
623 ++ if (busy) {
624 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
625 ++
626 ++ if (!resume)
627 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
628 ++ else
629 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
630 ++ } else if (entity->on_st)
631 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
632 ++
633 ++ /*
634 ++ * Here we use a reference to bfqg. We don't need a refcounter
635 ++ * as the cgroup reference will not be dropped, so that its
636 ++ * destroy() callback will not be invoked.
637 ++ */
638 ++ entity->parent = bfqg->my_entity;
639 ++ entity->sched_data = &bfqg->sched_data;
640 ++
641 ++ if (busy && resume)
642 ++ bfq_activate_bfqq(bfqd, bfqq);
643 ++
644 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
645 ++ bfq_schedule_dispatch(bfqd);
646 ++}
647 ++
648 ++/**
649 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
650 ++ * @bfqd: the queue descriptor.
651 ++ * @bic: the bic to move.
652 ++ * @cgroup: the cgroup to move to.
653 ++ *
654 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
655 ++ * has to make sure that the reference to cgroup is valid across the call.
656 ++ *
657 ++ * NOTE: an alternative approach might have been to store the current
658 ++ * cgroup in bfqq and getting a reference to it, reducing the lookup
659 ++ * time here, at the price of slightly more complex code.
660 ++ */
661 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
662 ++ struct bfq_io_cq *bic,
663 ++ struct cgroup_subsys_state *css)
664 ++{
665 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
666 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
667 ++ struct bfq_entity *entity;
668 ++ struct bfq_group *bfqg;
669 ++ struct bfqio_cgroup *bgrp;
670 ++
671 ++ bgrp = css_to_bfqio(css);
672 ++
673 ++ bfqg = bfq_find_alloc_group(bfqd, css);
674 ++ if (async_bfqq != NULL) {
675 ++ entity = &async_bfqq->entity;
676 ++
677 ++ if (entity->sched_data != &bfqg->sched_data) {
678 ++ bic_set_bfqq(bic, NULL, 0);
679 ++ bfq_log_bfqq(bfqd, async_bfqq,
680 ++ "bic_change_group: %p %d",
681 ++ async_bfqq, atomic_read(&async_bfqq->ref));
682 ++ bfq_put_queue(async_bfqq);
683 ++ }
684 ++ }
685 ++
686 ++ if (sync_bfqq != NULL) {
687 ++ entity = &sync_bfqq->entity;
688 ++ if (entity->sched_data != &bfqg->sched_data)
689 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
690 ++ }
691 ++
692 ++ return bfqg;
693 ++}
694 ++
695 ++/**
696 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
697 ++ * @bic: the bic being migrated.
698 ++ * @cgroup: the destination cgroup.
699 ++ *
700 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
701 ++ * moved into its new parent group.
702 ++ */
703 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
704 ++ struct cgroup_subsys_state *css)
705 ++{
706 ++ struct bfq_data *bfqd;
707 ++ unsigned long uninitialized_var(flags);
708 ++
709 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
710 ++ &flags);
711 ++ if (bfqd != NULL) {
712 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
713 ++ bfq_put_bfqd_unlock(bfqd, &flags);
714 ++ }
715 ++}
716 ++
717 ++/**
718 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
719 ++ * @bic: the @bic to update.
720 ++ *
721 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
722 ++ * We need this in addition to moving bics during the cgroup attach
723 ++ * phase because the task owning @bic could be at its first disk
724 ++ * access or we may end up in the root cgroup as the result of a
725 ++ * memory allocation failure and here we try to move to the right
726 ++ * group.
727 ++ *
728 ++ * Must be called under the queue lock. It is safe to use the returned
729 ++ * value even after the rcu_read_unlock() as the migration/destruction
730 ++ * paths act under the queue lock too. IOW it is impossible to race with
731 ++ * group migration/destruction and end up with an invalid group as:
732 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
733 ++ * has started execution, as current holds a reference to it,
734 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
735 ++ * migrated to a different cgroup] its attach() callback will have
736 ++ * taken care of removing all the references to the old cgroup data.
737 ++ */
738 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
739 ++{
740 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
741 ++ struct bfq_group *bfqg;
742 ++ struct cgroup_subsys_state *css;
743 ++
744 ++ BUG_ON(bfqd == NULL);
745 ++
746 ++ rcu_read_lock();
747 ++ css = task_css(current, bfqio_subsys_id);
748 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
749 ++ rcu_read_unlock();
750 ++
751 ++ return bfqg;
752 ++}
753 ++
754 ++/**
755 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
756 ++ * @st: the service tree being flushed.
757 ++ */
758 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
759 ++{
760 ++ struct bfq_entity *entity = st->first_idle;
761 ++
762 ++ for (; entity != NULL; entity = st->first_idle)
763 ++ __bfq_deactivate_entity(entity, 0);
764 ++}
765 ++
766 ++/**
767 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
768 ++ * @bfqd: the device data structure with the root group.
769 ++ * @entity: the entity to move.
770 ++ */
771 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
772 ++ struct bfq_entity *entity)
773 ++{
774 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
775 ++
776 ++ BUG_ON(bfqq == NULL);
777 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
778 ++ return;
779 ++}
780 ++
781 ++/**
782 ++ * bfq_reparent_active_entities - move to the root group all active entities.
783 ++ * @bfqd: the device data structure with the root group.
784 ++ * @bfqg: the group to move from.
785 ++ * @st: the service tree with the entities.
786 ++ *
787 ++ * Needs queue_lock to be taken and reference to be valid over the call.
788 ++ */
789 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
790 ++ struct bfq_group *bfqg,
791 ++ struct bfq_service_tree *st)
792 ++{
793 ++ struct rb_root *active = &st->active;
794 ++ struct bfq_entity *entity = NULL;
795 ++
796 ++ if (!RB_EMPTY_ROOT(&st->active))
797 ++ entity = bfq_entity_of(rb_first(active));
798 ++
799 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
800 ++ bfq_reparent_leaf_entity(bfqd, entity);
801 ++
802 ++ if (bfqg->sched_data.active_entity != NULL)
803 ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
804 ++
805 ++ return;
806 ++}
807 ++
808 ++/**
809 ++ * bfq_destroy_group - destroy @bfqg.
810 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
811 ++ * @bfqg: the group being destroyed.
812 ++ *
813 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
814 ++ */
815 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
816 ++{
817 ++ struct bfq_data *bfqd;
818 ++ struct bfq_service_tree *st;
819 ++ struct bfq_entity *entity = bfqg->my_entity;
820 ++ unsigned long uninitialized_var(flags);
821 ++ int i;
822 ++
823 ++ hlist_del(&bfqg->group_node);
824 ++
825 ++ /*
826 ++ * Empty all service_trees belonging to this group before deactivating
827 ++ * the group itself.
828 ++ */
829 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
830 ++ st = bfqg->sched_data.service_tree + i;
831 ++
832 ++ /*
833 ++ * The idle tree may still contain bfq_queues belonging
834 ++ * to exited tasks because they never migrated to a different
835 ++ * cgroup from the one being destroyed now. No one else
836 ++ * can access them so it's safe to act without any lock.
837 ++ */
838 ++ bfq_flush_idle_tree(st);
839 ++
840 ++ /*
841 ++ * It may happen that some queues are still active
842 ++ * (busy) upon group destruction (if the corresponding
843 ++ * processes have been forced to terminate). We move
844 ++ * all the leaf entities corresponding to these queues
845 ++ * to the root_group.
846 ++ * Also, it may happen that the group has an entity
847 ++ * under service, which is disconnected from the active
848 ++ * tree: it must be moved, too.
849 ++ * There is no need to put the sync queues, as the
850 ++ * scheduler has taken no reference.
851 ++ */
852 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
853 ++ if (bfqd != NULL) {
854 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
855 ++ bfq_put_bfqd_unlock(bfqd, &flags);
856 ++ }
857 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
858 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
859 ++ }
860 ++ BUG_ON(bfqg->sched_data.next_active != NULL);
861 ++ BUG_ON(bfqg->sched_data.active_entity != NULL);
862 ++
863 ++ /*
864 ++ * We may race with device destruction, take extra care when
865 ++ * dereferencing bfqg->bfqd.
866 ++ */
867 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
868 ++ if (bfqd != NULL) {
869 ++ hlist_del(&bfqg->bfqd_node);
870 ++ __bfq_deactivate_entity(entity, 0);
871 ++ bfq_put_async_queues(bfqd, bfqg);
872 ++ bfq_put_bfqd_unlock(bfqd, &flags);
873 ++ }
874 ++ BUG_ON(entity->tree != NULL);
875 ++
876 ++ /*
877 ++ * No need to defer the kfree() to the end of the RCU grace
878 ++ * period: we are called from the destroy() callback of our
879 ++ * cgroup, so we can be sure that no one is a) still using
880 ++ * this cgroup or b) doing lookups in it.
881 ++ */
882 ++ kfree(bfqg);
883 ++}
884 ++
885 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
886 ++{
887 ++ struct hlist_node *tmp;
888 ++ struct bfq_group *bfqg;
889 ++
890 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
891 ++ bfq_end_raising_async_queues(bfqd, bfqg);
892 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
893 ++}
894 ++
895 ++/**
896 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
897 ++ * @bfqd: the device descriptor being exited.
898 ++ *
899 ++ * When the device exits we just make sure that no lookup can return
900 ++ * the now unused group structures. They will be deallocated on cgroup
901 ++ * destruction.
902 ++ */
903 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
904 ++{
905 ++ struct hlist_node *tmp;
906 ++ struct bfq_group *bfqg;
907 ++
908 ++ bfq_log(bfqd, "disconnect_groups beginning");
909 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
910 ++ hlist_del(&bfqg->bfqd_node);
911 ++
912 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
913 ++
914 ++ /*
915 ++ * Don't remove from the group hash, just set an
916 ++ * invalid key. No lookups can race with the
917 ++ * assignment as bfqd is being destroyed; this
918 ++ * implies also that new elements cannot be added
919 ++ * to the list.
920 ++ */
921 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
922 ++
923 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
924 ++ bfqg);
925 ++ bfq_put_async_queues(bfqd, bfqg);
926 ++ }
927 ++}
928 ++
929 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
930 ++{
931 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
932 ++ struct bfq_group *bfqg = bfqd->root_group;
933 ++
934 ++ bfq_put_async_queues(bfqd, bfqg);
935 ++
936 ++ spin_lock_irq(&bgrp->lock);
937 ++ hlist_del_rcu(&bfqg->group_node);
938 ++ spin_unlock_irq(&bgrp->lock);
939 ++
940 ++ /*
941 ++ * No need to synchronize_rcu() here: since the device is gone
942 ++ * there cannot be any read-side access to its root_group.
943 ++ */
944 ++ kfree(bfqg);
945 ++}
946 ++
947 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
948 ++{
949 ++ struct bfq_group *bfqg;
950 ++ struct bfqio_cgroup *bgrp;
951 ++ int i;
952 ++
953 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
954 ++ if (bfqg == NULL)
955 ++ return NULL;
956 ++
957 ++ bfqg->entity.parent = NULL;
958 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
959 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
960 ++
961 ++ bgrp = &bfqio_root_cgroup;
962 ++ spin_lock_irq(&bgrp->lock);
963 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
964 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
965 ++ spin_unlock_irq(&bgrp->lock);
966 ++
967 ++ return bfqg;
968 ++}
969 ++
970 ++#define SHOW_FUNCTION(__VAR) \
971 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
972 ++ struct cftype *cftype) \
973 ++{ \
974 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
975 ++ u64 ret = -ENODEV; \
976 ++ \
977 ++ mutex_lock(&bfqio_mutex); \
978 ++ if (bfqio_is_removed(bgrp)) \
979 ++ goto out_unlock; \
980 ++ \
981 ++ spin_lock_irq(&bgrp->lock); \
982 ++ ret = bgrp->__VAR; \
983 ++ spin_unlock_irq(&bgrp->lock); \
984 ++ \
985 ++out_unlock: \
986 ++ mutex_unlock(&bfqio_mutex); \
987 ++ return ret; \
988 ++}
989 ++
990 ++SHOW_FUNCTION(weight);
991 ++SHOW_FUNCTION(ioprio);
992 ++SHOW_FUNCTION(ioprio_class);
993 ++#undef SHOW_FUNCTION
994 ++
995 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
996 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
997 ++ struct cftype *cftype, \
998 ++ u64 val) \
999 ++{ \
1000 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
1001 ++ struct bfq_group *bfqg; \
1002 ++ int ret = -EINVAL; \
1003 ++ \
1004 ++ if (val < (__MIN) || val > (__MAX)) \
1005 ++ return ret; \
1006 ++ \
1007 ++ ret = -ENODEV; \
1008 ++ mutex_lock(&bfqio_mutex); \
1009 ++ if (bfqio_is_removed(bgrp)) \
1010 ++ goto out_unlock; \
1011 ++ ret = 0; \
1012 ++ \
1013 ++ spin_lock_irq(&bgrp->lock); \
1014 ++ bgrp->__VAR = (unsigned short)val; \
1015 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
1016 ++ /* \
1017 ++ * Setting the ioprio_changed flag of the entity \
1018 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
1019 ++ * the value of the weight to its ioprio mapping. \
1020 ++ * Set the flag only if necessary. \
1021 ++ */ \
1022 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
1023 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
1024 ++ smp_wmb(); \
1025 ++ bfqg->entity.ioprio_changed = 1; \
1026 ++ } \
1027 ++ } \
1028 ++ spin_unlock_irq(&bgrp->lock); \
1029 ++ \
1030 ++out_unlock: \
1031 ++ mutex_unlock(&bfqio_mutex); \
1032 ++ return ret; \
1033 ++}
1034 ++
1035 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
1036 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
1037 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
1038 ++#undef STORE_FUNCTION
1039 ++
1040 ++static struct cftype bfqio_files[] = {
1041 ++ {
1042 ++ .name = "weight",
1043 ++ .read_u64 = bfqio_cgroup_weight_read,
1044 ++ .write_u64 = bfqio_cgroup_weight_write,
1045 ++ },
1046 ++ {
1047 ++ .name = "ioprio",
1048 ++ .read_u64 = bfqio_cgroup_ioprio_read,
1049 ++ .write_u64 = bfqio_cgroup_ioprio_write,
1050 ++ },
1051 ++ {
1052 ++ .name = "ioprio_class",
1053 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
1054 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
1055 ++ },
1056 ++ { }, /* terminate */
1057 ++};
1058 ++
1059 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
1060 ++ *parent_css)
1061 ++{
1062 ++ struct bfqio_cgroup *bgrp;
1063 ++
1064 ++ if (parent_css != NULL) {
1065 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
1066 ++ if (bgrp == NULL)
1067 ++ return ERR_PTR(-ENOMEM);
1068 ++ } else
1069 ++ bgrp = &bfqio_root_cgroup;
1070 ++
1071 ++ spin_lock_init(&bgrp->lock);
1072 ++ INIT_HLIST_HEAD(&bgrp->group_data);
1073 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
1074 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
1075 ++
1076 ++ return &bgrp->css;
1077 ++}
1078 ++
1079 ++/*
1080 ++ * We cannot support shared io contexts, as we have no means to support
1081 ++ * two tasks with the same ioc in two different groups without major rework
1082 ++ * of the main bic/bfqq data structures. For now we allow a task to change
1083 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
1084 ++ * behavior is that a group containing a task that forked using CLONE_IO
1085 ++ * will not be destroyed until the tasks sharing the ioc die.
1086 ++ */
1087 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
1088 ++ struct cgroup_taskset *tset)
1089 ++{
1090 ++ struct task_struct *task;
1091 ++ struct io_context *ioc;
1092 ++ int ret = 0;
1093 ++
1094 ++ cgroup_taskset_for_each(task, css, tset) {
1095 ++ /*
1096 ++ * task_lock() is needed to avoid races with
1097 ++ * exit_io_context()
1098 ++ */
1099 ++ task_lock(task);
1100 ++ ioc = task->io_context;
1101 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1102 ++ /*
1103 ++ * ioc == NULL means that the task is either too young
1104 ++ * or exiting: if it still has no ioc, the ioc can't be
1105 ++ * shared, if the task is exiting the attach will fail
1106 ++ * anyway, no matter what we return here.
1107 ++ */
1108 ++ ret = -EINVAL;
1109 ++ task_unlock(task);
1110 ++ if (ret)
1111 ++ break;
1112 ++ }
1113 ++
1114 ++ return ret;
1115 ++}
1116 ++
1117 ++static void bfqio_attach(struct cgroup_subsys_state *css,
1118 ++ struct cgroup_taskset *tset)
1119 ++{
1120 ++ struct task_struct *task;
1121 ++ struct io_context *ioc;
1122 ++ struct io_cq *icq;
1123 ++
1124 ++ /*
1125 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
1126 ++ * new group has not yet been tested.
1127 ++ */
1128 ++ cgroup_taskset_for_each(task, css, tset) {
1129 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1130 ++ if (ioc) {
1131 ++ /*
1132 ++ * Handle cgroup change here.
1133 ++ */
1134 ++ rcu_read_lock();
1135 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1136 ++ if (!strncmp(
1137 ++ icq->q->elevator->type->elevator_name,
1138 ++ "bfq", ELV_NAME_MAX))
1139 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
1140 ++ css);
1141 ++ rcu_read_unlock();
1142 ++ put_io_context(ioc);
1143 ++ }
1144 ++ }
1145 ++}
1146 ++
1147 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
1148 ++{
1149 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1150 ++ struct hlist_node *tmp;
1151 ++ struct bfq_group *bfqg;
1152 ++
1153 ++ /*
1154 ++ * Since we are destroying the cgroup, there are no more tasks
1155 ++ * referencing it, and all the RCU grace periods that may have
1156 ++ * referenced it are ended (as the destruction of the parent
1157 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1158 ++ * anything else and we don't need any synchronization.
1159 ++ */
1160 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1161 ++ bfq_destroy_group(bgrp, bfqg);
1162 ++
1163 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
1164 ++
1165 ++ kfree(bgrp);
1166 ++}
1167 ++
1168 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
1169 ++{
1170 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1171 ++
1172 ++ mutex_lock(&bfqio_mutex);
1173 ++ bgrp->online = true;
1174 ++ mutex_unlock(&bfqio_mutex);
1175 ++
1176 ++ return 0;
1177 ++}
1178 ++
1179 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
1180 ++{
1181 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1182 ++
1183 ++ mutex_lock(&bfqio_mutex);
1184 ++ bgrp->online = false;
1185 ++ mutex_unlock(&bfqio_mutex);
1186 ++}
1187 ++
1188 ++struct cgroup_subsys bfqio_subsys = {
1189 ++ .name = "bfqio",
1190 ++ .css_alloc = bfqio_create,
1191 ++ .css_online = bfqio_css_online,
1192 ++ .css_offline = bfqio_css_offline,
1193 ++ .can_attach = bfqio_can_attach,
1194 ++ .attach = bfqio_attach,
1195 ++ .css_free = bfqio_destroy,
1196 ++ .subsys_id = bfqio_subsys_id,
1197 ++ .base_cftypes = bfqio_files,
1198 ++};
1199 ++#else
1200 ++static inline void bfq_init_entity(struct bfq_entity *entity,
1201 ++ struct bfq_group *bfqg)
1202 ++{
1203 ++ entity->weight = entity->new_weight;
1204 ++ entity->orig_weight = entity->new_weight;
1205 ++ entity->ioprio = entity->new_ioprio;
1206 ++ entity->ioprio_class = entity->new_ioprio_class;
1207 ++ entity->sched_data = &bfqg->sched_data;
1208 ++}
1209 ++
1210 ++static inline struct bfq_group *
1211 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1212 ++{
1213 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
1214 ++ return bfqd->root_group;
1215 ++}
1216 ++
1217 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1218 ++ struct bfq_queue *bfqq,
1219 ++ struct bfq_entity *entity,
1220 ++ struct bfq_group *bfqg)
1221 ++{
1222 ++}
1223 ++
1224 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
1225 ++{
1226 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1227 ++}
1228 ++
1229 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1230 ++{
1231 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
1232 ++}
1233 ++
1234 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
1235 ++{
1236 ++ kfree(bfqd->root_group);
1237 ++}
1238 ++
1239 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1240 ++{
1241 ++ struct bfq_group *bfqg;
1242 ++ int i;
1243 ++
1244 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1245 ++ if (bfqg == NULL)
1246 ++ return NULL;
1247 ++
1248 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1249 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1250 ++
1251 ++ return bfqg;
1252 ++}
1253 ++#endif
1254 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1255 +new file mode 100644
1256 +index 0000000..7f6b000
1257 +--- /dev/null
1258 ++++ b/block/bfq-ioc.c
1259 +@@ -0,0 +1,36 @@
1260 ++/*
1261 ++ * BFQ: I/O context handling.
1262 ++ *
1263 ++ * Based on ideas and code from CFQ:
1264 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1265 ++ *
1266 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1267 ++ * Paolo Valente <paolo.valente@×××××××.it>
1268 ++ *
1269 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1270 ++ */
1271 ++
1272 ++/**
1273 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1274 ++ * @icq: the iocontext queue.
1275 ++ */
1276 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1277 ++{
1278 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
1279 ++ return container_of(icq, struct bfq_io_cq, icq);
1280 ++}
1281 ++
1282 ++/**
1283 ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1284 ++ * @bfqd: the lookup key.
1285 ++ * @ioc: the io_context of the process doing I/O.
1286 ++ *
1287 ++ * Queue lock must be held.
1288 ++ */
1289 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1290 ++ struct io_context *ioc)
1291 ++{
1292 ++ if (ioc)
1293 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1294 ++ return NULL;
1295 ++}
1296 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1297 +new file mode 100644
1298 +index 0000000..7670400
1299 +--- /dev/null
1300 ++++ b/block/bfq-iosched.c
1301 +@@ -0,0 +1,3268 @@
1302 ++/*
1303 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
1304 ++ *
1305 ++ * Based on ideas and code from CFQ:
1306 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1307 ++ *
1308 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1309 ++ * Paolo Valente <paolo.valente@×××××××.it>
1310 ++ *
1311 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1312 ++ *
1313 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1314 ++ *
1315 ++ * BFQ is a proportional share disk scheduling algorithm based on the
1316 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
1317 ++ * number of sectors, to tasks instead of time slices. The disk is not granted
1318 ++ * to the in-service task for a given time slice, but until it has exhausted
1319 ++ * its assigned budget. This change from the time to the service domain allows
1320 ++ * BFQ to distribute the disk bandwidth among tasks as desired, without any
1321 ++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
1322 ++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
1323 ++ * their budgets (more precisely BFQ schedules queues associated to tasks).
1324 ++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
1325 ++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
1326 ++ * latencies to interactive and soft real-time applications.
1327 ++ *
1328 ++ * BFQ is described in [1], where also a reference to the initial, more
1329 ++ * theoretical paper on BFQ can be found. The interested reader can find in
1330 ++ * the latter paper full details on the main algorithm as well as formulas of
1331 ++ * the guarantees, plus formal proofs of all the properties. With respect to
1332 ++ * the version of BFQ presented in these papers, this implementation adds a
1333 ++ * few more heuristics, such as the one that guarantees a low latency to soft
1334 ++ * real-time applications, and a hierarchical extension based on H-WF2Q+.
1335 ++ *
1336 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1337 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1338 ++ * complexity derives from the one introduced with EEVDF in [3].
1339 ++ *
1340 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1341 ++ * with the BFQ Disk I/O Scheduler'',
1342 ++ * Proceedings of the 5th Annual International Systems and Storage
1343 ++ * Conference (SYSTOR '12), June 2012.
1344 ++ *
1345 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1346 ++ *
1347 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1348 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1349 ++ * Oct 1997.
1350 ++ *
1351 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1352 ++ *
1353 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1354 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
1355 ++ * Resource Allocation,'' technical report.
1356 ++ *
1357 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1358 ++ */
1359 ++#include <linux/module.h>
1360 ++#include <linux/slab.h>
1361 ++#include <linux/blkdev.h>
1362 ++#include <linux/cgroup.h>
1363 ++#include <linux/elevator.h>
1364 ++#include <linux/jiffies.h>
1365 ++#include <linux/rbtree.h>
1366 ++#include <linux/ioprio.h>
1367 ++#include "bfq.h"
1368 ++#include "blk.h"
1369 ++
1370 ++/* Max number of dispatches in one round of service. */
1371 ++static const int bfq_quantum = 4;
1372 ++
1373 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1374 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1375 ++
1376 ++/* Maximum backwards seek, in KiB. */
1377 ++static const int bfq_back_max = 16 * 1024;
1378 ++
1379 ++/* Penalty of a backwards seek, in number of sectors. */
1380 ++static const int bfq_back_penalty = 2;
1381 ++
1382 ++/* Idling period duration, in jiffies. */
1383 ++static int bfq_slice_idle = HZ / 125;
1384 ++
1385 ++/* Default maximum budget values, in sectors and number of requests. */
1386 ++static const int bfq_default_max_budget = 16 * 1024;
1387 ++static const int bfq_max_budget_async_rq = 4;
1388 ++
1389 ++/*
1390 ++ * Async to sync throughput distribution is controlled as follows:
1391 ++ * when an async request is served, the entity is charged the number
1392 ++ * of sectors of the request, multiplied by the factor below
1393 ++ */
1394 ++static const int bfq_async_charge_factor = 10;
1395 ++
1396 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
1397 ++static const int bfq_timeout_sync = HZ / 8;
1398 ++static int bfq_timeout_async = HZ / 25;
1399 ++
1400 ++struct kmem_cache *bfq_pool;
1401 ++
1402 ++/* Below this threshold (in ms), we consider thinktime immediate. */
1403 ++#define BFQ_MIN_TT 2
1404 ++
1405 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
1406 ++#define BFQ_HW_QUEUE_THRESHOLD 4
1407 ++#define BFQ_HW_QUEUE_SAMPLES 32
1408 ++
1409 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1410 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1411 ++
1412 ++/* Min samples used for peak rate estimation (for autotuning). */
1413 ++#define BFQ_PEAK_RATE_SAMPLES 32
1414 ++
1415 ++/* Shift used for peak rate fixed precision calculations. */
1416 ++#define BFQ_RATE_SHIFT 16
1417 ++
1418 ++/*
1419 ++ * The duration of the weight raising for interactive applications is
1420 ++ * computed automatically (as default behaviour), using the following
1421 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
1422 ++ * disk, and R and T are two reference parameters. In particular, R is
1423 ++ * the peak rate of a reference disk, and T is about the maximum time
1424 ++ * for starting popular large applications on that disk, under BFQ and
1425 ++ * while reading two files in parallel. Finally, BFQ uses two
1426 ++ * different pairs (R, T) depending on whether the disk is rotational
1427 ++ * or non-rotational.
1428 ++ */
1429 ++#define T_rot (msecs_to_jiffies(5500))
1430 ++#define T_nonrot (msecs_to_jiffies(2000))
1431 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1432 ++#define R_rot 17415
1433 ++#define R_nonrot 34791
1434 ++
1435 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1436 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1437 ++
1438 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1439 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1440 ++
1441 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1442 ++
1443 ++#include "bfq-ioc.c"
1444 ++#include "bfq-sched.c"
1445 ++#include "bfq-cgroup.c"
1446 ++
1447 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1448 ++ IOPRIO_CLASS_IDLE)
1449 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1450 ++ IOPRIO_CLASS_RT)
1451 ++
1452 ++#define bfq_sample_valid(samples) ((samples) > 80)
1453 ++
1454 ++/*
1455 ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1456 ++ * set (in which case it could also be a direct WRITE).
1457 ++ */
1458 ++static inline int bfq_bio_sync(struct bio *bio)
1459 ++{
1460 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1461 ++ return 1;
1462 ++
1463 ++ return 0;
1464 ++}
1465 ++
1466 ++/*
1467 ++ * Scheduler run of queue, if there are requests pending and no one in the
1468 ++ * driver that will restart queueing.
1469 ++ */
1470 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1471 ++{
1472 ++ if (bfqd->queued != 0) {
1473 ++ bfq_log(bfqd, "schedule dispatch");
1474 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1475 ++ }
1476 ++}
1477 ++
1478 ++/*
1479 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1480 ++ * We choose the request that is closest to the head right now. Distance
1481 ++ * behind the head is penalized and only allowed to a certain extent.
1482 ++ */
1483 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
1484 ++ struct request *rq1,
1485 ++ struct request *rq2,
1486 ++ sector_t last)
1487 ++{
1488 ++ sector_t s1, s2, d1 = 0, d2 = 0;
1489 ++ unsigned long back_max;
1490 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1491 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1492 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1493 ++
1494 ++ if (rq1 == NULL || rq1 == rq2)
1495 ++ return rq2;
1496 ++ if (rq2 == NULL)
1497 ++ return rq1;
1498 ++
1499 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1500 ++ return rq1;
1501 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1502 ++ return rq2;
1503 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1504 ++ return rq1;
1505 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1506 ++ return rq2;
1507 ++
1508 ++ s1 = blk_rq_pos(rq1);
1509 ++ s2 = blk_rq_pos(rq2);
1510 ++
1511 ++ /*
1512 ++ * By definition, 1KiB is 2 sectors.
1513 ++ */
1514 ++ back_max = bfqd->bfq_back_max * 2;
1515 ++
1516 ++ /*
1517 ++ * Strict one way elevator _except_ in the case where we allow
1518 ++ * short backward seeks which are biased as twice the cost of a
1519 ++ * similar forward seek.
1520 ++ */
1521 ++ if (s1 >= last)
1522 ++ d1 = s1 - last;
1523 ++ else if (s1 + back_max >= last)
1524 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
1525 ++ else
1526 ++ wrap |= BFQ_RQ1_WRAP;
1527 ++
1528 ++ if (s2 >= last)
1529 ++ d2 = s2 - last;
1530 ++ else if (s2 + back_max >= last)
1531 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
1532 ++ else
1533 ++ wrap |= BFQ_RQ2_WRAP;
1534 ++
1535 ++ /* Found required data */
1536 ++
1537 ++ /*
1538 ++ * By doing switch() on the bit mask "wrap" we avoid having to
1539 ++ * check two variables for all permutations: --> faster!
1540 ++ */
1541 ++ switch (wrap) {
1542 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1543 ++ if (d1 < d2)
1544 ++ return rq1;
1545 ++ else if (d2 < d1)
1546 ++ return rq2;
1547 ++ else {
1548 ++ if (s1 >= s2)
1549 ++ return rq1;
1550 ++ else
1551 ++ return rq2;
1552 ++ }
1553 ++
1554 ++ case BFQ_RQ2_WRAP:
1555 ++ return rq1;
1556 ++ case BFQ_RQ1_WRAP:
1557 ++ return rq2;
1558 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1559 ++ default:
1560 ++ /*
1561 ++ * Since both rqs are wrapped,
1562 ++ * start with the one that's further behind head
1563 ++ * (--> only *one* back seek required),
1564 ++ * since back seek takes more time than forward.
1565 ++ */
1566 ++ if (s1 <= s2)
1567 ++ return rq1;
1568 ++ else
1569 ++ return rq2;
1570 ++ }
1571 ++}
1572 ++
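The seek-distance rule in bfq_choose_req() above can be tried out in isolation. The standalone userspace sketch below mirrors it with made-up values for BACK_MAX and BACK_PENALTY (rough stand-ins for bfqd->bfq_back_max * 2 and bfqd->bfq_back_penalty); it is only an illustration, not part of the patch.

#include <stdio.h>

/* Hypothetical stand-ins for bfqd->bfq_back_max * 2 and bfqd->bfq_back_penalty. */
#define BACK_MAX     (16 * 1024)   /* made-up back_max, in sectors */
#define BACK_PENALTY 2

/* Distance of sector s from the head position 'last': forward seeks count
 * plainly, short backward seeks are penalized, far-behind requests wrap. */
static unsigned long long seek_distance(unsigned long long s,
                                        unsigned long long last, int *wrap)
{
        *wrap = 0;
        if (s >= last)
                return s - last;                  /* forward seek: plain distance */
        if (s + BACK_MAX >= last)
                return (last - s) * BACK_PENALTY; /* short backward seek: penalized */
        *wrap = 1;                                /* too far behind the head */
        return 0;
}

int main(void)
{
        int w1, w2;
        unsigned long long d1 = seek_distance(1200, 1000, &w1); /* ahead by 200 */
        unsigned long long d2 = seek_distance(900, 1000, &w2);  /* behind by 100 -> 200 */
        printf("d1=%llu wrap=%d, d2=%llu wrap=%d\n", d1, w1, d2, w2);
        return 0;
}

With these numbers a 200-sector forward seek and a 100-sector backward seek end up with the same effective distance, which is exactly the bias the comment above describes.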
1573 ++static struct bfq_queue *
1574 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1575 ++ sector_t sector, struct rb_node **ret_parent,
1576 ++ struct rb_node ***rb_link)
1577 ++{
1578 ++ struct rb_node **p, *parent;
1579 ++ struct bfq_queue *bfqq = NULL;
1580 ++
1581 ++ parent = NULL;
1582 ++ p = &root->rb_node;
1583 ++ while (*p) {
1584 ++ struct rb_node **n;
1585 ++
1586 ++ parent = *p;
1587 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1588 ++
1589 ++ /*
1590 ++ * Sort strictly based on sector. Smallest to the left,
1591 ++ * largest to the right.
1592 ++ */
1593 ++ if (sector > blk_rq_pos(bfqq->next_rq))
1594 ++ n = &(*p)->rb_right;
1595 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
1596 ++ n = &(*p)->rb_left;
1597 ++ else
1598 ++ break;
1599 ++ p = n;
1600 ++ bfqq = NULL;
1601 ++ }
1602 ++
1603 ++ *ret_parent = parent;
1604 ++ if (rb_link)
1605 ++ *rb_link = p;
1606 ++
1607 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1608 ++ (long long unsigned)sector,
1609 ++ bfqq != NULL ? bfqq->pid : 0);
1610 ++
1611 ++ return bfqq;
1612 ++}
1613 ++
1614 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1615 ++{
1616 ++ struct rb_node **p, *parent;
1617 ++ struct bfq_queue *__bfqq;
1618 ++
1619 ++ if (bfqq->pos_root != NULL) {
1620 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1621 ++ bfqq->pos_root = NULL;
1622 ++ }
1623 ++
1624 ++ if (bfq_class_idle(bfqq))
1625 ++ return;
1626 ++ if (!bfqq->next_rq)
1627 ++ return;
1628 ++
1629 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
1630 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1631 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
1632 ++ if (__bfqq == NULL) {
1633 ++ rb_link_node(&bfqq->pos_node, parent, p);
1634 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1635 ++ } else
1636 ++ bfqq->pos_root = NULL;
1637 ++}
1638 ++
1639 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1640 ++ struct bfq_queue *bfqq,
1641 ++ struct request *last)
1642 ++{
1643 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
1644 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
1645 ++ struct request *next = NULL, *prev = NULL;
1646 ++
1647 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1648 ++
1649 ++ if (rbprev != NULL)
1650 ++ prev = rb_entry_rq(rbprev);
1651 ++
1652 ++ if (rbnext != NULL)
1653 ++ next = rb_entry_rq(rbnext);
1654 ++ else {
1655 ++ rbnext = rb_first(&bfqq->sort_list);
1656 ++ if (rbnext && rbnext != &last->rb_node)
1657 ++ next = rb_entry_rq(rbnext);
1658 ++ }
1659 ++
1660 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1661 ++}
1662 ++
1663 ++static void bfq_del_rq_rb(struct request *rq)
1664 ++{
1665 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1666 ++ struct bfq_data *bfqd = bfqq->bfqd;
1667 ++ const int sync = rq_is_sync(rq);
1668 ++
1669 ++ BUG_ON(bfqq->queued[sync] == 0);
1670 ++ bfqq->queued[sync]--;
1671 ++ bfqd->queued--;
1672 ++
1673 ++ elv_rb_del(&bfqq->sort_list, rq);
1674 ++
1675 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1676 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1677 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1678 ++ /*
1679 ++ * Remove queue from request-position tree as it is empty.
1680 ++ */
1681 ++ if (bfqq->pos_root != NULL) {
1682 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1683 ++ bfqq->pos_root = NULL;
1684 ++ }
1685 ++ }
1686 ++}
1687 ++
1688 ++/* see the definition of bfq_async_charge_factor for details */
1689 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
1690 ++ struct bfq_queue *bfqq)
1691 ++{
1692 ++ return blk_rq_sectors(rq) *
1693 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1694 ++ bfq_async_charge_factor));
1695 ++}
1696 ++
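The charging rule in bfq_serv_to_charge() is compact enough to be easy to misread: only non-weight-raised async queues pay the extra factor. The sketch below works through it with a hypothetical value for bfq_async_charge_factor (the real constant is defined elsewhere in the patch).

#include <stdio.h>

/* Hypothetical stand-in for bfq_async_charge_factor. */
#define ASYNC_CHARGE_FACTOR 10

/* Mirror of the rule above: sync queues and weight-raised queues
 * (raising_coeff > 1) are charged the plain size in sectors; non-raised
 * async queues are charged (1 + ASYNC_CHARGE_FACTOR) times the size. */
static unsigned long serv_to_charge(unsigned long sectors, int sync,
                                    unsigned int raising_coeff)
{
        return sectors * (1 + (!sync) * (raising_coeff == 1) * ASYNC_CHARGE_FACTOR);
}

int main(void)
{
        printf("sync 8-sector rq:         %lu\n", serv_to_charge(8, 1, 1)); /* 8 */
        printf("async 8-sector rq:        %lu\n", serv_to_charge(8, 0, 1)); /* 88 */
        printf("raised async 8-sector rq: %lu\n", serv_to_charge(8, 0, 2)); /* 8 */
        return 0;
}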
1697 ++/**
1698 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
1699 ++ * @bfqd: the device data the queue belongs to.
1700 ++ * @bfqq: the queue to update.
1701 ++ *
1702 ++ * If the first request of a queue changes we make sure that the queue
1703 ++ * has enough budget to serve at least its first request (if the
1704 ++ * request has grown). We do this because if the queue does not have enough
1705 ++ * budget for its first request, it has to go through two dispatch
1706 ++ * rounds to actually get it dispatched.
1707 ++ */
1708 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
1709 ++ struct bfq_queue *bfqq)
1710 ++{
1711 ++ struct bfq_entity *entity = &bfqq->entity;
1712 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1713 ++ struct request *next_rq = bfqq->next_rq;
1714 ++ unsigned long new_budget;
1715 ++
1716 ++ if (next_rq == NULL)
1717 ++ return;
1718 ++
1719 ++ if (bfqq == bfqd->in_service_queue)
1720 ++ /*
1721 ++ * In order not to break guarantees, budgets cannot be
1722 ++ * changed after an entity has been selected.
1723 ++ */
1724 ++ return;
1725 ++
1726 ++ BUG_ON(entity->tree != &st->active);
1727 ++ BUG_ON(entity == entity->sched_data->active_entity);
1728 ++
1729 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
1730 ++ bfq_serv_to_charge(next_rq, bfqq));
1731 ++ entity->budget = new_budget;
1732 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1733 ++ bfq_activate_bfqq(bfqd, bfqq);
1734 ++}
1735 ++
1736 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1737 ++{
1738 ++ u64 dur;
1739 ++
1740 ++ if (bfqd->bfq_raising_max_time > 0)
1741 ++ return bfqd->bfq_raising_max_time;
1742 ++
1743 ++ dur = bfqd->RT_prod;
1744 ++ do_div(dur, bfqd->peak_rate);
1745 ++
1746 ++ return dur;
1747 ++}
1748 ++
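The duration computed here is simply RT_prod (a reference rate times a reference duration, kept in bfqd->RT_prod) divided by the measured peak rate, unless the user forced bfq_raising_max_time, so slower devices get longer weight-raising periods. A rough standalone sketch with made-up numbers:

#include <stdio.h>
#include <stdint.h>

/* Sketch of the rule above: rt_prod stands in for bfqd->RT_prod and the
 * plain division stands in for do_div(dur, bfqd->peak_rate). */
static uint64_t wrais_duration(uint64_t rt_prod, uint64_t peak_rate,
                               uint64_t max_time_override)
{
        if (max_time_override > 0)
                return max_time_override;  /* user-set bfq_raising_max_time wins */
        return rt_prod / peak_rate;
}

int main(void)
{
        uint64_t rt_prod = 1000000;        /* hypothetical reference product */
        printf("slow device: %llu\n",
               (unsigned long long)wrais_duration(rt_prod, 100, 0));  /* 10000 */
        printf("fast device: %llu\n",
               (unsigned long long)wrais_duration(rt_prod, 1000, 0)); /* 1000 */
        return 0;
}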
1749 ++static void bfq_add_rq_rb(struct request *rq)
1750 ++{
1751 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1752 ++ struct bfq_entity *entity = &bfqq->entity;
1753 ++ struct bfq_data *bfqd = bfqq->bfqd;
1754 ++ struct request *next_rq, *prev;
1755 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
1756 ++ int idle_for_long_time = 0;
1757 ++
1758 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1759 ++ bfqq->queued[rq_is_sync(rq)]++;
1760 ++ bfqd->queued++;
1761 ++
1762 ++ elv_rb_add(&bfqq->sort_list, rq);
1763 ++
1764 ++ /*
1765 ++ * Check if this request is a better next-serve candidate.
1766 ++ */
1767 ++ prev = bfqq->next_rq;
1768 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1769 ++ BUG_ON(next_rq == NULL);
1770 ++ bfqq->next_rq = next_rq;
1771 ++
1772 ++ /*
1773 ++ * Adjust priority tree position, if next_rq changes.
1774 ++ */
1775 ++ if (prev != bfqq->next_rq)
1776 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
1777 ++
1778 ++ if (!bfq_bfqq_busy(bfqq)) {
1779 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1780 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
1781 ++ idle_for_long_time = time_is_before_jiffies(
1782 ++ bfqq->budget_timeout +
1783 ++ bfqd->bfq_raising_min_idle_time);
1784 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
1785 ++ bfq_serv_to_charge(next_rq, bfqq));
1786 ++
1787 ++ if (!bfqd->low_latency)
1788 ++ goto add_bfqq_busy;
1789 ++
1790 ++ /*
1791 ++ * If the queue is not being boosted and has been idle
1792 ++ * for enough time, start a weight-raising period
1793 ++ */
1794 ++ if (old_raising_coeff == 1 &&
1795 ++ (idle_for_long_time || soft_rt)) {
1796 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1797 ++ if (idle_for_long_time)
1798 ++ bfqq->raising_cur_max_time =
1799 ++ bfq_wrais_duration(bfqd);
1800 ++ else
1801 ++ bfqq->raising_cur_max_time =
1802 ++ bfqd->bfq_raising_rt_max_time;
1803 ++ bfq_log_bfqq(bfqd, bfqq,
1804 ++ "wrais starting at %llu msec,"
1805 ++ "rais_max_time %u",
1806 ++ bfqq->last_rais_start_finish,
1807 ++ jiffies_to_msecs(bfqq->
1808 ++ raising_cur_max_time));
1809 ++ } else if (old_raising_coeff > 1) {
1810 ++ if (idle_for_long_time)
1811 ++ bfqq->raising_cur_max_time =
1812 ++ bfq_wrais_duration(bfqd);
1813 ++ else if (bfqq->raising_cur_max_time ==
1814 ++ bfqd->bfq_raising_rt_max_time &&
1815 ++ !soft_rt) {
1816 ++ bfqq->raising_coeff = 1;
1817 ++ bfq_log_bfqq(bfqd, bfqq,
1818 ++ "wrais ending at %llu msec,"
1819 ++ "rais_max_time %u",
1820 ++ bfqq->last_rais_start_finish,
1821 ++ jiffies_to_msecs(bfqq->
1822 ++ raising_cur_max_time));
1823 ++ } else if ((bfqq->last_rais_start_finish +
1824 ++ bfqq->raising_cur_max_time <
1825 ++ jiffies + bfqd->bfq_raising_rt_max_time) &&
1826 ++ soft_rt) {
1827 ++ /*
1828 ++ *
1829 ++ * The remaining weight-raising time is lower
1830 ++ * than bfqd->bfq_raising_rt_max_time, which
1831 ++ * means that the application is enjoying
1832 ++ * weight raising either because deemed soft rt
1833 ++ * in the near past, or because deemed
1834 ++ * interactive long ago. In both cases,
1835 ++ * resetting now the current remaining weight-
1836 ++ * raising time for the application to the
1837 ++ * weight-raising duration for soft rt
1838 ++ * applications would not cause any latency
1839 ++ * increase for the application (as the new
1840 ++ * duration would be higher than the remaining
1841 ++ * time).
1842 ++ *
1843 ++ * In addition, the application is now meeting
1844 ++ * the requirements for being deemed soft rt.
1845 ++ * In the end we can correctly and safely
1846 ++ * (re)charge the weight-raising duration for
1847 ++ * the application with the weight-raising
1848 ++ * duration for soft rt applications.
1849 ++ *
1850 ++ * In particular, doing this recharge now, i.e.,
1851 ++ * before the weight-raising period for the
1852 ++ * application finishes, reduces the probability
1853 ++ * of the following negative scenario:
1854 ++ * 1) the weight of a soft rt application is
1855 ++ * raised at startup (as for any newly
1856 ++ * created application),
1857 ++ * 2) since the application is not interactive,
1858 ++ * at a certain time weight-raising is
1859 ++ * stopped for the application,
1860 ++ * 3) at that time the application happens to
1861 ++ * still have pending requests, and hence
1862 ++ * is destined to not have a chance to be
1863 ++ * deemed soft rt before these requests are
1864 ++ * completed (see the comments to the
1865 ++ * function bfq_bfqq_softrt_next_start()
1866 ++ * for details on soft rt detection),
1867 ++ * 4) these pending requests experience a high
1868 ++ * latency because the application is not
1869 ++ * weight-raised while they are pending.
1870 ++ */
1871 ++ bfqq->last_rais_start_finish = jiffies;
1872 ++ bfqq->raising_cur_max_time =
1873 ++ bfqd->bfq_raising_rt_max_time;
1874 ++ }
1875 ++ }
1876 ++ if (old_raising_coeff != bfqq->raising_coeff)
1877 ++ entity->ioprio_changed = 1;
1878 ++add_bfqq_busy:
1879 ++ bfqq->last_idle_bklogged = jiffies;
1880 ++ bfqq->service_from_backlogged = 0;
1881 ++ bfq_clear_bfqq_softrt_update(bfqq);
1882 ++ bfq_add_bfqq_busy(bfqd, bfqq);
1883 ++ } else {
1884 ++ if (bfqd->low_latency && old_raising_coeff == 1 &&
1885 ++ !rq_is_sync(rq) &&
1886 ++ bfqq->last_rais_start_finish +
1887 ++ time_is_before_jiffies(
1888 ++ bfqd->bfq_raising_min_inter_arr_async)) {
1889 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1890 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1891 ++
1892 ++ bfqd->raised_busy_queues++;
1893 ++ entity->ioprio_changed = 1;
1894 ++ bfq_log_bfqq(bfqd, bfqq,
1895 ++ "non-idle wrais starting at %llu msec,"
1896 ++ "rais_max_time %u",
1897 ++ bfqq->last_rais_start_finish,
1898 ++ jiffies_to_msecs(bfqq->
1899 ++ raising_cur_max_time));
1900 ++ }
1901 ++ bfq_updated_next_req(bfqd, bfqq);
1902 ++ }
1903 ++
1904 ++ if (bfqd->low_latency &&
1905 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1906 ++ idle_for_long_time))
1907 ++ bfqq->last_rais_start_finish = jiffies;
1908 ++}
1909 ++
1910 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1911 ++{
1912 ++ elv_rb_del(&bfqq->sort_list, rq);
1913 ++ bfqq->queued[rq_is_sync(rq)]--;
1914 ++ bfqq->bfqd->queued--;
1915 ++ bfq_add_rq_rb(rq);
1916 ++}
1917 ++
1918 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1919 ++ struct bio *bio)
1920 ++{
1921 ++ struct task_struct *tsk = current;
1922 ++ struct bfq_io_cq *bic;
1923 ++ struct bfq_queue *bfqq;
1924 ++
1925 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1926 ++ if (bic == NULL)
1927 ++ return NULL;
1928 ++
1929 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1930 ++ if (bfqq != NULL)
1931 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
1932 ++
1933 ++ return NULL;
1934 ++}
1935 ++
1936 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
1937 ++{
1938 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1939 ++
1940 ++ bfqd->rq_in_driver++;
1941 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1942 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1943 ++ (long long unsigned)bfqd->last_position);
1944 ++}
1945 ++
1946 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1947 ++{
1948 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1949 ++
1950 ++ WARN_ON(bfqd->rq_in_driver == 0);
1951 ++ bfqd->rq_in_driver--;
1952 ++}
1953 ++
1954 ++static void bfq_remove_request(struct request *rq)
1955 ++{
1956 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1957 ++ struct bfq_data *bfqd = bfqq->bfqd;
1958 ++
1959 ++ if (bfqq->next_rq == rq) {
1960 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1961 ++ bfq_updated_next_req(bfqd, bfqq);
1962 ++ }
1963 ++
1964 ++ list_del_init(&rq->queuelist);
1965 ++ bfq_del_rq_rb(rq);
1966 ++
1967 ++ if (rq->cmd_flags & REQ_META) {
1968 ++ WARN_ON(bfqq->meta_pending == 0);
1969 ++ bfqq->meta_pending--;
1970 ++ }
1971 ++}
1972 ++
1973 ++static int bfq_merge(struct request_queue *q, struct request **req,
1974 ++ struct bio *bio)
1975 ++{
1976 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1977 ++ struct request *__rq;
1978 ++
1979 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
1980 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1981 ++ *req = __rq;
1982 ++ return ELEVATOR_FRONT_MERGE;
1983 ++ }
1984 ++
1985 ++ return ELEVATOR_NO_MERGE;
1986 ++}
1987 ++
1988 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
1989 ++ int type)
1990 ++{
1991 ++ if (type == ELEVATOR_FRONT_MERGE) {
1992 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
1993 ++
1994 ++ bfq_reposition_rq_rb(bfqq, req);
1995 ++ }
1996 ++}
1997 ++
1998 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1999 ++ struct request *next)
2000 ++{
2001 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2002 ++
2003 ++ /*
2004 ++ * Reposition in fifo if next is older than rq.
2005 ++ */
2006 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2007 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
2008 ++ list_move(&rq->queuelist, &next->queuelist);
2009 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
2010 ++ }
2011 ++
2012 ++ if (bfqq->next_rq == next)
2013 ++ bfqq->next_rq = rq;
2014 ++
2015 ++ bfq_remove_request(next);
2016 ++}
2017 ++
2018 ++/* Must be called with bfqq != NULL */
2019 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
2020 ++{
2021 ++ BUG_ON(bfqq == NULL);
2022 ++ if (bfq_bfqq_busy(bfqq))
2023 ++ bfqq->bfqd->raised_busy_queues--;
2024 ++ bfqq->raising_coeff = 1;
2025 ++ bfqq->raising_cur_max_time = 0;
2026 ++ /* Trigger a weight change on the next activation of the queue */
2027 ++ bfqq->entity.ioprio_changed = 1;
2028 ++}
2029 ++
2030 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
2031 ++ struct bfq_group *bfqg)
2032 ++{
2033 ++ int i, j;
2034 ++
2035 ++ for (i = 0; i < 2; i++)
2036 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
2037 ++ if (bfqg->async_bfqq[i][j] != NULL)
2038 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
2039 ++ if (bfqg->async_idle_bfqq != NULL)
2040 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
2041 ++}
2042 ++
2043 ++static void bfq_end_raising(struct bfq_data *bfqd)
2044 ++{
2045 ++ struct bfq_queue *bfqq;
2046 ++
2047 ++ spin_lock_irq(bfqd->queue->queue_lock);
2048 ++
2049 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
2050 ++ bfq_bfqq_end_raising(bfqq);
2051 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2052 ++ bfq_bfqq_end_raising(bfqq);
2053 ++ bfq_end_raising_async(bfqd);
2054 ++
2055 ++ spin_unlock_irq(bfqd->queue->queue_lock);
2056 ++}
2057 ++
2058 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
2059 ++ struct bio *bio)
2060 ++{
2061 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2062 ++ struct bfq_io_cq *bic;
2063 ++ struct bfq_queue *bfqq;
2064 ++
2065 ++ /*
2066 ++ * Disallow merge of a sync bio into an async request.
2067 ++ */
2068 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
2069 ++ return 0;
2070 ++
2071 ++ /*
2072 ++ * Lookup the bfqq that this bio will be queued with. Allow
2073 ++ * merge only if rq is queued there.
2074 ++ * Queue lock is held here.
2075 ++ */
2076 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
2077 ++ if (bic == NULL)
2078 ++ return 0;
2079 ++
2080 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2081 ++ return bfqq == RQ_BFQQ(rq);
2082 ++}
2083 ++
2084 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2085 ++ struct bfq_queue *bfqq)
2086 ++{
2087 ++ if (bfqq != NULL) {
2088 ++ bfq_mark_bfqq_must_alloc(bfqq);
2089 ++ bfq_mark_bfqq_budget_new(bfqq);
2090 ++ bfq_clear_bfqq_fifo_expire(bfqq);
2091 ++
2092 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
2093 ++
2094 ++ bfq_log_bfqq(bfqd, bfqq,
2095 ++ "set_in_service_queue, cur-budget = %lu",
2096 ++ bfqq->entity.budget);
2097 ++ }
2098 ++
2099 ++ bfqd->in_service_queue = bfqq;
2100 ++}
2101 ++
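The update of bfqd->budgets_assigned above is an integer low-pass counter that creeps towards 256 by one eighth of the remaining gap per assignment; bfq_max_budget() and bfq_min_budget() further down treat values of at least 194 (roughly three quarters of 256) as "enough budgets assigned to trust the autotuned maximum". A tiny sketch of how quickly that threshold is reached:

#include <stdio.h>

int main(void)
{
        int budgets_assigned = 0;

        /* Same recurrence as in __bfq_set_in_service_queue() above. */
        for (int i = 1; i <= 20; i++) {
                budgets_assigned = (budgets_assigned * 7 + 256) / 8;
                printf("after %2d assignments: %d\n", i, budgets_assigned);
                if (budgets_assigned >= 194) {
                        printf("autotuning threshold (194) reached\n");
                        break;
                }
        }
        return 0;
}

Starting from zero, the counter reaches 194 after eleven in-service assignments and keeps converging towards 256 afterwards.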
2102 ++/*
2103 ++ * Get and set a new queue for service.
2104 ++ */
2105 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
2106 ++ struct bfq_queue *bfqq)
2107 ++{
2108 ++ if (!bfqq)
2109 ++ bfqq = bfq_get_next_queue(bfqd);
2110 ++ else
2111 ++ bfq_get_next_queue_forced(bfqd, bfqq);
2112 ++
2113 ++ __bfq_set_in_service_queue(bfqd, bfqq);
2114 ++ return bfqq;
2115 ++}
2116 ++
2117 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2118 ++ struct request *rq)
2119 ++{
2120 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
2121 ++ return blk_rq_pos(rq) - bfqd->last_position;
2122 ++ else
2123 ++ return bfqd->last_position - blk_rq_pos(rq);
2124 ++}
2125 ++
2126 ++/*
2127 ++ * Return true if bfqq has no request pending and rq is close enough to
2128 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
2129 ++ * bfqq->next_rq
2130 ++ */
2131 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2132 ++{
2133 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2134 ++}
2135 ++
2136 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2137 ++{
2138 ++ struct rb_root *root = &bfqd->rq_pos_tree;
2139 ++ struct rb_node *parent, *node;
2140 ++ struct bfq_queue *__bfqq;
2141 ++ sector_t sector = bfqd->last_position;
2142 ++
2143 ++ if (RB_EMPTY_ROOT(root))
2144 ++ return NULL;
2145 ++
2146 ++ /*
2147 ++ * First, if we find a request starting at the end of the last
2148 ++ * request, choose it.
2149 ++ */
2150 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2151 ++ if (__bfqq != NULL)
2152 ++ return __bfqq;
2153 ++
2154 ++ /*
2155 ++ * If the exact sector wasn't found, the parent of the NULL leaf
2156 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
2157 ++ * position).
2158 ++ */
2159 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2160 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2161 ++ return __bfqq;
2162 ++
2163 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
2164 ++ node = rb_next(&__bfqq->pos_node);
2165 ++ else
2166 ++ node = rb_prev(&__bfqq->pos_node);
2167 ++ if (node == NULL)
2168 ++ return NULL;
2169 ++
2170 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2171 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2172 ++ return __bfqq;
2173 ++
2174 ++ return NULL;
2175 ++}
2176 ++
2177 ++/*
2178 ++ * bfqd - device data.
2179 ++ * cur_bfqq - passed in so that we don't decide that the current queue
2180 ++ * is closely cooperating with itself.
2181 ++ *
2182 ++ * We are assuming that cur_bfqq has dispatched at least one request,
2183 ++ * and that bfqd->last_position reflects a position on the disk associated
2184 ++ * with the I/O issued by cur_bfqq.
2185 ++ */
2186 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2187 ++ struct bfq_queue *cur_bfqq)
2188 ++{
2189 ++ struct bfq_queue *bfqq;
2190 ++
2191 ++ if (bfq_class_idle(cur_bfqq))
2192 ++ return NULL;
2193 ++ if (!bfq_bfqq_sync(cur_bfqq))
2194 ++ return NULL;
2195 ++ if (BFQQ_SEEKY(cur_bfqq))
2196 ++ return NULL;
2197 ++
2198 ++ /* If device has only one backlogged bfq_queue, don't search. */
2199 ++ if (bfqd->busy_queues == 1)
2200 ++ return NULL;
2201 ++
2202 ++ /*
2203 ++ * We should notice if some of the queues are cooperating, e.g.
2204 ++ * working closely on the same area of the disk. In that case,
2205 ++ * we can group them together and not waste time idling.
2206 ++ */
2207 ++ bfqq = bfqq_close(bfqd);
2208 ++ if (bfqq == NULL || bfqq == cur_bfqq)
2209 ++ return NULL;
2210 ++
2211 ++ /*
2212 ++ * Do not merge queues from different bfq_groups.
2213 ++ */
2214 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2215 ++ return NULL;
2216 ++
2217 ++ /*
2218 ++ * It only makes sense to merge sync queues.
2219 ++ */
2220 ++ if (!bfq_bfqq_sync(bfqq))
2221 ++ return NULL;
2222 ++ if (BFQQ_SEEKY(bfqq))
2223 ++ return NULL;
2224 ++
2225 ++ /*
2226 ++ * Do not merge queues of different priority classes.
2227 ++ */
2228 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2229 ++ return NULL;
2230 ++
2231 ++ return bfqq;
2232 ++}
2233 ++
2234 ++/*
2235 ++ * If enough samples have been computed, return the current max budget
2236 ++ * stored in bfqd, which is dynamically updated according to the
2237 ++ * estimated disk peak rate; otherwise return the default max budget
2238 ++ */
2239 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2240 ++{
2241 ++ if (bfqd->budgets_assigned < 194)
2242 ++ return bfq_default_max_budget;
2243 ++ else
2244 ++ return bfqd->bfq_max_budget;
2245 ++}
2246 ++
2247 ++/*
2248 ++ * Return min budget, which is a fraction of the current or default
2249 ++ * max budget (trying with 1/32)
2250 ++ */
2251 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2252 ++{
2253 ++ if (bfqd->budgets_assigned < 194)
2254 ++ return bfq_default_max_budget / 32;
2255 ++ else
2256 ++ return bfqd->bfq_max_budget / 32;
2257 ++}
2258 ++
2259 ++/*
2260 ++ * Decides whether idling should be done for given device and
2261 ++ * given in-service queue.
2262 ++ */
2263 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2264 ++ struct bfq_queue *in_service_bfqq)
2265 ++{
2266 ++ if (in_service_bfqq == NULL)
2267 ++ return false;
2268 ++ /*
2269 ++ * If device is SSD it has no seek penalty, disable idling; but
2270 ++ * do so only if:
2271 ++ * - device does not support queuing, otherwise we still have
2272 ++ * a problem with sync vs async workloads;
2273 ++ * - the queue is not weight-raised, to preserve guarantees.
2274 ++ */
2275 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2276 ++ in_service_bfqq->raising_coeff == 1);
2277 ++}
2278 ++
2279 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2280 ++{
2281 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
2282 ++ struct bfq_io_cq *bic;
2283 ++ unsigned long sl;
2284 ++
2285 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2286 ++
2287 ++ /* Tasks have exited, don't wait. */
2288 ++ bic = bfqd->in_service_bic;
2289 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2290 ++ return;
2291 ++
2292 ++ bfq_mark_bfqq_wait_request(bfqq);
2293 ++
2294 ++ /*
2295 ++ * We don't want to idle for seeks, but we do want to allow
2296 ++ * fair distribution of slice time for a process doing back-to-back
2297 ++ * seeks. So allow a little bit of time for it to submit a new rq.
2298 ++ *
2299 ++ * To prevent processes with (partly) seeky workloads from
2300 ++ * being too ill-treated, grant them a small fraction of the
2301 ++ * assigned budget before reducing the waiting time to
2302 ++ * BFQ_MIN_TT. This happened to help reduce latency.
2303 ++ */
2304 ++ sl = bfqd->bfq_slice_idle;
2305 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2306 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2307 ++ bfqq->raising_coeff == 1)
2308 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2309 ++ else if (bfqq->raising_coeff > 1)
2310 ++ sl = sl * 3;
2311 ++ bfqd->last_idling_start = ktime_get();
2312 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2313 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
2314 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2315 ++}
2316 ++
2317 ++/*
2318 ++ * Set the maximum time for the in-service queue to consume its
2319 ++ * budget. This prevents seeky processes from lowering the disk
2320 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
2321 ++ */
2322 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2323 ++{
2324 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
2325 ++ unsigned int timeout_coeff;
2326 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2327 ++ timeout_coeff = 1;
2328 ++ else
2329 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2330 ++
2331 ++ bfqd->last_budget_start = ktime_get();
2332 ++
2333 ++ bfq_clear_bfqq_budget_new(bfqq);
2334 ++ bfqq->budget_timeout = jiffies +
2335 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2336 ++
2337 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2338 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2339 ++ timeout_coeff));
2340 ++}
2341 ++
2342 ++/*
2343 ++ * Move request from internal lists to the request queue dispatch list.
2344 ++ */
2345 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2346 ++{
2347 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2348 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2349 ++
2350 ++ bfq_remove_request(rq);
2351 ++ bfqq->dispatched++;
2352 ++ elv_dispatch_sort(q, rq);
2353 ++
2354 ++ if (bfq_bfqq_sync(bfqq))
2355 ++ bfqd->sync_flight++;
2356 ++}
2357 ++
2358 ++/*
2359 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
2360 ++ */
2361 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2362 ++{
2363 ++ struct request *rq = NULL;
2364 ++
2365 ++ if (bfq_bfqq_fifo_expire(bfqq))
2366 ++ return NULL;
2367 ++
2368 ++ bfq_mark_bfqq_fifo_expire(bfqq);
2369 ++
2370 ++ if (list_empty(&bfqq->fifo))
2371 ++ return NULL;
2372 ++
2373 ++ rq = rq_entry_fifo(bfqq->fifo.next);
2374 ++
2375 ++ if (time_before(jiffies, rq_fifo_time(rq)))
2376 ++ return NULL;
2377 ++
2378 ++ return rq;
2379 ++}
2380 ++
2381 ++/*
2382 ++ * Must be called with the queue_lock held.
2383 ++ */
2384 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
2385 ++{
2386 ++ int process_refs, io_refs;
2387 ++
2388 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2389 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2390 ++ BUG_ON(process_refs < 0);
2391 ++ return process_refs;
2392 ++}
2393 ++
2394 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2395 ++{
2396 ++ int process_refs, new_process_refs;
2397 ++ struct bfq_queue *__bfqq;
2398 ++
2399 ++ /*
2400 ++ * If there are no process references on the new_bfqq, then it is
2401 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2402 ++ * may have dropped their last reference (not just their last process
2403 ++ * reference).
2404 ++ */
2405 ++ if (!bfqq_process_refs(new_bfqq))
2406 ++ return;
2407 ++
2408 ++ /* Avoid a circular list and skip interim queue merges. */
2409 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
2410 ++ if (__bfqq == bfqq)
2411 ++ return;
2412 ++ new_bfqq = __bfqq;
2413 ++ }
2414 ++
2415 ++ process_refs = bfqq_process_refs(bfqq);
2416 ++ new_process_refs = bfqq_process_refs(new_bfqq);
2417 ++ /*
2418 ++ * If the process for the bfqq has gone away, there is no
2419 ++ * sense in merging the queues.
2420 ++ */
2421 ++ if (process_refs == 0 || new_process_refs == 0)
2422 ++ return;
2423 ++
2424 ++ /*
2425 ++ * Merge in the direction of the lesser amount of work.
2426 ++ */
2427 ++ if (new_process_refs >= process_refs) {
2428 ++ bfqq->new_bfqq = new_bfqq;
2429 ++ atomic_add(process_refs, &new_bfqq->ref);
2430 ++ } else {
2431 ++ new_bfqq->new_bfqq = bfqq;
2432 ++ atomic_add(new_process_refs, &bfqq->ref);
2433 ++ }
2434 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2435 ++ new_bfqq->pid);
2436 ++}
2437 ++
2438 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2439 ++{
2440 ++ struct bfq_entity *entity = &bfqq->entity;
2441 ++ return entity->budget - entity->service;
2442 ++}
2443 ++
2444 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2445 ++{
2446 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2447 ++
2448 ++ __bfq_bfqd_reset_in_service(bfqd);
2449 ++
2450 ++ /*
2451 ++ * If this bfqq is shared between multiple processes, check
2452 ++ * to make sure that those processes are still issuing I/Os
2453 ++ * within the mean seek distance. If not, it may be time to
2454 ++ * break the queues apart again.
2455 ++ */
2456 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2457 ++ bfq_mark_bfqq_split_coop(bfqq);
2458 ++
2459 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2460 ++ /*
2461 ++ * overloading budget_timeout field to store the time at
2462 ++ * which the queue is left with no backlog, used by
2463 ++ * the weight-raising mechanism
2464 ++ */
2465 ++ bfqq->budget_timeout = jiffies;
2466 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2467 ++ } else {
2468 ++ bfq_activate_bfqq(bfqd, bfqq);
2469 ++ /*
2470 ++ * Resort priority tree of potential close cooperators.
2471 ++ */
2472 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2473 ++ }
2474 ++}
2475 ++
2476 ++/**
2477 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2478 ++ * @bfqd: device data.
2479 ++ * @bfqq: queue to update.
2480 ++ * @reason: reason for expiration.
2481 ++ *
2482 ++ * Handle the feedback on @bfqq budget. See the body for detailed
2483 ++ * comments.
2484 ++ */
2485 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2486 ++ struct bfq_queue *bfqq,
2487 ++ enum bfqq_expiration reason)
2488 ++{
2489 ++ struct request *next_rq;
2490 ++ unsigned long budget, min_budget;
2491 ++
2492 ++ budget = bfqq->max_budget;
2493 ++ min_budget = bfq_min_budget(bfqd);
2494 ++
2495 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2496 ++
2497 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2498 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2499 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2500 ++ budget, bfq_min_budget(bfqd));
2501 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2502 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2503 ++
2504 ++ if (bfq_bfqq_sync(bfqq)) {
2505 ++ switch (reason) {
2506 ++ /*
2507 ++ * Caveat: in all the following cases we trade latency
2508 ++ * for throughput.
2509 ++ */
2510 ++ case BFQ_BFQQ_TOO_IDLE:
2511 ++ /*
2512 ++ * This is the only case where we may reduce
2513 ++ * the budget: if the process has no request
2514 ++ * still waiting for completion, then
2515 ++ * we assume (tentatively) that the timer has
2516 ++ * expired because the batch of requests of
2517 ++ * the process could have been served with a
2518 ++ * smaller budget. Hence, betting that the
2519 ++ * process will behave in the same way when it
2520 ++ * becomes backlogged again, we reduce its
2521 ++ * next budget. As long as we guess right,
2522 ++ * this budget cut reduces the latency
2523 ++ * experienced by the process.
2524 ++ *
2525 ++ * However, if there are still outstanding
2526 ++ * requests, then the process may have not yet
2527 ++ * issued its next request just because it is
2528 ++ * still waiting for the completion of some of
2529 ++ * the still outstanding ones. So in this
2530 ++ * subcase we do not reduce its budget, on the
2531 ++ * contrary we increase it to possibly boost
2532 ++ * the throughput, as discussed in the
2533 ++ * comments to the BUDGET_TIMEOUT case.
2534 ++ */
2535 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
2536 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2537 ++ else {
2538 ++ if (budget > 5 * min_budget)
2539 ++ budget -= 4 * min_budget;
2540 ++ else
2541 ++ budget = min_budget;
2542 ++ }
2543 ++ break;
2544 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
2545 ++ /*
2546 ++ * We double the budget here because: 1) it
2547 ++ * gives the chance to boost the throughput if
2548 ++ * this is not a seeky process (which may have
2549 ++ * bumped into this timeout because of, e.g.,
2550 ++ * ZBR), 2) together with charge_full_budget
2551 ++ * it helps give seeky processes higher
2552 ++ * timestamps, and hence be served less
2553 ++ * frequently.
2554 ++ */
2555 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2556 ++ break;
2557 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2558 ++ /*
2559 ++ * The process still has backlog, and did not
2560 ++ * let either the budget timeout or the disk
2561 ++ * idling timeout expire. Hence it is not
2562 ++ * seeky, has a short thinktime and may be
2563 ++ * happy with a higher budget too. So
2564 ++ * definitely increase the budget of this good
2565 ++ * candidate to boost the disk throughput.
2566 ++ */
2567 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
2568 ++ break;
2569 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
2570 ++ /*
2571 ++ * Leave the budget unchanged.
2572 ++ */
2573 ++ default:
2574 ++ return;
2575 ++ }
2576 ++ } else /* async queue */
2577 ++ /* async queues always get the maximum possible budget
2578 ++ * (their ability to dispatch is limited by
2579 ++ * @bfqd->bfq_max_budget_async_rq).
2580 ++ */
2581 ++ budget = bfqd->bfq_max_budget;
2582 ++
2583 ++ bfqq->max_budget = budget;
2584 ++
2585 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2586 ++ bfqq->max_budget > bfqd->bfq_max_budget)
2587 ++ bfqq->max_budget = bfqd->bfq_max_budget;
2588 ++
2589 ++ /*
2590 ++ * Make sure that we have enough budget for the next request.
2591 ++ * Since the finish time of the bfqq must be kept in sync with
2592 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
2593 ++ * update.
2594 ++ */
2595 ++ next_rq = bfqq->next_rq;
2596 ++ if (next_rq != NULL)
2597 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2598 ++ bfq_serv_to_charge(next_rq, bfqq));
2599 ++ else
2600 ++ bfqq->entity.budget = bfqq->max_budget;
2601 ++
2602 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2603 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2604 ++ bfqq->entity.budget);
2605 ++}
2606 ++
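For sync queues, the budget feedback above shrinks the budget only when the queue idled with nothing in flight, doubles it on idling-with-outstanding-requests or budget timeout, and quadruples it on budget exhaustion, always clamped by the min and max budgets. The standalone sketch below replays those rules with made-up MIN_BUDGET and MAX_BUDGET values standing in for bfq_min_budget() and the device max budget.

#include <stdio.h>

#define MIN_BUDGET  512    /* hypothetical bfq_min_budget() */
#define MAX_BUDGET 16384   /* hypothetical bfqd->bfq_max_budget */

enum reason { TOO_IDLE_DRAINED, TOO_IDLE_OUTSTANDING,
              BUDGET_TIMEOUT, BUDGET_EXHAUSTED };

/* Same feedback as the sync branch of the switch above, for one expiration. */
static unsigned long next_budget(unsigned long budget, enum reason r)
{
        switch (r) {
        case TOO_IDLE_DRAINED:      /* idled with nothing in flight: shrink */
                return budget > 5 * MIN_BUDGET ?
                       budget - 4 * MIN_BUDGET : MIN_BUDGET;
        case TOO_IDLE_OUTSTANDING:  /* idled, requests still in flight: grow */
        case BUDGET_TIMEOUT:        /* timed out: double, capped at the max */
                return budget * 2 > MAX_BUDGET ? MAX_BUDGET : budget * 2;
        case BUDGET_EXHAUSTED:      /* used it all quickly: quadruple, capped */
                return budget * 4 > MAX_BUDGET ? MAX_BUDGET : budget * 4;
        }
        return budget;
}

int main(void)
{
        printf("4096 after TOO_IDLE (drained): %lu\n",
               next_budget(4096, TOO_IDLE_DRAINED));   /* 2048 */
        printf("4096 after BUDGET_TIMEOUT:     %lu\n",
               next_budget(4096, BUDGET_TIMEOUT));     /* 8192 */
        printf("4096 after BUDGET_EXHAUSTED:   %lu\n",
               next_budget(4096, BUDGET_EXHAUSTED));   /* 16384 */
        return 0;
}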
2607 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2608 ++{
2609 ++ unsigned long max_budget;
2610 ++
2611 ++ /*
2612 ++ * The max_budget calculated when autotuning is equal to the
2613 ++ * number of sectors transferred in timeout_sync at the
2614 ++ * estimated peak rate.
2615 ++ */
2616 ++ max_budget = (unsigned long)(peak_rate * 1000 *
2617 ++ timeout >> BFQ_RATE_SHIFT);
2618 ++
2619 ++ return max_budget;
2620 ++}
2621 ++
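The autotuned maximum budget is just peak_rate (sectors per usec, kept in fixed point) times the sync timeout converted from ms to usec, shifted back down. The sketch below assumes a hypothetical RATE_SHIFT of 16 bits and a made-up 125 ms timeout; the real BFQ_RATE_SHIFT and bfq_timeout values are defined elsewhere in the patch.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for BFQ_RATE_SHIFT (fixed-point fraction bits). */
#define RATE_SHIFT 16

/* peak_rate is stored as (sectors per usec) << RATE_SHIFT; timeout is in ms,
 * so multiplying by 1000 converts it to usec before applying the rate. */
static unsigned long calc_max_budget(uint64_t peak_rate, uint64_t timeout_ms)
{
        return (unsigned long)(peak_rate * 1000 * timeout_ms >> RATE_SHIFT);
}

int main(void)
{
        /* ~100 MB/s is about 0.2 sectors/usec with 512 B sectors; in fixed
         * point that is roughly 0.2 * 2^RATE_SHIFT ~= 13107. */
        uint64_t peak_rate = 13107;
        uint64_t timeout_ms = 125;   /* made-up sync budget timeout */

        printf("max_budget = %lu sectors\n",
               calc_max_budget(peak_rate, timeout_ms));
        /* ~25000 sectors, i.e. ~12.5 MB: what 100 MB/s delivers in 125 ms */
        return 0;
}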
2622 ++/*
2623 ++ * In addition to updating the peak rate, checks whether the process
2624 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
2625 ++ * to the budget timeout, to reduce the amount of service provided to
2626 ++ * seeky processes, and hence reduce their chances to lower the
2627 ++ * throughput. See the code for more details.
2628 ++ */
2629 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2630 ++ int compensate, enum bfqq_expiration reason)
2631 ++{
2632 ++ u64 bw, usecs, expected, timeout;
2633 ++ ktime_t delta;
2634 ++ int update = 0;
2635 ++
2636 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2637 ++ return 0;
2638 ++
2639 ++ if (compensate)
2640 ++ delta = bfqd->last_idling_start;
2641 ++ else
2642 ++ delta = ktime_get();
2643 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
2644 ++ usecs = ktime_to_us(delta);
2645 ++
2646 ++ /* Don't trust short/unrealistic values. */
2647 ++ if (usecs < 100 || usecs >= LONG_MAX)
2648 ++ return 0;
2649 ++
2650 ++ /*
2651 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
2652 ++ * value to store the peak rate, in sectors per usec in fixed
2653 ++ * point math. We do so to have enough precision in the estimate
2654 ++ * and to avoid overflows.
2655 ++ */
2656 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2657 ++ do_div(bw, (unsigned long)usecs);
2658 ++
2659 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2660 ++
2661 ++ /*
2662 ++ * Use only long (> 20ms) intervals to filter out spikes for
2663 ++ * the peak rate estimation.
2664 ++ */
2665 ++ if (usecs > 20000) {
2666 ++ if (bw > bfqd->peak_rate ||
2667 ++ (!BFQQ_SEEKY(bfqq) &&
2668 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2669 ++ bfq_log(bfqd, "measured bw =%llu", bw);
2670 ++ /*
2671 ++ * To smooth oscillations use a low-pass filter with
2672 ++ * alpha=7/8, i.e.,
2673 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
2674 ++ */
2675 ++ do_div(bw, 8);
2676 ++ if (bw == 0)
2677 ++ return 0;
2678 ++ bfqd->peak_rate *= 7;
2679 ++ do_div(bfqd->peak_rate, 8);
2680 ++ bfqd->peak_rate += bw;
2681 ++ update = 1;
2682 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2683 ++ }
2684 ++
2685 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2686 ++
2687 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2688 ++ bfqd->peak_rate_samples++;
2689 ++
2690 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2691 ++ update && bfqd->bfq_user_max_budget == 0) {
2692 ++ bfqd->bfq_max_budget =
2693 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2694 ++ bfq_log(bfqd, "new max_budget=%lu",
2695 ++ bfqd->bfq_max_budget);
2696 ++ }
2697 ++ }
2698 ++
2699 ++ /*
2700 ++ * If the process has been served for too short a time
2701 ++ * interval to let its possible sequential accesses prevail over
2702 ++ * the initial seek time needed to move the disk head to the
2703 ++ * first sector it requested, then give the process a chance
2704 ++ * and for the moment return false.
2705 ++ */
2706 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2707 ++ return 0;
2708 ++
2709 ++ /*
2710 ++ * A process is considered ``slow'' (i.e., seeky, so that we
2711 ++ * cannot treat it fairly in the service domain, as it would
2712 ++ * slow down the other processes too much) if, when a slice
2713 ++ * ends for whatever reason, it has received service at a
2714 ++ * rate that would not be high enough to complete the budget
2715 ++ * before the budget timeout expiration.
2716 ++ */
2717 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2718 ++
2719 ++ /*
2720 ++ * Caveat: processes doing IO in the slower disk zones will
2721 ++ * tend to be slow(er) even if not seeky. And the estimated
2722 ++ * peak rate will actually be an average over the disk
2723 ++ * surface. Hence, to not be too harsh with unlucky processes,
2724 ++ * we keep a budget/3 margin of safety before declaring a
2725 ++ * process slow.
2726 ++ */
2727 ++ return expected > (4 * bfqq->entity.budget) / 3;
2728 ++}
2729 ++
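The peak-rate update above is a classic integer low-pass filter, new_rate = (7/8) * old_rate + (1/8) * measured_bw, so a single spiky sample cannot drag the estimate around. A small standalone sketch of the same arithmetic, with arbitrary numbers:

#include <stdio.h>
#include <stdint.h>

/* Same integer filter as above; when bw / 8 rounds down to 0 the code above
 * skips the update, so the sketch leaves the old rate untouched too. */
static uint64_t lowpass(uint64_t old_rate, uint64_t bw)
{
        bw /= 8;
        if (bw == 0)
                return old_rate;
        return old_rate * 7 / 8 + bw;
}

int main(void)
{
        uint64_t rate = 10000;   /* hypothetical current peak_rate */

        /* Feed a series of higher measurements: the estimate moves towards
         * them smoothly instead of jumping on the first sample. */
        for (int i = 0; i < 5; i++) {
                rate = lowpass(rate, 20000);
                printf("sample %d: rate = %llu\n",
                       i + 1, (unsigned long long)rate);
        }
        return 0;
}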
2730 ++/*
2731 ++ * To be deemed as soft real-time, an application must meet two requirements.
2732 ++ * The first is that the application must not require an average bandwidth
2733 ++ * higher than the approximate bandwidth required to play back or record a
2734 ++ * compressed high-definition video.
2735 ++ * The next function is invoked on the completion of the last request of a
2736 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
2737 ++ * that, if the next request of the application does not arrive before
2738 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
2739 ++ *
2740 ++ * The second requirement is that the request pattern of the application is
2741 ++ * isochronous, i.e., that, after issuing a request or a batch of requests, the
2742 ++ * application stops for a while, then issues a new batch, and so on. For this
2743 ++ * reason the next function is invoked to compute soft_rt_next_start only for
2744 ++ * applications that meet this requirement, whereas soft_rt_next_start is set
2745 ++ * to infinity for applications that do not.
2746 ++ *
2747 ++ * Unfortunately, even a greedy application may happen to behave in an
2748 ++ * isochronous way if several processes are competing for the CPUs. In fact,
2749 ++ * in this scenario the application stops issuing requests while the CPUs are
2750 ++ * busy serving other processes, then restarts, then stops again for a while,
2751 ++ * and so on. In addition, if the disk achieves a low enough throughput with
2752 ++ * the request pattern issued by the application, then the above bandwidth
2753 ++ * requirement may happen to be met too. To prevent such a greedy application
2754 ++ * from being deemed soft real-time, a further rule is used in the computation
2755 ++ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
2756 ++ * time plus the maximum time for which the arrival of a request is waited
2757 ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
2758 ++ * filters out greedy applications, as the latter issue instead their next
2759 ++ * request as soon as possible after the last one has been completed (in
2760 ++ * contrast, when a batch of requests is completed, a soft real-time
2761 ++ * application spends some time processing data).
2762 ++ *
2763 ++ * Actually, the last filter may easily generate false positives if: only
2764 ++ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
2765 ++ * both of the following cases occur:
2766 ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
2767 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2768 ++ * HZ=100.
2769 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2770 ++ * for a while, then suddenly 'jump' by several units to recover the lost
2771 ++ * increments. This seems to happen, e.g., inside virtual machines.
2772 ++ * To address this issue, we do not use as a reference time interval just
2773 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2774 ++ * particular we add the minimum number of jiffies for which the filter seems
2775 ++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
2776 ++ */
2777 ++static inline u64 bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2778 ++ struct bfq_queue *bfqq)
2779 ++{
2780 ++ return max(bfqq->last_idle_bklogged +
2781 ++ HZ * bfqq->service_from_backlogged /
2782 ++ bfqd->bfq_raising_max_softrt_rate,
2783 ++ (u64)jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2784 ++}
2785 ++
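Putting the two requirements together, soft_rt_next_start is the later of (a) the instant by which serving service_from_backlogged sectors since last_idle_bklogged stays within bfq_raising_max_softrt_rate, and (b) jiffies plus the idle slice plus a few jiffies of slack. The sketch below plugs in made-up numbers, including a hypothetical HZ, to show the bandwidth term dominating.

#include <stdio.h>
#include <stdint.h>

#define HZ 250   /* hypothetical tick rate */

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

/* Sketch of the rule above: the first term enforces the bandwidth cap
 * (service in sectors divided by a rate in sectors/sec gives seconds,
 * times HZ gives jiffies), the second filters out greedy applications by
 * pushing next_start past the idle window plus a little slack. */
static uint64_t softrt_next_start(uint64_t last_idle_bklogged,
                                  uint64_t service_from_backlogged,
                                  uint64_t max_softrt_rate,
                                  uint64_t jiffies_now, uint64_t slice_idle)
{
        return max_u64(last_idle_bklogged +
                       HZ * service_from_backlogged / max_softrt_rate,
                       jiffies_now + slice_idle + 4);
}

int main(void)
{
        /* 7000 sectors served since the queue was last idle and backlogged,
         * against a made-up cap of 7000 sectors/sec: the next request must
         * not arrive before one second (HZ jiffies) after that instant. */
        printf("next_start = %llu\n",
               (unsigned long long)softrt_next_start(10000, 7000, 7000,
                                                     10100, 2));
        return 0;
}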
2786 ++/**
2787 ++ * bfq_bfqq_expire - expire a queue.
2788 ++ * @bfqd: device owning the queue.
2789 ++ * @bfqq: the queue to expire.
2790 ++ * @compensate: if true, compensate for the time spent idling.
2791 ++ * @reason: the reason causing the expiration.
2792 ++ *
2793 ++ *
2794 ++ * If the process associated with the queue is slow (i.e., seeky), or in
2795 ++ * case of budget timeout, or, finally, if it is async, we
2796 ++ * artificially charge it an entire budget (independently of the
2797 ++ * actual service it received). As a consequence, the queue will get
2798 ++ * higher timestamps than the correct ones upon reactivation, and
2799 ++ * hence it will be rescheduled as if it had received more service
2800 ++ * than what it actually received. In the end, this class of processes
2801 ++ * will receive less service in proportion to how slowly they consume
2802 ++ * their budgets (and hence how seriously they tend to lower the
2803 ++ * throughput).
2804 ++ *
2805 ++ * In contrast, when a queue expires because it has been idling for
2806 ++ * too long or because it exhausted its budget, we do not touch the
2807 ++ * amount of service it has received. Hence when the queue will be
2808 ++ * reactivated and its timestamps updated, the latter will be in sync
2809 ++ * with the actual service received by the queue until expiration.
2810 ++ *
2811 ++ * Charging a full budget to the first type of queues and the exact
2812 ++ * service to the others has the effect of using the WF2Q+ policy to
2813 ++ * schedule the former on a timeslice basis, without violating the
2814 ++ * service domain guarantees of the latter.
2815 ++ */
2816 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
2817 ++ struct bfq_queue *bfqq,
2818 ++ int compensate,
2819 ++ enum bfqq_expiration reason)
2820 ++{
2821 ++ int slow;
2822 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2823 ++
2824 ++ /* Update disk peak rate for autotuning and check whether the
2825 ++ * process is slow (see bfq_update_peak_rate).
2826 ++ */
2827 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2828 ++
2829 ++ /*
2830 ++ * As explained above, 'punish' slow (i.e., seeky), timed-out
2831 ++ * and async queues, to favor sequential sync workloads.
2832 ++ *
2833 ++ * Processes doing IO in the slower disk zones will tend to be
2834 ++ * slow(er) even if not seeky. Hence, since the estimated peak
2835 ++ * rate is actually an average over the disk surface, these
2836 ++ * processes may timeout just for bad luck. To avoid punishing
2837 ++ * them we do not charge a full budget to a process that
2838 ++ * succeeded in consuming at least 2/3 of its budget.
2839 ++ */
2840 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2841 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2842 ++ bfq_bfqq_charge_full_budget(bfqq);
2843 ++
2844 ++ bfqq->service_from_backlogged += bfqq->entity.service;
2845 ++
2846 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2847 ++ bfqq->last_rais_start_finish = jiffies;
2848 ++
2849 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2850 ++ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
2851 ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) {
2852 ++ /*
2853 ++ * If we get here, then the request pattern is
2854 ++ * isochronous (see the comments to the function
2855 ++ * bfq_bfqq_softrt_next_start()). However, if the
2856 ++ * queue still has in-flight requests, then it is
2857 ++ * better to postpone the computation of next_start
2858 ++ * to the next request completion. In fact, if we
2859 ++ * computed it now, then the application might pass
2860 ++ * the greedy-application filter improperly, because
2861 ++ * the arrival time of its next request may happen to be
2862 ++ * later than (jiffies + bfqq->bfqd->bfq_slice_idle)
2863 ++ * not because the application is truly soft real-
2864 ++ * time, but just because the application is currently
2865 ++ * waiting for the completion of some request before
2866 ++ * issuing, as quickly as possible, its next request.
2867 ++ */
2868 ++ if (bfqq->dispatched > 0) {
2869 ++ bfqq->soft_rt_next_start = -1;
2870 ++ bfq_mark_bfqq_softrt_update(bfqq);
2871 ++ } else
2872 ++ bfqq->soft_rt_next_start =
2873 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
2874 ++ } else
2875 ++ bfqq->soft_rt_next_start = -1; /* infinity */
2876 ++ }
2877 ++
2878 ++ bfq_log_bfqq(bfqd, bfqq,
2879 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2880 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2881 ++
2882 ++ /* Increase, decrease or leave budget unchanged according to reason */
2883 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2884 ++ __bfq_bfqq_expire(bfqd, bfqq);
2885 ++}
2886 ++
2887 ++/*
2888 ++ * Budget timeout is not implemented through a dedicated timer, but
2889 ++ * just checked on request arrivals and completions, as well as on
2890 ++ * idle timer expirations.
2891 ++ */
2892 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2893 ++{
2894 ++ if (bfq_bfqq_budget_new(bfqq))
2895 ++ return 0;
2896 ++
2897 ++ if (time_before(jiffies, bfqq->budget_timeout))
2898 ++ return 0;
2899 ++
2900 ++ return 1;
2901 ++}
2902 ++
2903 ++/*
2904 ++ * If we expire a queue that is waiting for the arrival of a new
2905 ++ * request, we may prevent the fictitious timestamp backshifting that
2906 ++ * allows the guarantees of the queue to be preserved (see [1] for
2907 ++ * this tricky aspect). Hence we return true only if this condition
2908 ++ * does not hold, or if the queue is slow enough to deserve only to be
2909 ++ * kicked off for preserving a high throughput.
2910 ++ */
2911 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2912 ++{
2913 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
2914 ++ "may_budget_timeout: wr %d left %d timeout %d",
2915 ++ bfq_bfqq_wait_request(bfqq),
2916 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2917 ++ bfq_bfqq_budget_timeout(bfqq));
2918 ++
2919 ++ return (!bfq_bfqq_wait_request(bfqq) ||
2920 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2921 ++ &&
2922 ++ bfq_bfqq_budget_timeout(bfqq);
2923 ++}
2924 ++
2925 ++/*
2926 ++ * For weight-raised queues issuing sync requests, idling is always performed,
2927 ++ * as this is instrumental in guaranteeing a high fraction of the throughput
2928 ++ * to these queues, and hence in guaranteeing a lower latency for their
2929 ++ * requests. See [1] for details.
2930 ++ *
2931 ++ * For non-weight-raised queues, idling is instead disabled if the device is
2932 ++ * NCQ-enabled and non-rotational, as this boosts the throughput on such
2933 ++ * devices.
2934 ++ */
2935 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2936 ++{
2937 ++ struct bfq_data *bfqd = bfqq->bfqd;
2938 ++
2939 ++ return bfq_bfqq_sync(bfqq) && (
2940 ++ bfqq->raising_coeff > 1 ||
2941 ++ (bfq_bfqq_idle_window(bfqq) &&
2942 ++ !(bfqd->hw_tag &&
2943 ++ (blk_queue_nonrot(bfqd->queue) ||
2944 ++ /*
2945 ++ * If there are weight-raised busy queues, then do not idle
2946 ++ * the disk for a sync non-weight-raised queue, and hence
2947 ++ * expire the queue immediately if empty. Combined with the
2948 ++ * timestamping rules of BFQ (see [1] for details), this
2949 ++ * causes sync non-weight-raised queues to get a lower
2950 ++ * fraction of the disk throughput, and hence reduces the rate
2951 ++ * at which the processes associated with these queues ask for
2952 ++ * requests from the request pool.
2953 ++ *
2954 ++ * This is beneficial for weight-raised processes, when the
2955 ++ * system operates in request-pool saturation conditions
2956 ++ * (e.g., in the presence of write hogs). In fact, if
2957 ++ * non-weight-raised processes ask for requests at a lower
2958 ++ * rate, then weight-raised processes have a higher
2959 ++ * probability to get a request from the pool immediately
2960 ++ * (or at least soon) when they need one. Hence they have a
2961 ++ * higher probability to actually get a fraction of the disk
2962 ++ * throughput proportional to their high weight. This is
2963 ++ * especially true with NCQ-enabled drives, which enqueue
2964 ++ * several requests in advance and further reorder
2965 ++ * internally-queued requests.
2966 ++ *
2967 ++ * Mistreating non-weight-raised queues in the above-described
2968 ++ * way, when there are busy weight-raised queues, seems to
2969 ++ * mitigate starvation problems in the presence of heavy write
2970 ++ * workloads and NCQ, and hence to guarantee a higher
2971 ++ * application and system responsiveness in these hostile
2972 ++ * scenarios.
2973 ++ */
2974 ++ bfqd->raised_busy_queues > 0)
2975 ++ )
2976 ++ )
2977 ++ );
2978 ++}
2979 ++
2980 ++/*
2981 ++ * If the in-service queue is empty, but it is sync and either of the following
2982 ++ * conditions holds, then: 1) the queue must remain in service and cannot be
2983 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
2984 ++ * of a new request for the queue. The conditions are:
2985 ++ * - the device is rotational and not performing NCQ, and the queue has its
2986 ++ * idle window set (in this case, waiting for a new request for the queue
2987 ++ * is likely to boost the disk throughput);
2988 ++ * - the queue is weight-raised (waiting for the request is necessary to
2989 ++ * provide the queue with fairness and latency guarantees, see [1] for
2990 ++ * details).
2991 ++ */
2992 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
2993 ++{
2994 ++ struct bfq_data *bfqd = bfqq->bfqd;
2995 ++
2996 ++ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
2997 ++ bfq_bfqq_must_not_expire(bfqq) &&
2998 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
2999 ++}
3000 ++
3001 ++/*
3002 ++ * Select a queue for service. If we have a current queue in service,
3003 ++ * check whether to continue servicing it, or retrieve and set a new one.
3004 ++ */
3005 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3006 ++{
3007 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
3008 ++ struct request *next_rq;
3009 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3010 ++
3011 ++ bfqq = bfqd->in_service_queue;
3012 ++ if (bfqq == NULL)
3013 ++ goto new_queue;
3014 ++
3015 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3016 ++
3017 ++ /*
3018 ++ * If another queue has a request waiting within our mean seek
3019 ++ * distance, let it run. The expire code will check for close
3020 ++ * cooperators and put the close queue at the front of the
3021 ++ * service tree. If possible, merge the expiring queue with the
3022 ++ * new bfqq.
3023 ++ */
3024 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
3025 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
3026 ++ bfq_setup_merge(bfqq, new_bfqq);
3027 ++
3028 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
3029 ++ !timer_pending(&bfqd->idle_slice_timer) &&
3030 ++ !bfq_bfqq_must_idle(bfqq))
3031 ++ goto expire;
3032 ++
3033 ++ next_rq = bfqq->next_rq;
3034 ++ /*
3035 ++ * If bfqq has requests queued and it has enough budget left to
3036 ++ * serve them, keep the queue, otherwise expire it.
3037 ++ */
3038 ++ if (next_rq != NULL) {
3039 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
3040 ++ bfq_bfqq_budget_left(bfqq)) {
3041 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
3042 ++ goto expire;
3043 ++ } else {
3044 ++ /*
3045 ++ * The idle timer may be pending because we may not
3046 ++ * disable disk idling even when a new request arrives
3047 ++ */
3048 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
3049 ++ /*
3050 ++ * If we get here: 1) at least a new request
3051 ++ * has arrived but we have not disabled the
3052 ++ * timer because the request was too small,
3053 ++ * 2) then the block layer has unplugged the
3054 ++ * device, causing the dispatch to be invoked.
3055 ++ *
3056 ++ * Since the device is unplugged, now the
3057 ++ * requests are probably large enough to
3058 ++ * provide a reasonable throughput.
3059 ++ * So we disable idling.
3060 ++ */
3061 ++ bfq_clear_bfqq_wait_request(bfqq);
3062 ++ del_timer(&bfqd->idle_slice_timer);
3063 ++ }
3064 ++ if (new_bfqq == NULL)
3065 ++ goto keep_queue;
3066 ++ else
3067 ++ goto expire;
3068 ++ }
3069 ++ }
3070 ++
3071 ++ /*
3072 ++ * No requests pending. If the in-service queue has no cooperator and
3073 ++ * still has requests in flight (possibly waiting for a completion)
3074 ++ * or is idling for a new request, then keep it.
3075 ++ */
3076 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
3077 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
3078 ++ bfqq = NULL;
3079 ++ goto keep_queue;
3080 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
3081 ++ /*
3082 ++ * Expiring the queue because there is a close cooperator,
3083 ++ * cancel timer.
3084 ++ */
3085 ++ bfq_clear_bfqq_wait_request(bfqq);
3086 ++ del_timer(&bfqd->idle_slice_timer);
3087 ++ }
3088 ++
3089 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
3090 ++expire:
3091 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
3092 ++new_queue:
3093 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
3094 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
3095 ++ bfqq != NULL ? bfqq->pid : 0);
3096 ++keep_queue:
3097 ++ return bfqq;
3098 ++}
3099 ++
3100 ++static void bfq_update_raising_data(struct bfq_data *bfqd,
3101 ++ struct bfq_queue *bfqq)
3102 ++{
3103 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
3104 ++ struct bfq_entity *entity = &bfqq->entity;
3105 ++
3106 ++ bfq_log_bfqq(bfqd, bfqq,
3107 ++ "raising period dur %u/%u msec, "
3108 ++ "old raising coeff %u, w %d(%d)",
3109 ++ jiffies_to_msecs(jiffies -
3110 ++ bfqq->last_rais_start_finish),
3111 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
3112 ++ bfqq->raising_coeff,
3113 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
3114 ++
3115 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
3116 ++ entity->orig_weight * bfqq->raising_coeff);
3117 ++ if (entity->ioprio_changed)
3118 ++ bfq_log_bfqq(bfqd, bfqq,
3119 ++ "WARN: pending prio change");
3120 ++ /*
3121 ++ * If too much time has elapsed from the beginning
3122 ++ * of this weight-raising, stop it.
3123 ++ */
3124 ++ if (jiffies - bfqq->last_rais_start_finish >
3125 ++ bfqq->raising_cur_max_time) {
3126 ++ bfqq->last_rais_start_finish = jiffies;
3127 ++ bfq_log_bfqq(bfqd, bfqq,
3128 ++ "wrais ending at %llu msec, "
3129 ++ "rais_max_time %u",
3130 ++ bfqq->last_rais_start_finish,
3131 ++ jiffies_to_msecs(bfqq->
3132 ++ raising_cur_max_time));
3133 ++ bfq_bfqq_end_raising(bfqq);
3134 ++ __bfq_entity_update_weight_prio(
3135 ++ bfq_entity_service_tree(entity),
3136 ++ entity);
3137 ++ }
3138 ++ }
3139 ++}
3140 ++
3141 ++/*
3142 ++ * Dispatch one request from bfqq, moving it to the request queue
3143 ++ * dispatch list.
3144 ++ */
3145 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
3146 ++ struct bfq_queue *bfqq)
3147 ++{
3148 ++ int dispatched = 0;
3149 ++ struct request *rq;
3150 ++ unsigned long service_to_charge;
3151 ++
3152 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
3153 ++
3154 ++ /* Follow expired path, else get first next available. */
3155 ++ rq = bfq_check_fifo(bfqq);
3156 ++ if (rq == NULL)
3157 ++ rq = bfqq->next_rq;
3158 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
3159 ++
3160 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
3161 ++ /*
3162 ++ * This may happen if the next rq is chosen
3163 ++ * in fifo order instead of sector order.
3164 ++ * The budget is properly dimensioned
3165 ++ * to be always sufficient to serve the next request
3166 ++ * only if it is chosen in sector order. The reason is
3167 ++ * that it would be quite inefficient and of little use
3168 ++ * to always make sure that the budget is large enough
3169 ++ * to serve even the possible next rq in fifo order.
3170 ++ * In fact, requests are seldom served in fifo order.
3171 ++ *
3172 ++ * Expire the queue for budget exhaustion, and
3173 ++ * make sure that the next act_budget is enough
3174 ++ * to serve the next request, even if it comes
3175 ++ * from the fifo expired path.
3176 ++ */
3177 ++ bfqq->next_rq = rq;
3178 ++ /*
3179 ++ * Since this dispatch failed, make sure that
3180 ++ * a new one will be performed
3181 ++ */
3182 ++ if (!bfqd->rq_in_driver)
3183 ++ bfq_schedule_dispatch(bfqd);
3184 ++ goto expire;
3185 ++ }
3186 ++
3187 ++ /* Finally, insert request into driver dispatch list. */
3188 ++ bfq_bfqq_served(bfqq, service_to_charge);
3189 ++ bfq_dispatch_insert(bfqd->queue, rq);
3190 ++
3191 ++ bfq_update_raising_data(bfqd, bfqq);
3192 ++
3193 ++ bfq_log_bfqq(bfqd, bfqq,
3194 ++ "dispatched %u sec req (%llu), budg left %lu",
3195 ++ blk_rq_sectors(rq),
3196 ++ (long long unsigned)blk_rq_pos(rq),
3197 ++ bfq_bfqq_budget_left(bfqq));
3198 ++
3199 ++ dispatched++;
3200 ++
3201 ++ if (bfqd->in_service_bic == NULL) {
3202 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3203 ++ bfqd->in_service_bic = RQ_BIC(rq);
3204 ++ }
3205 ++
3206 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3207 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3208 ++ bfq_class_idle(bfqq)))
3209 ++ goto expire;
3210 ++
3211 ++ return dispatched;
3212 ++
3213 ++expire:
3214 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3215 ++ return dispatched;
3216 ++}
3217 ++
3218 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3219 ++{
3220 ++ int dispatched = 0;
3221 ++
3222 ++ while (bfqq->next_rq != NULL) {
3223 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3224 ++ dispatched++;
3225 ++ }
3226 ++
3227 ++ BUG_ON(!list_empty(&bfqq->fifo));
3228 ++ return dispatched;
3229 ++}
3230 ++
3231 ++/*
3232 ++ * Drain our current requests. Used for barriers and when switching
3233 ++ * io schedulers on-the-fly.
3234 ++ */
3235 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
3236 ++{
3237 ++ struct bfq_queue *bfqq, *n;
3238 ++ struct bfq_service_tree *st;
3239 ++ int dispatched = 0;
3240 ++
3241 ++ bfqq = bfqd->in_service_queue;
3242 ++ if (bfqq != NULL)
3243 ++ __bfq_bfqq_expire(bfqd, bfqq);
3244 ++
3245 ++ /*
3246 ++ * Loop through classes, and be careful to leave the scheduler
3247 ++ * in a consistent state, as feedback mechanisms and vtime
3248 ++ * updates cannot be disabled during the process.
3249 ++ */
3250 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3251 ++ st = bfq_entity_service_tree(&bfqq->entity);
3252 ++
3253 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3254 ++ bfqq->max_budget = bfq_max_budget(bfqd);
3255 ++
3256 ++ bfq_forget_idle(st);
3257 ++ }
3258 ++
3259 ++ BUG_ON(bfqd->busy_queues != 0);
3260 ++
3261 ++ return dispatched;
3262 ++}
3263 ++
3264 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
3265 ++{
3266 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3267 ++ struct bfq_queue *bfqq;
3268 ++ int max_dispatch;
3269 ++
3270 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3271 ++ if (bfqd->busy_queues == 0)
3272 ++ return 0;
3273 ++
3274 ++ if (unlikely(force))
3275 ++ return bfq_forced_dispatch(bfqd);
3276 ++
3277 ++ bfqq = bfq_select_queue(bfqd);
3278 ++ if (bfqq == NULL)
3279 ++ return 0;
3280 ++
3281 ++ max_dispatch = bfqd->bfq_quantum;
3282 ++ if (bfq_class_idle(bfqq))
3283 ++ max_dispatch = 1;
3284 ++
3285 ++ if (!bfq_bfqq_sync(bfqq))
3286 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
3287 ++
3288 ++ if (bfqq->dispatched >= max_dispatch) {
3289 ++ if (bfqd->busy_queues > 1)
3290 ++ return 0;
3291 ++ if (bfqq->dispatched >= 4 * max_dispatch)
3292 ++ return 0;
3293 ++ }
3294 ++
3295 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3296 ++ return 0;
3297 ++
3298 ++ bfq_clear_bfqq_wait_request(bfqq);
3299 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3300 ++
3301 ++ if (!bfq_dispatch_request(bfqd, bfqq))
3302 ++ return 0;
3303 ++
3304 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
3305 ++ bfqq->pid, max_dispatch);
3306 ++
3307 ++ return 1;
3308 ++}
3309 ++
3310 ++/*
3311 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
3312 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
3313 ++ *
3314 ++ * Queue lock must be held here.
3315 ++ */
3316 ++static void bfq_put_queue(struct bfq_queue *bfqq)
3317 ++{
3318 ++ struct bfq_data *bfqd = bfqq->bfqd;
3319 ++
3320 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3321 ++
3322 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3323 ++ atomic_read(&bfqq->ref));
3324 ++ if (!atomic_dec_and_test(&bfqq->ref))
3325 ++ return;
3326 ++
3327 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3328 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3329 ++ BUG_ON(bfqq->entity.tree != NULL);
3330 ++ BUG_ON(bfq_bfqq_busy(bfqq));
3331 ++ BUG_ON(bfqd->in_service_queue == bfqq);
3332 ++
3333 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3334 ++
3335 ++ kmem_cache_free(bfq_pool, bfqq);
3336 ++}
3337 ++
3338 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
3339 ++{
3340 ++ struct bfq_queue *__bfqq, *next;
3341 ++
3342 ++ /*
3343 ++ * If this queue was scheduled to merge with another queue, be
3344 ++ * sure to drop the reference taken on that queue (and others in
3345 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3346 ++ */
3347 ++ __bfqq = bfqq->new_bfqq;
3348 ++ while (__bfqq) {
3349 ++ if (__bfqq == bfqq) {
3350 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
3351 ++ break;
3352 ++ }
3353 ++ next = __bfqq->new_bfqq;
3354 ++ bfq_put_queue(__bfqq);
3355 ++ __bfqq = next;
3356 ++ }
3357 ++}
3358 ++
3359 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3360 ++{
3361 ++ if (bfqq == bfqd->in_service_queue) {
3362 ++ __bfq_bfqq_expire(bfqd, bfqq);
3363 ++ bfq_schedule_dispatch(bfqd);
3364 ++ }
3365 ++
3366 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3367 ++ atomic_read(&bfqq->ref));
3368 ++
3369 ++ bfq_put_cooperator(bfqq);
3370 ++
3371 ++ bfq_put_queue(bfqq);
3372 ++}
3373 ++
3374 ++static void bfq_init_icq(struct io_cq *icq)
3375 ++{
3376 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3377 ++
3378 ++ bic->ttime.last_end_request = jiffies;
3379 ++}
3380 ++
3381 ++static void bfq_exit_icq(struct io_cq *icq)
3382 ++{
3383 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3384 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
3385 ++
3386 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
3387 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3388 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
3389 ++ }
3390 ++
3391 ++ if (bic->bfqq[BLK_RW_SYNC]) {
3392 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3393 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
3394 ++ }
3395 ++}
3396 ++
3397 ++/*
3398 ++ * Update the entity prio values; note that the new values will not
3399 ++ * be used until the next (re)activation.
3400 ++ */
3401 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3402 ++{
3403 ++ struct task_struct *tsk = current;
3404 ++ int ioprio_class;
3405 ++
3406 ++ if (!bfq_bfqq_prio_changed(bfqq))
3407 ++ return;
3408 ++
3409 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3410 ++ switch (ioprio_class) {
3411 ++ default:
3412 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3413 ++ "bfq: bad prio %x\n", ioprio_class);
3414 ++ case IOPRIO_CLASS_NONE:
3415 ++ /*
3416 ++ * No prio set, inherit CPU scheduling settings.
3417 ++ */
3418 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3419 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3420 ++ break;
3421 ++ case IOPRIO_CLASS_RT:
3422 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3423 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3424 ++ break;
3425 ++ case IOPRIO_CLASS_BE:
3426 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3427 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3428 ++ break;
3429 ++ case IOPRIO_CLASS_IDLE:
3430 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3431 ++ bfqq->entity.new_ioprio = 7;
3432 ++ bfq_clear_bfqq_idle_window(bfqq);
3433 ++ break;
3434 ++ }
3435 ++
3436 ++ bfqq->entity.ioprio_changed = 1;
3437 ++
3438 ++ /*
3439 ++ * Keep track of original prio settings in case we have to temporarily
3440 ++ * elevate the priority of this queue.
3441 ++ */
3442 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3443 ++ bfq_clear_bfqq_prio_changed(bfqq);
3444 ++}
3445 ++
3446 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3447 ++{
3448 ++ struct bfq_data *bfqd;
3449 ++ struct bfq_queue *bfqq, *new_bfqq;
3450 ++ struct bfq_group *bfqg;
3451 ++ unsigned long uninitialized_var(flags);
3452 ++ int ioprio = bic->icq.ioc->ioprio;
3453 ++
3454 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3455 ++ &flags);
3456 ++ /*
3457 ++ * This condition may trigger on a newly created bic; be sure to drop
3458 ++ * the lock before returning.
3459 ++ */
3460 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3461 ++ goto out;
3462 ++
3463 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
3464 ++ if (bfqq != NULL) {
3465 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3466 ++ sched_data);
3467 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3468 ++ GFP_ATOMIC);
3469 ++ if (new_bfqq != NULL) {
3470 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3471 ++ bfq_log_bfqq(bfqd, bfqq,
3472 ++ "changed_ioprio: bfqq %p %d",
3473 ++ bfqq, atomic_read(&bfqq->ref));
3474 ++ bfq_put_queue(bfqq);
3475 ++ }
3476 ++ }
3477 ++
3478 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
3479 ++ if (bfqq != NULL)
3480 ++ bfq_mark_bfqq_prio_changed(bfqq);
3481 ++
3482 ++ bic->ioprio = ioprio;
3483 ++
3484 ++out:
3485 ++ bfq_put_bfqd_unlock(bfqd, &flags);
3486 ++}
3487 ++
3488 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3489 ++ pid_t pid, int is_sync)
3490 ++{
3491 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3492 ++ INIT_LIST_HEAD(&bfqq->fifo);
3493 ++
3494 ++ atomic_set(&bfqq->ref, 0);
3495 ++ bfqq->bfqd = bfqd;
3496 ++
3497 ++ bfq_mark_bfqq_prio_changed(bfqq);
3498 ++
3499 ++ if (is_sync) {
3500 ++ if (!bfq_class_idle(bfqq))
3501 ++ bfq_mark_bfqq_idle_window(bfqq);
3502 ++ bfq_mark_bfqq_sync(bfqq);
3503 ++ }
3504 ++
3505 ++ /* Tentative initial value to trade off between throughput and latency */
3506 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3507 ++ bfqq->pid = pid;
3508 ++
3509 ++ bfqq->raising_coeff = 1;
3510 ++ bfqq->last_rais_start_finish = 0;
3511 ++ bfqq->soft_rt_next_start = -1;
3512 ++}
3513 ++
3514 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3515 ++ struct bfq_group *bfqg,
3516 ++ int is_sync,
3517 ++ struct bfq_io_cq *bic,
3518 ++ gfp_t gfp_mask)
3519 ++{
3520 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
3521 ++
3522 ++retry:
3523 ++ /* bic always exists here */
3524 ++ bfqq = bic_to_bfqq(bic, is_sync);
3525 ++
3526 ++ /*
3527 ++ * Always try a new alloc if we fall back to the OOM bfqq
3528 ++ * originally, since it should just be a temporary situation.
3529 ++ */
3530 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3531 ++ bfqq = NULL;
3532 ++ if (new_bfqq != NULL) {
3533 ++ bfqq = new_bfqq;
3534 ++ new_bfqq = NULL;
3535 ++ } else if (gfp_mask & __GFP_WAIT) {
3536 ++ spin_unlock_irq(bfqd->queue->queue_lock);
3537 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3538 ++ gfp_mask | __GFP_ZERO,
3539 ++ bfqd->queue->node);
3540 ++ spin_lock_irq(bfqd->queue->queue_lock);
3541 ++ if (new_bfqq != NULL)
3542 ++ goto retry;
3543 ++ } else {
3544 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
3545 ++ gfp_mask | __GFP_ZERO,
3546 ++ bfqd->queue->node);
3547 ++ }
3548 ++
3549 ++ if (bfqq != NULL) {
3550 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3551 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
3552 ++ } else {
3553 ++ bfqq = &bfqd->oom_bfqq;
3554 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3555 ++ }
3556 ++
3557 ++ bfq_init_prio_data(bfqq, bic);
3558 ++ bfq_init_entity(&bfqq->entity, bfqg);
3559 ++ }
3560 ++
3561 ++ if (new_bfqq != NULL)
3562 ++ kmem_cache_free(bfq_pool, new_bfqq);
3563 ++
3564 ++ return bfqq;
3565 ++}
3566 ++
3567 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3568 ++ struct bfq_group *bfqg,
3569 ++ int ioprio_class, int ioprio)
3570 ++{
3571 ++ switch (ioprio_class) {
3572 ++ case IOPRIO_CLASS_RT:
3573 ++ return &bfqg->async_bfqq[0][ioprio];
3574 ++ case IOPRIO_CLASS_NONE:
3575 ++ ioprio = IOPRIO_NORM;
3576 ++ /* fall through */
3577 ++ case IOPRIO_CLASS_BE:
3578 ++ return &bfqg->async_bfqq[1][ioprio];
3579 ++ case IOPRIO_CLASS_IDLE:
3580 ++ return &bfqg->async_idle_bfqq;
3581 ++ default:
3582 ++ BUG();
3583 ++ }
3584 ++}
3585 ++
3586 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3587 ++ struct bfq_group *bfqg, int is_sync,
3588 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
3589 ++{
3590 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3591 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3592 ++ struct bfq_queue **async_bfqq = NULL;
3593 ++ struct bfq_queue *bfqq = NULL;
3594 ++
3595 ++ if (!is_sync) {
3596 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3597 ++ ioprio);
3598 ++ bfqq = *async_bfqq;
3599 ++ }
3600 ++
3601 ++ if (bfqq == NULL)
3602 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3603 ++
3604 ++ /*
3605 ++ * Pin the queue now that it's allocated; scheduler exit will prune it.
3606 ++ */
3607 ++ if (!is_sync && *async_bfqq == NULL) {
3608 ++ atomic_inc(&bfqq->ref);
3609 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3610 ++ bfqq, atomic_read(&bfqq->ref));
3611 ++ *async_bfqq = bfqq;
3612 ++ }
3613 ++
3614 ++ atomic_inc(&bfqq->ref);
3615 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3616 ++ atomic_read(&bfqq->ref));
3617 ++ return bfqq;
3618 ++}
3619 ++
3620 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3621 ++ struct bfq_io_cq *bic)
3622 ++{
3623 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3624 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3625 ++
3626 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3627 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3628 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3629 ++ bic->ttime.ttime_samples;
3630 ++}
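
The arithmetic above is a fixed-point exponentially weighted moving average: each new think-time sample contributes with weight 1/8, the history keeps weight 7/8, and both counters carry a scale factor of 256 so that integer division loses little precision (the sample counter converges towards 256). A minimal user-space sketch of the same update rule, with names chosen here only for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long samples = 0, total = 0, mean;
	unsigned long observed[] = { 4, 4, 4, 12, 12, 12, 12, 12 };
	unsigned int i;

	for (i = 0; i < sizeof(observed) / sizeof(observed[0]); i++) {
		/* Same recurrences as bfq_update_io_thinktime() above. */
		samples = (7 * samples + 256) / 8;
		total = (7 * total + 256 * observed[i]) / 8;
		mean = (total + 128) / samples;
		printf("sample %u (value %lu): mean ~ %lu\n",
		       i + 1, observed[i], mean);
	}
	return 0;
}

The printed mean tracks the shift in the observed values with a lag of a few samples, which is the "recent behaviour wins" property the idle-window logic below relies on.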
3631 ++
3632 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
3633 ++ struct bfq_queue *bfqq,
3634 ++ struct request *rq)
3635 ++{
3636 ++ sector_t sdist;
3637 ++ u64 total;
3638 ++
3639 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
3640 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3641 ++ else
3642 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3643 ++
3644 ++ /*
3645 ++ * Don't allow the seek distance to get too large from the
3646 ++ * odd fragment, pagein, etc.
3647 ++ */
3648 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3649 ++ sdist = 0;
3650 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
3651 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3652 ++ else
3653 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3654 ++
3655 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3656 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3657 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
3658 ++ do_div(total, bfqq->seek_samples);
3659 ++ bfqq->seek_mean = (sector_t)total;
3660 ++
3661 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3662 ++ (u64)bfqq->seek_mean);
3663 ++}
3664 ++
3665 ++/*
3666 ++ * Disable idle window if the process thinks too long or seeks so much that
3667 ++ * it doesn't matter.
3668 ++ */
3669 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
3670 ++ struct bfq_queue *bfqq,
3671 ++ struct bfq_io_cq *bic)
3672 ++{
3673 ++ int enable_idle;
3674 ++
3675 ++ /* Don't idle for async or idle io prio class. */
3676 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3677 ++ return;
3678 ++
3679 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
3680 ++
3681 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3682 ++ bfqd->bfq_slice_idle == 0 ||
3683 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3684 ++ bfqq->raising_coeff == 1))
3685 ++ enable_idle = 0;
3686 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3687 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3688 ++ bfqq->raising_coeff == 1)
3689 ++ enable_idle = 0;
3690 ++ else
3691 ++ enable_idle = 1;
3692 ++ }
3693 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3694 ++ enable_idle);
3695 ++
3696 ++ if (enable_idle)
3697 ++ bfq_mark_bfqq_idle_window(bfqq);
3698 ++ else
3699 ++ bfq_clear_bfqq_idle_window(bfqq);
3700 ++}
3701 ++
3702 ++/*
3703 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
3704 ++ * something we should do about it.
3705 ++ */
3706 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3707 ++ struct request *rq)
3708 ++{
3709 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
3710 ++
3711 ++ if (rq->cmd_flags & REQ_META)
3712 ++ bfqq->meta_pending++;
3713 ++
3714 ++ bfq_update_io_thinktime(bfqd, bic);
3715 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
3716 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3717 ++ !BFQQ_SEEKY(bfqq))
3718 ++ bfq_update_idle_window(bfqd, bfqq, bic);
3719 ++
3720 ++ bfq_log_bfqq(bfqd, bfqq,
3721 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3722 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3723 ++ (long long unsigned)bfqq->seek_mean);
3724 ++
3725 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3726 ++
3727 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3728 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3729 ++ blk_rq_sectors(rq) < 32;
3730 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3731 ++
3732 ++ /*
3733 ++ * There is just this request queued: if the request
3734 ++ * is small and the queue is not to be expired, then
3735 ++ * just exit.
3736 ++ *
3737 ++ * In this way, if the disk is being idled to wait for
3738 ++ * a new request from the in-service queue, we avoid
3739 ++ * unplugging the device and committing the disk to serve
3740 ++ * just a small request. Instead, we wait for
3741 ++ * the block layer to decide when to unplug the device:
3742 ++ * hopefully, new requests will be merged to this one
3743 ++ * quickly, then the device will be unplugged and
3744 ++ * larger requests will be dispatched.
3745 ++ */
3746 ++ if (small_req && !budget_timeout)
3747 ++ return;
3748 ++
3749 ++ /*
3750 ++ * A large enough request arrived, or the queue is to
3751 ++ * be expired: in both cases disk idling is to be
3752 ++ * stopped, so clear wait_request flag and reset
3753 ++ * timer.
3754 ++ */
3755 ++ bfq_clear_bfqq_wait_request(bfqq);
3756 ++ del_timer(&bfqd->idle_slice_timer);
3757 ++
3758 ++ /*
3759 ++ * The queue is not empty, because a new request just
3760 ++ * arrived. Hence we can safely expire the queue, in
3761 ++ * case of budget timeout, without risking that the
3762 ++ * timestamps of the queue are not updated correctly.
3763 ++ * See [1] for more details.
3764 ++ */
3765 ++ if (budget_timeout)
3766 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3767 ++
3768 ++ /*
3769 ++ * Let the request rip immediately, or let a new queue be
3770 ++ * selected if bfqq has just been expired.
3771 ++ */
3772 ++ __blk_run_queue(bfqd->queue);
3773 ++ }
3774 ++}
3775 ++
3776 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
3777 ++{
3778 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3779 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3780 ++
3781 ++ assert_spin_locked(bfqd->queue->queue_lock);
3782 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3783 ++
3784 ++ bfq_add_rq_rb(rq);
3785 ++
3786 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3787 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
3788 ++
3789 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
3790 ++}
3791 ++
3792 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
3793 ++{
3794 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3795 ++ bfqd->rq_in_driver);
3796 ++
3797 ++ if (bfqd->hw_tag == 1)
3798 ++ return;
3799 ++
3800 ++ /*
3801 ++ * This sample is valid if the number of outstanding requests
3802 ++ * is large enough to allow a queueing behavior. Note that the
3803 ++ * sum is not exact, as it's not taking into account deactivated
3804 ++ * requests.
3805 ++ */
3806 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3807 ++ return;
3808 ++
3809 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3810 ++ return;
3811 ++
3812 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3813 ++ bfqd->max_rq_in_driver = 0;
3814 ++ bfqd->hw_tag_samples = 0;
3815 ++}
3816 ++
3817 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
3818 ++{
3819 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3820 ++ struct bfq_data *bfqd = bfqq->bfqd;
3821 ++ const int sync = rq_is_sync(rq);
3822 ++
3823 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3824 ++ blk_rq_sectors(rq), sync);
3825 ++
3826 ++ bfq_update_hw_tag(bfqd);
3827 ++
3828 ++ WARN_ON(!bfqd->rq_in_driver);
3829 ++ WARN_ON(!bfqq->dispatched);
3830 ++ bfqd->rq_in_driver--;
3831 ++ bfqq->dispatched--;
3832 ++
3833 ++ if (bfq_bfqq_sync(bfqq))
3834 ++ bfqd->sync_flight--;
3835 ++
3836 ++ if (sync)
3837 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3838 ++
3839 ++ /*
3840 ++ * The computation of softrt_next_start was scheduled for the next
3841 ++ * request completion: it is now time to compute it.
3842 ++ */
3843 ++ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
3844 ++ bfqq->soft_rt_next_start =
3845 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3846 ++
3847 ++ /*
3848 ++ * If this is the in-service queue, check if it needs to be expired,
3849 ++ * or if we want to idle in case it has no pending requests.
3850 ++ */
3851 ++ if (bfqd->in_service_queue == bfqq) {
3852 ++ if (bfq_bfqq_budget_new(bfqq))
3853 ++ bfq_set_budget_timeout(bfqd);
3854 ++
3855 ++ if (bfq_bfqq_must_idle(bfqq)) {
3856 ++ bfq_arm_slice_timer(bfqd);
3857 ++ goto out;
3858 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
3859 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3860 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3861 ++ (bfqq->dispatched == 0 ||
3862 ++ !bfq_bfqq_must_not_expire(bfqq)))
3863 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
3864 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
3865 ++ }
3866 ++
3867 ++ if (!bfqd->rq_in_driver)
3868 ++ bfq_schedule_dispatch(bfqd);
3869 ++
3870 ++out:
3871 ++ return;
3872 ++}
3873 ++
3874 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3875 ++{
3876 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3877 ++ bfq_clear_bfqq_must_alloc(bfqq);
3878 ++ return ELV_MQUEUE_MUST;
3879 ++ }
3880 ++
3881 ++ return ELV_MQUEUE_MAY;
3882 ++}
3883 ++
3884 ++static int bfq_may_queue(struct request_queue *q, int rw)
3885 ++{
3886 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3887 ++ struct task_struct *tsk = current;
3888 ++ struct bfq_io_cq *bic;
3889 ++ struct bfq_queue *bfqq;
3890 ++
3891 ++ /*
3892 ++ * Don't force setup of a queue from here, as a call to may_queue
3893 ++ * does not necessarily imply that a request actually will be queued.
3894 ++ * So just lookup a possibly existing queue, or return 'may queue'
3895 ++ * if that fails.
3896 ++ */
3897 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3898 ++ if (bic == NULL)
3899 ++ return ELV_MQUEUE_MAY;
3900 ++
3901 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3902 ++ if (bfqq != NULL) {
3903 ++ bfq_init_prio_data(bfqq, bic);
3904 ++
3905 ++ return __bfq_may_queue(bfqq);
3906 ++ }
3907 ++
3908 ++ return ELV_MQUEUE_MAY;
3909 ++}
3910 ++
3911 ++/*
3912 ++ * Queue lock held here.
3913 ++ */
3914 ++static void bfq_put_request(struct request *rq)
3915 ++{
3916 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3917 ++
3918 ++ if (bfqq != NULL) {
3919 ++ const int rw = rq_data_dir(rq);
3920 ++
3921 ++ BUG_ON(!bfqq->allocated[rw]);
3922 ++ bfqq->allocated[rw]--;
3923 ++
3924 ++ rq->elv.priv[0] = NULL;
3925 ++ rq->elv.priv[1] = NULL;
3926 ++
3927 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3928 ++ bfqq, atomic_read(&bfqq->ref));
3929 ++ bfq_put_queue(bfqq);
3930 ++ }
3931 ++}
3932 ++
3933 ++static struct bfq_queue *
3934 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3935 ++ struct bfq_queue *bfqq)
3936 ++{
3937 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3938 ++ (long unsigned)bfqq->new_bfqq->pid);
3939 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3940 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3941 ++ bfq_put_queue(bfqq);
3942 ++ return bic_to_bfqq(bic, 1);
3943 ++}
3944 ++
3945 ++/*
3946 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3947 ++ * was the last process referring to said bfqq.
3948 ++ */
3949 ++static struct bfq_queue *
3950 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3951 ++{
3952 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3953 ++ if (bfqq_process_refs(bfqq) == 1) {
3954 ++ bfqq->pid = current->pid;
3955 ++ bfq_clear_bfqq_coop(bfqq);
3956 ++ bfq_clear_bfqq_split_coop(bfqq);
3957 ++ return bfqq;
3958 ++ }
3959 ++
3960 ++ bic_set_bfqq(bic, NULL, 1);
3961 ++
3962 ++ bfq_put_cooperator(bfqq);
3963 ++
3964 ++ bfq_put_queue(bfqq);
3965 ++ return NULL;
3966 ++}
3967 ++
3968 ++/*
3969 ++ * Allocate bfq data structures associated with this request.
3970 ++ */
3971 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
3972 ++ struct bio *bio, gfp_t gfp_mask)
3973 ++{
3974 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3975 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3976 ++ const int rw = rq_data_dir(rq);
3977 ++ const int is_sync = rq_is_sync(rq);
3978 ++ struct bfq_queue *bfqq;
3979 ++ struct bfq_group *bfqg;
3980 ++ unsigned long flags;
3981 ++
3982 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
3983 ++
3984 ++ bfq_changed_ioprio(bic);
3985 ++
3986 ++ spin_lock_irqsave(q->queue_lock, flags);
3987 ++
3988 ++ if (bic == NULL)
3989 ++ goto queue_fail;
3990 ++
3991 ++ bfqg = bfq_bic_update_cgroup(bic);
3992 ++
3993 ++new_queue:
3994 ++ bfqq = bic_to_bfqq(bic, is_sync);
3995 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3996 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3997 ++ bic_set_bfqq(bic, bfqq, is_sync);
3998 ++ } else {
3999 ++ /*
4000 ++ * If the queue was seeky for too long, break it apart.
4001 ++ */
4002 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4003 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4004 ++ bfqq = bfq_split_bfqq(bic, bfqq);
4005 ++ if (!bfqq)
4006 ++ goto new_queue;
4007 ++ }
4008 ++
4009 ++ /*
4010 ++ * Check to see if this queue is scheduled to merge with
4011 ++ * another closely cooperating queue. The merging of queues
4012 ++ * happens here as it must be done in process context.
4013 ++ * The reference on new_bfqq was taken in merge_bfqqs.
4014 ++ */
4015 ++ if (bfqq->new_bfqq != NULL)
4016 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
4017 ++ }
4018 ++
4019 ++ bfqq->allocated[rw]++;
4020 ++ atomic_inc(&bfqq->ref);
4021 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
4022 ++ atomic_read(&bfqq->ref));
4023 ++
4024 ++ rq->elv.priv[0] = bic;
4025 ++ rq->elv.priv[1] = bfqq;
4026 ++
4027 ++ spin_unlock_irqrestore(q->queue_lock, flags);
4028 ++
4029 ++ return 0;
4030 ++
4031 ++queue_fail:
4032 ++ bfq_schedule_dispatch(bfqd);
4033 ++ spin_unlock_irqrestore(q->queue_lock, flags);
4034 ++
4035 ++ return 1;
4036 ++}
4037 ++
4038 ++static void bfq_kick_queue(struct work_struct *work)
4039 ++{
4040 ++ struct bfq_data *bfqd =
4041 ++ container_of(work, struct bfq_data, unplug_work);
4042 ++ struct request_queue *q = bfqd->queue;
4043 ++
4044 ++ spin_lock_irq(q->queue_lock);
4045 ++ __blk_run_queue(q);
4046 ++ spin_unlock_irq(q->queue_lock);
4047 ++}
4048 ++
4049 ++/*
4050 ++ * Handler of the expiration of the timer running if the in-service queue
4051 ++ * is idling inside its time slice.
4052 ++ */
4053 ++static void bfq_idle_slice_timer(unsigned long data)
4054 ++{
4055 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
4056 ++ struct bfq_queue *bfqq;
4057 ++ unsigned long flags;
4058 ++ enum bfqq_expiration reason;
4059 ++
4060 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
4061 ++
4062 ++ bfqq = bfqd->in_service_queue;
4063 ++ /*
4064 ++ * Theoretical race here: the in-service queue can be NULL or different
4065 ++ * from the queue that was idling if the timer handler spins on
4066 ++ * the queue_lock and a new request arrives for the current
4067 ++ * queue and there is a full dispatch cycle that changes the
4068 ++ * in-service queue. This can hardly happen, but in the worst case
4069 ++ * we just expire a queue too early.
4070 ++ */
4071 ++ if (bfqq != NULL) {
4072 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
4073 ++ if (bfq_bfqq_budget_timeout(bfqq))
4074 ++ /*
4075 ++ * Also here the queue can be safely expired
4076 ++ * for budget timeout without wasting
4077 ++ * guarantees
4078 ++ */
4079 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
4080 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4081 ++ /*
4082 ++ * The queue may not be empty upon timer expiration,
4083 ++ * because we may not disable the timer when the first
4084 ++ * request of the in-service queue arrives during
4085 ++ * disk idling
4086 ++ */
4087 ++ reason = BFQ_BFQQ_TOO_IDLE;
4088 ++ else
4089 ++ goto schedule_dispatch;
4090 ++
4091 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
4092 ++ }
4093 ++
4094 ++schedule_dispatch:
4095 ++ bfq_schedule_dispatch(bfqd);
4096 ++
4097 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
4098 ++}
4099 ++
4100 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
4101 ++{
4102 ++ del_timer_sync(&bfqd->idle_slice_timer);
4103 ++ cancel_work_sync(&bfqd->unplug_work);
4104 ++}
4105 ++
4106 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4107 ++ struct bfq_queue **bfqq_ptr)
4108 ++{
4109 ++ struct bfq_group *root_group = bfqd->root_group;
4110 ++ struct bfq_queue *bfqq = *bfqq_ptr;
4111 ++
4112 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4113 ++ if (bfqq != NULL) {
4114 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
4115 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4116 ++ bfqq, atomic_read(&bfqq->ref));
4117 ++ bfq_put_queue(bfqq);
4118 ++ *bfqq_ptr = NULL;
4119 ++ }
4120 ++}
4121 ++
4122 ++/*
4123 ++ * Release all the bfqg references to its async queues. If we are
4124 ++ * deallocating the group, these queues may still contain requests, so
4125 ++ * we reparent them to the root cgroup (i.e., the only one that will
4126 ++ * exist for sure until all the requests on a device are gone).
4127 ++ */
4128 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4129 ++{
4130 ++ int i, j;
4131 ++
4132 ++ for (i = 0; i < 2; i++)
4133 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
4134 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4135 ++
4136 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4137 ++}
4138 ++
4139 ++static void bfq_exit_queue(struct elevator_queue *e)
4140 ++{
4141 ++ struct bfq_data *bfqd = e->elevator_data;
4142 ++ struct request_queue *q = bfqd->queue;
4143 ++ struct bfq_queue *bfqq, *n;
4144 ++
4145 ++ bfq_shutdown_timer_wq(bfqd);
4146 ++
4147 ++ spin_lock_irq(q->queue_lock);
4148 ++
4149 ++ BUG_ON(bfqd->in_service_queue != NULL);
4150 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4151 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
4152 ++
4153 ++ bfq_disconnect_groups(bfqd);
4154 ++ spin_unlock_irq(q->queue_lock);
4155 ++
4156 ++ bfq_shutdown_timer_wq(bfqd);
4157 ++
4158 ++ synchronize_rcu();
4159 ++
4160 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
4161 ++
4162 ++ bfq_free_root_group(bfqd);
4163 ++ kfree(bfqd);
4164 ++}
4165 ++
4166 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4167 ++{
4168 ++ struct bfq_group *bfqg;
4169 ++ struct bfq_data *bfqd;
4170 ++ struct elevator_queue *eq;
4171 ++
4172 ++ eq = elevator_alloc(q, e);
4173 ++ if (eq == NULL)
4174 ++ return -ENOMEM;
4175 ++
4176 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4177 ++ if (bfqd == NULL) {
4178 ++ kobject_put(&eq->kobj);
4179 ++ return -ENOMEM;
4180 ++ }
4181 ++ eq->elevator_data = bfqd;
4182 ++
4183 ++ /*
4184 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4185 ++ * Grab a permanent reference to it, so that the normal code flow
4186 ++ * will not attempt to free it.
4187 ++ */
4188 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
4189 ++ atomic_inc(&bfqd->oom_bfqq.ref);
4190 ++
4191 ++ bfqd->queue = q;
4192 ++
4193 ++ spin_lock_irq(q->queue_lock);
4194 ++ q->elevator = eq;
4195 ++ spin_unlock_irq(q->queue_lock);
4196 ++
4197 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
4198 ++ if (bfqg == NULL) {
4199 ++ kfree(bfqd);
4200 ++ kobject_put(&eq->kobj);
4201 ++ return -ENOMEM;
4202 ++ }
4203 ++
4204 ++ bfqd->root_group = bfqg;
4205 ++
4206 ++ init_timer(&bfqd->idle_slice_timer);
4207 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4208 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4209 ++
4210 ++ bfqd->rq_pos_tree = RB_ROOT;
4211 ++
4212 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4213 ++
4214 ++ INIT_LIST_HEAD(&bfqd->active_list);
4215 ++ INIT_LIST_HEAD(&bfqd->idle_list);
4216 ++
4217 ++ bfqd->hw_tag = -1;
4218 ++
4219 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
4220 ++
4221 ++ bfqd->bfq_quantum = bfq_quantum;
4222 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4223 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4224 ++ bfqd->bfq_back_max = bfq_back_max;
4225 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
4226 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
4227 ++ bfqd->bfq_class_idle_last_service = 0;
4228 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4229 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4230 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4231 ++
4232 ++ bfqd->low_latency = true;
4233 ++
4234 ++ bfqd->bfq_raising_coeff = 20;
4235 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4236 ++ bfqd->bfq_raising_max_time = 0;
4237 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4238 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4239 ++ bfqd->bfq_raising_max_softrt_rate = 7000; /*
4240 ++ * Approximate rate required
4241 ++ * to play back or record a
4242 ++ * high-definition compressed
4243 ++ * video.
4244 ++ */
4245 ++ bfqd->raised_busy_queues = 0;
4246 ++
4247 ++ /* Initially estimate the device's peak rate as the reference rate */
4248 ++ if (blk_queue_nonrot(bfqd->queue)) {
4249 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
4250 ++ bfqd->peak_rate = R_nonrot;
4251 ++ } else {
4252 ++ bfqd->RT_prod = R_rot * T_rot;
4253 ++ bfqd->peak_rate = R_rot;
4254 ++ }
4255 ++
4256 ++ return 0;
4257 ++}
4258 ++
4259 ++static void bfq_slab_kill(void)
4260 ++{
4261 ++ if (bfq_pool != NULL)
4262 ++ kmem_cache_destroy(bfq_pool);
4263 ++}
4264 ++
4265 ++static int __init bfq_slab_setup(void)
4266 ++{
4267 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4268 ++ if (bfq_pool == NULL)
4269 ++ return -ENOMEM;
4270 ++ return 0;
4271 ++}
4272 ++
4273 ++static ssize_t bfq_var_show(unsigned int var, char *page)
4274 ++{
4275 ++ return sprintf(page, "%d\n", var);
4276 ++}
4277 ++
4278 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4279 ++{
4280 ++ unsigned long new_val;
4281 ++ int ret = kstrtoul(page, 10, &new_val);
4282 ++
4283 ++ if (ret == 0)
4284 ++ *var = new_val;
4285 ++
4286 ++ return count;
4287 ++}
4288 ++
4289 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4290 ++{
4291 ++ struct bfq_data *bfqd = e->elevator_data;
4292 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4293 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4294 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4295 ++}
4296 ++
4297 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4298 ++{
4299 ++ struct bfq_queue *bfqq;
4300 ++ struct bfq_data *bfqd = e->elevator_data;
4301 ++ ssize_t num_char = 0;
4302 ++
4303 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4304 ++ bfqd->queued);
4305 ++
4306 ++ spin_lock_irq(bfqd->queue->queue_lock);
4307 ++
4308 ++ num_char += sprintf(page + num_char, "Active:\n");
4309 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4310 ++ num_char += sprintf(page + num_char,
4311 ++ "pid%d: weight %hu, nr_queued %d %d,"
4312 ++ " dur %d/%u\n",
4313 ++ bfqq->pid,
4314 ++ bfqq->entity.weight,
4315 ++ bfqq->queued[0],
4316 ++ bfqq->queued[1],
4317 ++ jiffies_to_msecs(jiffies -
4318 ++ bfqq->last_rais_start_finish),
4319 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
4320 ++ }
4321 ++
4322 ++ num_char += sprintf(page + num_char, "Idle:\n");
4323 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4324 ++ num_char += sprintf(page + num_char,
4325 ++ "pid%d: weight %hu, dur %d/%u\n",
4326 ++ bfqq->pid,
4327 ++ bfqq->entity.weight,
4328 ++ jiffies_to_msecs(jiffies -
4329 ++ bfqq->last_rais_start_finish),
4330 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
4331 ++ }
4332 ++
4333 ++ spin_unlock_irq(bfqd->queue->queue_lock);
4334 ++
4335 ++ return num_char;
4336 ++}
4337 ++
4338 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4339 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4340 ++{ \
4341 ++ struct bfq_data *bfqd = e->elevator_data; \
4342 ++ unsigned int __data = __VAR; \
4343 ++ if (__CONV) \
4344 ++ __data = jiffies_to_msecs(__data); \
4345 ++ return bfq_var_show(__data, (page)); \
4346 ++}
4347 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4348 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4349 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4350 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4351 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4352 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4353 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4354 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4355 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4356 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4357 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4358 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4359 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4360 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4361 ++ 1);
4362 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4363 ++ bfqd->bfq_raising_min_inter_arr_async,
4364 ++ 1);
4365 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4366 ++ bfqd->bfq_raising_max_softrt_rate, 0);
4367 ++#undef SHOW_FUNCTION
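
To make the generated sysfs handlers easier to follow, this is what one instance of the macro, SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0), expands to (hand-expanded, whitespace aside):

static ssize_t bfq_quantum_show(struct elevator_queue *e, char *page)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned int __data = bfqd->bfq_quantum;
	if (0)
		__data = jiffies_to_msecs(__data);
	return bfq_var_show(__data, (page));
}

The __CONV argument only decides whether the stored jiffies value is converted to milliseconds before being shown; the compiler drops the dead branch when it is 0.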
4368 ++
4369 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4370 ++static ssize_t \
4371 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4372 ++{ \
4373 ++ struct bfq_data *bfqd = e->elevator_data; \
4374 ++ unsigned long uninitialized_var(__data); \
4375 ++ int ret = bfq_var_store(&__data, (page), count); \
4376 ++ if (__data < (MIN)) \
4377 ++ __data = (MIN); \
4378 ++ else if (__data > (MAX)) \
4379 ++ __data = (MAX); \
4380 ++ if (__CONV) \
4381 ++ *(__PTR) = msecs_to_jiffies(__data); \
4382 ++ else \
4383 ++ *(__PTR) = __data; \
4384 ++ return ret; \
4385 ++}
4386 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4387 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4388 ++ INT_MAX, 1);
4389 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4390 ++ INT_MAX, 1);
4391 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4392 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4393 ++ INT_MAX, 0);
4394 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4395 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4396 ++ 1, INT_MAX, 0);
4397 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4398 ++ INT_MAX, 1);
4399 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4400 ++ INT_MAX, 0);
4401 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4402 ++ INT_MAX, 1);
4403 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4404 ++ INT_MAX, 1);
4405 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
4406 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4407 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4408 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4409 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4410 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4411 ++#undef STORE_FUNCTION
4412 ++
4413 ++/* do nothing for the moment */
4414 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
4415 ++ const char *page, size_t count)
4416 ++{
4417 ++ return count;
4418 ++}
4419 ++
4420 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4421 ++{
4422 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4423 ++
4424 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4425 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4426 ++ else
4427 ++ return bfq_default_max_budget;
4428 ++}
4429 ++
4430 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4431 ++ const char *page, size_t count)
4432 ++{
4433 ++ struct bfq_data *bfqd = e->elevator_data;
4434 ++ unsigned long uninitialized_var(__data);
4435 ++ int ret = bfq_var_store(&__data, (page), count);
4436 ++
4437 ++ if (__data == 0)
4438 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4439 ++ else {
4440 ++ if (__data > INT_MAX)
4441 ++ __data = INT_MAX;
4442 ++ bfqd->bfq_max_budget = __data;
4443 ++ }
4444 ++
4445 ++ bfqd->bfq_user_max_budget = __data;
4446 ++
4447 ++ return ret;
4448 ++}
4449 ++
4450 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4451 ++ const char *page, size_t count)
4452 ++{
4453 ++ struct bfq_data *bfqd = e->elevator_data;
4454 ++ unsigned long uninitialized_var(__data);
4455 ++ int ret = bfq_var_store(&__data, (page), count);
4456 ++
4457 ++ if (__data < 1)
4458 ++ __data = 1;
4459 ++ else if (__data > INT_MAX)
4460 ++ __data = INT_MAX;
4461 ++
4462 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4463 ++ if (bfqd->bfq_user_max_budget == 0)
4464 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4465 ++
4466 ++ return ret;
4467 ++}
4468 ++
4469 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4470 ++ const char *page, size_t count)
4471 ++{
4472 ++ struct bfq_data *bfqd = e->elevator_data;
4473 ++ unsigned long uninitialized_var(__data);
4474 ++ int ret = bfq_var_store(&__data, (page), count);
4475 ++
4476 ++ if (__data > 1)
4477 ++ __data = 1;
4478 ++ if (__data == 0 && bfqd->low_latency != 0)
4479 ++ bfq_end_raising(bfqd);
4480 ++ bfqd->low_latency = __data;
4481 ++
4482 ++ return ret;
4483 ++}
4484 ++
4485 ++#define BFQ_ATTR(name) \
4486 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4487 ++
4488 ++static struct elv_fs_entry bfq_attrs[] = {
4489 ++ BFQ_ATTR(quantum),
4490 ++ BFQ_ATTR(fifo_expire_sync),
4491 ++ BFQ_ATTR(fifo_expire_async),
4492 ++ BFQ_ATTR(back_seek_max),
4493 ++ BFQ_ATTR(back_seek_penalty),
4494 ++ BFQ_ATTR(slice_idle),
4495 ++ BFQ_ATTR(max_budget),
4496 ++ BFQ_ATTR(max_budget_async_rq),
4497 ++ BFQ_ATTR(timeout_sync),
4498 ++ BFQ_ATTR(timeout_async),
4499 ++ BFQ_ATTR(low_latency),
4500 ++ BFQ_ATTR(raising_coeff),
4501 ++ BFQ_ATTR(raising_max_time),
4502 ++ BFQ_ATTR(raising_rt_max_time),
4503 ++ BFQ_ATTR(raising_min_idle_time),
4504 ++ BFQ_ATTR(raising_min_inter_arr_async),
4505 ++ BFQ_ATTR(raising_max_softrt_rate),
4506 ++ BFQ_ATTR(weights),
4507 ++ __ATTR_NULL
4508 ++};
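
Each BFQ_ATTR(name) entry wires the matching bfq_<name>_show/bfq_<name>_store pair defined above to a sysfs attribute; for instance BFQ_ATTR(quantum) expands to

	__ATTR(quantum, S_IRUGO|S_IWUSR, bfq_quantum_show, bfq_quantum_store)

so, once bfq is the active elevator, the tunable appears as a read/write file named "quantum" in the scheduler's iosched directory of the device's queue sysfs tree (typically /sys/block/<dev>/queue/iosched/).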
4509 ++
4510 ++static struct elevator_type iosched_bfq = {
4511 ++ .ops = {
4512 ++ .elevator_merge_fn = bfq_merge,
4513 ++ .elevator_merged_fn = bfq_merged_request,
4514 ++ .elevator_merge_req_fn = bfq_merged_requests,
4515 ++ .elevator_allow_merge_fn = bfq_allow_merge,
4516 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
4517 ++ .elevator_add_req_fn = bfq_insert_request,
4518 ++ .elevator_activate_req_fn = bfq_activate_request,
4519 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
4520 ++ .elevator_completed_req_fn = bfq_completed_request,
4521 ++ .elevator_former_req_fn = elv_rb_former_request,
4522 ++ .elevator_latter_req_fn = elv_rb_latter_request,
4523 ++ .elevator_init_icq_fn = bfq_init_icq,
4524 ++ .elevator_exit_icq_fn = bfq_exit_icq,
4525 ++ .elevator_set_req_fn = bfq_set_request,
4526 ++ .elevator_put_req_fn = bfq_put_request,
4527 ++ .elevator_may_queue_fn = bfq_may_queue,
4528 ++ .elevator_init_fn = bfq_init_queue,
4529 ++ .elevator_exit_fn = bfq_exit_queue,
4530 ++ },
4531 ++ .icq_size = sizeof(struct bfq_io_cq),
4532 ++ .icq_align = __alignof__(struct bfq_io_cq),
4533 ++ .elevator_attrs = bfq_attrs,
4534 ++ .elevator_name = "bfq",
4535 ++ .elevator_owner = THIS_MODULE,
4536 ++};
4537 ++
4538 ++static int __init bfq_init(void)
4539 ++{
4540 ++ /*
4541 ++ * Can be 0 on HZ < 1000 setups.
4542 ++ */
4543 ++ if (bfq_slice_idle == 0)
4544 ++ bfq_slice_idle = 1;
4545 ++
4546 ++ if (bfq_timeout_async == 0)
4547 ++ bfq_timeout_async = 1;
4548 ++
4549 ++ if (bfq_slab_setup())
4550 ++ return -ENOMEM;
4551 ++
4552 ++ elv_register(&iosched_bfq);
4553 ++ printk(KERN_INFO "BFQ I/O-scheduler version: v7\n");
4554 ++
4555 ++ return 0;
4556 ++}
4557 ++
4558 ++static void __exit bfq_exit(void)
4559 ++{
4560 ++ elv_unregister(&iosched_bfq);
4561 ++ bfq_slab_kill();
4562 ++}
4563 ++
4564 ++module_init(bfq_init);
4565 ++module_exit(bfq_exit);
4566 ++
4567 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4568 ++MODULE_LICENSE("GPL");
4569 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4570 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4571 +new file mode 100644
4572 +index 0000000..30df81c
4573 +--- /dev/null
4574 ++++ b/block/bfq-sched.c
4575 +@@ -0,0 +1,1077 @@
4576 ++/*
4577 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
4578 ++ *
4579 ++ * Based on ideas and code from CFQ:
4580 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4581 ++ *
4582 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4583 ++ * Paolo Valente <paolo.valente@×××××××.it>
4584 ++ *
4585 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4586 ++ */
4587 ++
4588 ++#ifdef CONFIG_CGROUP_BFQIO
4589 ++#define for_each_entity(entity) \
4590 ++ for (; entity != NULL; entity = entity->parent)
4591 ++
4592 ++#define for_each_entity_safe(entity, parent) \
4593 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4594 ++
4595 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4596 ++ int extract,
4597 ++ struct bfq_data *bfqd);
4598 ++
4599 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
4600 ++{
4601 ++ struct bfq_entity *bfqg_entity;
4602 ++ struct bfq_group *bfqg;
4603 ++ struct bfq_sched_data *group_sd;
4604 ++
4605 ++ BUG_ON(next_active == NULL);
4606 ++
4607 ++ group_sd = next_active->sched_data;
4608 ++
4609 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4610 ++ /*
4611 ++ * bfq_group's my_entity field is not NULL only if the group
4612 ++ * is not the root group. We must not touch the root entity
4613 ++ * as it must never become an active entity.
4614 ++ */
4615 ++ bfqg_entity = bfqg->my_entity;
4616 ++ if (bfqg_entity != NULL)
4617 ++ bfqg_entity->budget = next_active->budget;
4618 ++}
4619 ++
4620 ++static int bfq_update_next_active(struct bfq_sched_data *sd)
4621 ++{
4622 ++ struct bfq_entity *next_active;
4623 ++
4624 ++ if (sd->active_entity != NULL)
4625 ++ /* will update/requeue at the end of service */
4626 ++ return 0;
4627 ++
4628 ++ /*
4629 ++ * NOTE: this can be improved in many ways, such as returning
4630 ++ * 1 (and thus propagating upwards the update) only when the
4631 ++ * budget changes, or caching the bfqq that will be scheduled
4632 ++ * next from this subtree. For now we worry more about
4633 ++ * correctness than about performance...
4634 ++ */
4635 ++ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4636 ++ sd->next_active = next_active;
4637 ++
4638 ++ if (next_active != NULL)
4639 ++ bfq_update_budget(next_active);
4640 ++
4641 ++ return 1;
4642 ++}
4643 ++
4644 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4645 ++ struct bfq_entity *entity)
4646 ++{
4647 ++ BUG_ON(sd->next_active != entity);
4648 ++}
4649 ++#else
4650 ++#define for_each_entity(entity) \
4651 ++ for (; entity != NULL; entity = NULL)
4652 ++
4653 ++#define for_each_entity_safe(entity, parent) \
4654 ++ for (parent = NULL; entity != NULL; entity = parent)
4655 ++
4656 ++static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4657 ++{
4658 ++ return 0;
4659 ++}
4660 ++
4661 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4662 ++ struct bfq_entity *entity)
4663 ++{
4664 ++}
4665 ++
4666 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
4667 ++{
4668 ++}
4669 ++#endif
4670 ++
4671 ++/*
4672 ++ * Shift for timestamp calculations. This actually limits the maximum
4673 ++ * service allowed in one timestamp delta (small shift values increase it),
4674 ++ * the maximum total weight that can be used for the queues in the system
4675 ++ * (big shift values increase it), and the period of virtual time wraparounds.
4676 ++ */
4677 ++#define WFQ_SERVICE_SHIFT 22
4678 ++
4679 ++/**
4680 ++ * bfq_gt - compare two timestamps.
4681 ++ * @a: first ts.
4682 ++ * @b: second ts.
4683 ++ *
4684 ++ * Return @a > @b, dealing with wrapping correctly.
4685 ++ */
4686 ++static inline int bfq_gt(u64 a, u64 b)
4687 ++{
4688 ++ return (s64)(a - b) > 0;
4689 ++}
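
bfq_gt() uses the usual wrap-safe ordering trick: take the difference in unsigned arithmetic and reinterpret it as signed, so "a is later than b" stays correct even across a wraparound of the 64-bit virtual time. A standalone user-space sketch (ts_gt is just an illustrative stand-in for bfq_gt):

#include <stdint.h>
#include <stdio.h>

static int ts_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

int main(void)
{
	uint64_t before_wrap = UINT64_MAX - 5;	/* timestamp just before wraparound */
	uint64_t after_wrap = 10;		/* timestamp just after wraparound */

	printf("%d\n", ts_gt(after_wrap, before_wrap));	/* 1: still ordered correctly */
	printf("%d\n", ts_gt(before_wrap, after_wrap));	/* 0 */
	return 0;
}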
4690 ++
4691 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4692 ++{
4693 ++ struct bfq_queue *bfqq = NULL;
4694 ++
4695 ++ BUG_ON(entity == NULL);
4696 ++
4697 ++ if (entity->my_sched_data == NULL)
4698 ++ bfqq = container_of(entity, struct bfq_queue, entity);
4699 ++
4700 ++ return bfqq;
4701 ++}
4702 ++
4703 ++
4704 ++/**
4705 ++ * bfq_delta - map service into the virtual time domain.
4706 ++ * @service: amount of service.
4707 ++ * @weight: scale factor (weight of an entity or weight sum).
4708 ++ */
4709 ++static inline u64 bfq_delta(unsigned long service,
4710 ++ unsigned long weight)
4711 ++{
4712 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4713 ++
4714 ++ do_div(d, weight);
4715 ++ return d;
4716 ++}
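
bfq_delta() is the B-WF2Q+ mapping from service to virtual time: the charged service is scaled by 2^WFQ_SERVICE_SHIFT and divided by the weight, so for the same amount of service an entity with twice the weight advances its timestamps half as fast, which is what grants it twice the share. A standalone sketch (user-space C, delta() is an illustrative stand-in):

#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22

static uint64_t delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	/* Same service, two different weights. */
	printf("weight 10: %llu\n", (unsigned long long)delta(100, 10));
	printf("weight 20: %llu\n", (unsigned long long)delta(100, 20));
	/* The second number is exactly half the first. */
	return 0;
}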
4717 ++
4718 ++/**
4719 ++ * bfq_calc_finish - assign the finish time to an entity.
4720 ++ * @entity: the entity to act upon.
4721 ++ * @service: the service to be charged to the entity.
4722 ++ */
4723 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
4724 ++ unsigned long service)
4725 ++{
4726 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4727 ++
4728 ++ BUG_ON(entity->weight == 0);
4729 ++
4730 ++ entity->finish = entity->start +
4731 ++ bfq_delta(service, entity->weight);
4732 ++
4733 ++ if (bfqq != NULL) {
4734 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
4735 ++ "calc_finish: serv %lu, w %d",
4736 ++ service, entity->weight);
4737 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
4738 ++ "calc_finish: start %llu, finish %llu, delta %llu",
4739 ++ entity->start, entity->finish,
4740 ++ bfq_delta(service, entity->weight));
4741 ++ }
4742 ++}
4743 ++
4744 ++/**
4745 ++ * bfq_entity_of - get an entity from a node.
4746 ++ * @node: the node field of the entity.
4747 ++ *
4748 ++ * Convert a node pointer to the corresponding entity. This is used only
4749 ++ * to simplify the logic of some functions and not as the generic
4750 ++ * conversion mechanism because, e.g., in the tree walking functions,
4751 ++ * the check for a %NULL value would be redundant.
4752 ++ */
4753 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4754 ++{
4755 ++ struct bfq_entity *entity = NULL;
4756 ++
4757 ++ if (node != NULL)
4758 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
4759 ++
4760 ++ return entity;
4761 ++}
4762 ++
4763 ++/**
4764 ++ * bfq_extract - remove an entity from a tree.
4765 ++ * @root: the tree root.
4766 ++ * @entity: the entity to remove.
4767 ++ */
4768 ++static inline void bfq_extract(struct rb_root *root,
4769 ++ struct bfq_entity *entity)
4770 ++{
4771 ++ BUG_ON(entity->tree != root);
4772 ++
4773 ++ entity->tree = NULL;
4774 ++ rb_erase(&entity->rb_node, root);
4775 ++}
4776 ++
4777 ++/**
4778 ++ * bfq_idle_extract - extract an entity from the idle tree.
4779 ++ * @st: the service tree of the owning @entity.
4780 ++ * @entity: the entity being removed.
4781 ++ */
4782 ++static void bfq_idle_extract(struct bfq_service_tree *st,
4783 ++ struct bfq_entity *entity)
4784 ++{
4785 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4786 ++ struct rb_node *next;
4787 ++
4788 ++ BUG_ON(entity->tree != &st->idle);
4789 ++
4790 ++ if (entity == st->first_idle) {
4791 ++ next = rb_next(&entity->rb_node);
4792 ++ st->first_idle = bfq_entity_of(next);
4793 ++ }
4794 ++
4795 ++ if (entity == st->last_idle) {
4796 ++ next = rb_prev(&entity->rb_node);
4797 ++ st->last_idle = bfq_entity_of(next);
4798 ++ }
4799 ++
4800 ++ bfq_extract(&st->idle, entity);
4801 ++
4802 ++ if (bfqq != NULL)
4803 ++ list_del(&bfqq->bfqq_list);
4804 ++}
4805 ++
4806 ++/**
4807 ++ * bfq_insert - generic tree insertion.
4808 ++ * @root: tree root.
4809 ++ * @entity: entity to insert.
4810 ++ *
4811 ++ * This is used for the idle and the active tree, since they are both
4812 ++ * ordered by finish time.
4813 ++ */
4814 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4815 ++{
4816 ++ struct bfq_entity *entry;
4817 ++ struct rb_node **node = &root->rb_node;
4818 ++ struct rb_node *parent = NULL;
4819 ++
4820 ++ BUG_ON(entity->tree != NULL);
4821 ++
4822 ++ while (*node != NULL) {
4823 ++ parent = *node;
4824 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
4825 ++
4826 ++ if (bfq_gt(entry->finish, entity->finish))
4827 ++ node = &parent->rb_left;
4828 ++ else
4829 ++ node = &parent->rb_right;
4830 ++ }
4831 ++
4832 ++ rb_link_node(&entity->rb_node, parent, node);
4833 ++ rb_insert_color(&entity->rb_node, root);
4834 ++
4835 ++ entity->tree = root;
4836 ++}
4837 ++
4838 ++/**
4839 ++ * bfq_update_min - update the min_start field of an entity.
4840 ++ * @entity: the entity to update.
4841 ++ * @node: one of its children.
4842 ++ *
4843 ++ * This function is called when @entity may store an invalid value for
4844 ++ * min_start due to updates to the active tree. The function assumes
4845 ++ * that the subtree rooted at @node (which may be its left or its right
4846 ++ * child) has a valid min_start value.
4847 ++ */
4848 ++static inline void bfq_update_min(struct bfq_entity *entity,
4849 ++ struct rb_node *node)
4850 ++{
4851 ++ struct bfq_entity *child;
4852 ++
4853 ++ if (node != NULL) {
4854 ++ child = rb_entry(node, struct bfq_entity, rb_node);
4855 ++ if (bfq_gt(entity->min_start, child->min_start))
4856 ++ entity->min_start = child->min_start;
4857 ++ }
4858 ++}
4859 ++
4860 ++/**
4861 ++ * bfq_update_active_node - recalculate min_start.
4862 ++ * @node: the node to update.
4863 ++ *
4864 ++ * @node may have changed position or one of its children may have moved;
4865 ++ * this function updates its min_start value. The left and right subtrees
4866 ++ * are assumed to hold a correct min_start value.
4867 ++ */
4868 ++static inline void bfq_update_active_node(struct rb_node *node)
4869 ++{
4870 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4871 ++
4872 ++ entity->min_start = entity->start;
4873 ++ bfq_update_min(entity, node->rb_right);
4874 ++ bfq_update_min(entity, node->rb_left);
4875 ++}
4876 ++
4877 ++/**
4878 ++ * bfq_update_active_tree - update min_start for the whole active tree.
4879 ++ * @node: the starting node.
4880 ++ *
4881 ++ * @node must be the deepest modified node after an update. This function
4882 ++ * updates its min_start using the values held by its children, assuming
4883 ++ * that they did not change, and then updates all the nodes that may have
4884 ++ * changed in the path to the root. The only nodes that may have changed
4885 ++ * are the ones in the path or their siblings.
4886 ++ */
4887 ++static void bfq_update_active_tree(struct rb_node *node)
4888 ++{
4889 ++ struct rb_node *parent;
4890 ++
4891 ++up:
4892 ++ bfq_update_active_node(node);
4893 ++
4894 ++ parent = rb_parent(node);
4895 ++ if (parent == NULL)
4896 ++ return;
4897 ++
4898 ++ if (node == parent->rb_left && parent->rb_right != NULL)
4899 ++ bfq_update_active_node(parent->rb_right);
4900 ++ else if (parent->rb_left != NULL)
4901 ++ bfq_update_active_node(parent->rb_left);
4902 ++
4903 ++ node = parent;
4904 ++ goto up;
4905 ++}
4906 ++
4907 ++/**
4908 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
4909 ++ * @st: the service tree of the entity.
4910 ++ * @entity: the entity being inserted.
4911 ++ *
4912 ++ * The active tree is ordered by finish time, but an extra key is kept
4913 ++ * for each node, containing the minimum value for the start times of
4914 ++ * its children (and the node itself), so it's possible to search for
4915 ++ * the eligible node with the lowest finish time in logarithmic time.
4916 ++ */
4917 ++static void bfq_active_insert(struct bfq_service_tree *st,
4918 ++ struct bfq_entity *entity)
4919 ++{
4920 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4921 ++ struct rb_node *node = &entity->rb_node;
4922 ++
4923 ++ bfq_insert(&st->active, entity);
4924 ++
4925 ++ if (node->rb_left != NULL)
4926 ++ node = node->rb_left;
4927 ++ else if (node->rb_right != NULL)
4928 ++ node = node->rb_right;
4929 ++
4930 ++ bfq_update_active_tree(node);
4931 ++
4932 ++ if (bfqq != NULL)
4933 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4934 ++}
4935 ++
4936 ++/**
4937 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4938 ++ * @ioprio: the ioprio value to convert.
4939 ++ */
4940 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
4941 ++{
4942 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4943 ++ return IOPRIO_BE_NR - ioprio;
4944 ++}
4945 ++
4946 ++/**
4947 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4948 ++ * @weight: the weight value to convert.
4949 ++ *
4950 ++ * To preserve as much as possible the old only-ioprio user interface,
4951 ++ * 0 is used as an escape ioprio value for weights (numerically) equal
4952 ++ * to or larger than IOPRIO_BE_NR.
4953 ++ */
4954 ++static unsigned short bfq_weight_to_ioprio(int weight)
4955 ++{
4956 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4957 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4958 ++}
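/*
 * Mapping sketch (assuming the usual IOPRIO_BE_NR of 8): ioprios 0..7 map
 * to weights 8..1, so bfq_ioprio_to_weight(0) == 8 (highest priority,
 * largest share) and bfq_ioprio_to_weight(7) == 1. In the other
 * direction, bfq_weight_to_ioprio(6) == 2, while any weight greater than
 * or equal to IOPRIO_BE_NR collapses to the escape ioprio 0, e.g.,
 * bfq_weight_to_ioprio(100) == 0.
 */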
4959 ++
4960 ++static inline void bfq_get_entity(struct bfq_entity *entity)
4961 ++{
4962 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4963 ++ struct bfq_sched_data *sd;
4964 ++
4965 ++ if (bfqq != NULL) {
4966 ++ sd = entity->sched_data;
4967 ++ atomic_inc(&bfqq->ref);
4968 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4969 ++ bfqq, atomic_read(&bfqq->ref));
4970 ++ }
4971 ++}
4972 ++
4973 ++/**
4974 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
4975 ++ * @node: the node being removed.
4976 ++ *
4977 ++ * Do the first step of an extraction in an rb tree, looking for the
4978 ++ * node that will replace @node, and returning the deepest node that
4979 ++ * the following modifications to the tree can touch. If @node is the
4980 ++ * last node in the tree return %NULL.
4981 ++ */
4982 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
4983 ++{
4984 ++ struct rb_node *deepest;
4985 ++
4986 ++ if (node->rb_right == NULL && node->rb_left == NULL)
4987 ++ deepest = rb_parent(node);
4988 ++ else if (node->rb_right == NULL)
4989 ++ deepest = node->rb_left;
4990 ++ else if (node->rb_left == NULL)
4991 ++ deepest = node->rb_right;
4992 ++ else {
4993 ++ deepest = rb_next(node);
4994 ++ if (deepest->rb_right != NULL)
4995 ++ deepest = deepest->rb_right;
4996 ++ else if (rb_parent(deepest) != node)
4997 ++ deepest = rb_parent(deepest);
4998 ++ }
4999 ++
5000 ++ return deepest;
5001 ++}
5002 ++
5003 ++/**
5004 ++ * bfq_active_extract - remove an entity from the active tree.
5005 ++ * @st: the service_tree containing the tree.
5006 ++ * @entity: the entity being removed.
5007 ++ */
5008 ++static void bfq_active_extract(struct bfq_service_tree *st,
5009 ++ struct bfq_entity *entity)
5010 ++{
5011 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5012 ++ struct rb_node *node;
5013 ++
5014 ++ node = bfq_find_deepest(&entity->rb_node);
5015 ++ bfq_extract(&st->active, entity);
5016 ++
5017 ++ if (node != NULL)
5018 ++ bfq_update_active_tree(node);
5019 ++
5020 ++ if (bfqq != NULL)
5021 ++ list_del(&bfqq->bfqq_list);
5022 ++}
5023 ++
5024 ++/**
5025 ++ * bfq_idle_insert - insert an entity into the idle tree.
5026 ++ * @st: the service tree containing the tree.
5027 ++ * @entity: the entity to insert.
5028 ++ */
5029 ++static void bfq_idle_insert(struct bfq_service_tree *st,
5030 ++ struct bfq_entity *entity)
5031 ++{
5032 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5033 ++ struct bfq_entity *first_idle = st->first_idle;
5034 ++ struct bfq_entity *last_idle = st->last_idle;
5035 ++
5036 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
5037 ++ st->first_idle = entity;
5038 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
5039 ++ st->last_idle = entity;
5040 ++
5041 ++ bfq_insert(&st->idle, entity);
5042 ++
5043 ++ if (bfqq != NULL)
5044 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
5045 ++}
5046 ++
5047 ++/**
5048 ++ * bfq_forget_entity - remove an entity from the wfq trees.
5049 ++ * @st: the service tree.
5050 ++ * @entity: the entity being removed.
5051 ++ *
5052 ++ * Update the device status and forget everything about @entity, putting
5053 ++ * the device reference to it, if it is a queue. Entities belonging to
5054 ++ * groups are not refcounted.
5055 ++ */
5056 ++static void bfq_forget_entity(struct bfq_service_tree *st,
5057 ++ struct bfq_entity *entity)
5058 ++{
5059 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5060 ++ struct bfq_sched_data *sd;
5061 ++
5062 ++ BUG_ON(!entity->on_st);
5063 ++
5064 ++ entity->on_st = 0;
5065 ++ st->wsum -= entity->weight;
5066 ++ if (bfqq != NULL) {
5067 ++ sd = entity->sched_data;
5068 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
5069 ++ bfqq, atomic_read(&bfqq->ref));
5070 ++ bfq_put_queue(bfqq);
5071 ++ }
5072 ++}
5073 ++
5074 ++/**
5075 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
5076 ++ * @st: service tree for the entity.
5077 ++ * @entity: the entity being released.
5078 ++ */
5079 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
5080 ++ struct bfq_entity *entity)
5081 ++{
5082 ++ bfq_idle_extract(st, entity);
5083 ++ bfq_forget_entity(st, entity);
5084 ++}
5085 ++
5086 ++/**
5087 ++ * bfq_forget_idle - update the idle tree if necessary.
5088 ++ * @st: the service tree to act upon.
5089 ++ *
5090 ++ * To preserve the global O(log N) complexity we only remove one entry here;
5091 ++ * as the idle tree will not grow indefinitely this can be done safely.
5092 ++ */
5093 ++static void bfq_forget_idle(struct bfq_service_tree *st)
5094 ++{
5095 ++ struct bfq_entity *first_idle = st->first_idle;
5096 ++ struct bfq_entity *last_idle = st->last_idle;
5097 ++
5098 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
5099 ++ !bfq_gt(last_idle->finish, st->vtime)) {
5100 ++ /*
5101 ++ * Forget the whole idle tree, increasing the vtime past
5102 ++ * the last finish time of idle entities.
5103 ++ */
5104 ++ st->vtime = last_idle->finish;
5105 ++ }
5106 ++
5107 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
5108 ++ bfq_put_idle_entity(st, first_idle);
5109 ++}
5110 ++
5111 ++static struct bfq_service_tree *
5112 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
5113 ++ struct bfq_entity *entity)
5114 ++{
5115 ++ struct bfq_service_tree *new_st = old_st;
5116 ++
5117 ++ if (entity->ioprio_changed) {
5118 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5119 ++
5120 ++ BUG_ON(old_st->wsum < entity->weight);
5121 ++ old_st->wsum -= entity->weight;
5122 ++
5123 ++ if (entity->new_weight != entity->orig_weight) {
5124 ++ entity->orig_weight = entity->new_weight;
5125 ++ entity->ioprio =
5126 ++ bfq_weight_to_ioprio(entity->orig_weight);
5127 ++ } else if (entity->new_ioprio != entity->ioprio) {
5128 ++ entity->ioprio = entity->new_ioprio;
5129 ++ entity->orig_weight =
5130 ++ bfq_ioprio_to_weight(entity->ioprio);
5131 ++ } else
5132 ++ entity->new_weight = entity->orig_weight =
5133 ++ bfq_ioprio_to_weight(entity->ioprio);
5134 ++
5135 ++ entity->ioprio_class = entity->new_ioprio_class;
5136 ++ entity->ioprio_changed = 0;
5137 ++
5138 ++ /*
5139 ++ * NOTE: here we may be changing the weight too early;
5140 ++ * this will cause unfairness. The correct approach
5141 ++ * would have required additional complexity to defer
5142 ++ * weight changes to the proper time instants (i.e.,
5143 ++ * when entity->finish <= old_st->vtime).
5144 ++ */
5145 ++ new_st = bfq_entity_service_tree(entity);
5146 ++ entity->weight = entity->orig_weight *
5147 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
5148 ++ new_st->wsum += entity->weight;
5149 ++
5150 ++ if (new_st != old_st)
5151 ++ entity->start = new_st->vtime;
5152 ++ }
5153 ++
5154 ++ return new_st;
5155 ++}
5156 ++
5157 ++/**
5158 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
5159 ++ * @bfqq: the queue being served.
5160 ++ * @served: bytes to transfer.
5161 ++ *
5162 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
5163 ++ * are synchronized every time a new bfqq is selected for service. For now,
5164 ++ * we keep it to better check consistency.
5165 ++ */
5166 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
5167 ++{
5168 ++ struct bfq_entity *entity = &bfqq->entity;
5169 ++ struct bfq_service_tree *st;
5170 ++
5171 ++ for_each_entity(entity) {
5172 ++ st = bfq_entity_service_tree(entity);
5173 ++
5174 ++ entity->service += served;
5175 ++ BUG_ON(entity->service > entity->budget);
5176 ++ BUG_ON(st->wsum == 0);
5177 ++
5178 ++ st->vtime += bfq_delta(served, st->wsum);
5179 ++ bfq_forget_idle(st);
5180 ++ }
5181 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
5182 ++}
5183 ++
5184 ++/**
5185 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
5186 ++ * @bfqq: the queue that needs a service update.
5187 ++ *
5188 ++ * When it's not possible to be fair in the service domain, because
5189 ++ * a queue is not consuming its budget fast enough (the meaning of
5190 ++ * fast depends on the timeout parameter), we charge it a full
5191 ++ * budget. In this way we should obtain a sort of time-domain
5192 ++ * fairness among all the seeky/slow queues.
5193 ++ */
5194 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5195 ++{
5196 ++ struct bfq_entity *entity = &bfqq->entity;
5197 ++
5198 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5199 ++
5200 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5201 ++}
5202 ++
5203 ++/**
5204 ++ * __bfq_activate_entity - activate an entity.
5205 ++ * @entity: the entity being activated.
5206 ++ *
5207 ++ * Called whenever an entity is activated, i.e., it is not active and one
5208 ++ * of its children receives a new request, or has to be reactivated due to
5209 ++ * budget exhaustion. It uses the current budget of the entity (and the
5210 ++ * service received, if @entity is in service) to calculate its
5211 ++ * timestamps.
5212 ++ */
5213 ++static void __bfq_activate_entity(struct bfq_entity *entity)
5214 ++{
5215 ++ struct bfq_sched_data *sd = entity->sched_data;
5216 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5217 ++
5218 ++ if (entity == sd->active_entity) {
5219 ++ BUG_ON(entity->tree != NULL);
5220 ++ /*
5221 ++ * If we are requeueing the current entity we have
5222 ++ * to take care not to charge it for service it
5223 ++ * has not received.
5224 ++ */
5225 ++ bfq_calc_finish(entity, entity->service);
5226 ++ entity->start = entity->finish;
5227 ++ sd->active_entity = NULL;
5228 ++ } else if (entity->tree == &st->active) {
5229 ++ /*
5230 ++ * Requeueing an entity due to a change of some
5231 ++ * next_active entity below it. We reuse the old
5232 ++ * start time.
5233 ++ */
5234 ++ bfq_active_extract(st, entity);
5235 ++ } else if (entity->tree == &st->idle) {
5236 ++ /*
5237 ++ * Must be on the idle tree, bfq_idle_extract() will
5238 ++ * check for that.
5239 ++ */
5240 ++ bfq_idle_extract(st, entity);
5241 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
5242 ++ st->vtime : entity->finish;
5243 ++ } else {
5244 ++ /*
5245 ++ * The finish time of the entity may be invalid, and
5246 ++ * it is in the past for sure, otherwise the queue
5247 ++ * would have been on the idle tree.
5248 ++ */
5249 ++ entity->start = st->vtime;
5250 ++ st->wsum += entity->weight;
5251 ++ bfq_get_entity(entity);
5252 ++
5253 ++ BUG_ON(entity->on_st);
5254 ++ entity->on_st = 1;
5255 ++ }
5256 ++
5257 ++ st = __bfq_entity_update_weight_prio(st, entity);
5258 ++ bfq_calc_finish(entity, entity->budget);
5259 ++ bfq_active_insert(st, entity);
5260 ++}
5261 ++
5262 ++/**
5263 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5264 ++ * @entity: the entity to activate.
5265 ++ *
5266 ++ * Activate @entity and all the entities on the path from it to the root.
5267 ++ */
5268 ++static void bfq_activate_entity(struct bfq_entity *entity)
5269 ++{
5270 ++ struct bfq_sched_data *sd;
5271 ++
5272 ++ for_each_entity(entity) {
5273 ++ __bfq_activate_entity(entity);
5274 ++
5275 ++ sd = entity->sched_data;
5276 ++ if (!bfq_update_next_active(sd))
5277 ++ /*
5278 ++ * No need to propagate the activation to the
5279 ++ * upper entities, as they will be updated when
5280 ++ * the active entity is rescheduled.
5281 ++ */
5282 ++ break;
5283 ++ }
5284 ++}
5285 ++
5286 ++/**
5287 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5288 ++ * @entity: the entity to deactivate.
5289 ++ * @requeue: if false, the entity will not be put into the idle tree.
5290 ++ *
5291 ++ * Deactivate an entity, independently from its previous state. If the
5292 ++ * entity was not on a service tree just return, otherwise if it is on
5293 ++ * any scheduler tree, extract it from that tree, and if necessary,
5294 ++ * i.e., if the caller specified @requeue, put it on the idle tree.
5295 ++ *
5296 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
5297 ++ * if the entity was under service or if it was the next_active for
5298 ++ * its sched_data; return %0 otherwise.
5299 ++ */
5300 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5301 ++{
5302 ++ struct bfq_sched_data *sd = entity->sched_data;
5303 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5304 ++ int was_active = entity == sd->active_entity;
5305 ++ int ret = 0;
5306 ++
5307 ++ if (!entity->on_st)
5308 ++ return 0;
5309 ++
5310 ++ BUG_ON(was_active && entity->tree != NULL);
5311 ++
5312 ++ if (was_active) {
5313 ++ bfq_calc_finish(entity, entity->service);
5314 ++ sd->active_entity = NULL;
5315 ++ } else if (entity->tree == &st->active)
5316 ++ bfq_active_extract(st, entity);
5317 ++ else if (entity->tree == &st->idle)
5318 ++ bfq_idle_extract(st, entity);
5319 ++ else if (entity->tree != NULL)
5320 ++ BUG();
5321 ++
5322 ++ if (was_active || sd->next_active == entity)
5323 ++ ret = bfq_update_next_active(sd);
5324 ++
5325 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5326 ++ bfq_forget_entity(st, entity);
5327 ++ else
5328 ++ bfq_idle_insert(st, entity);
5329 ++
5330 ++ BUG_ON(sd->active_entity == entity);
5331 ++ BUG_ON(sd->next_active == entity);
5332 ++
5333 ++ return ret;
5334 ++}
5335 ++
5336 ++/**
5337 ++ * bfq_deactivate_entity - deactivate an entity.
5338 ++ * @entity: the entity to deactivate.
5339 ++ * @requeue: true if the entity can be put on the idle tree
5340 ++ */
5341 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5342 ++{
5343 ++ struct bfq_sched_data *sd;
5344 ++ struct bfq_entity *parent;
5345 ++
5346 ++ for_each_entity_safe(entity, parent) {
5347 ++ sd = entity->sched_data;
5348 ++
5349 ++ if (!__bfq_deactivate_entity(entity, requeue))
5350 ++ /*
5351 ++ * The parent entity is still backlogged, and
5352 ++ * we don't need to update it as it is still
5353 ++ * under service.
5354 ++ */
5355 ++ break;
5356 ++
5357 ++ if (sd->next_active != NULL)
5358 ++ /*
5359 ++ * The parent entity is still backlogged and
5360 ++ * the budgets on the path towards the root
5361 ++ * need to be updated.
5362 ++ */
5363 ++ goto update;
5364 ++
5365 ++ /*
5366 ++ * If we get here, the parent is no longer backlogged and
5367 ++ * we want to propagate the dequeue upwards.
5368 ++ */
5369 ++ requeue = 1;
5370 ++ }
5371 ++
5372 ++ return;
5373 ++
5374 ++update:
5375 ++ entity = parent;
5376 ++ for_each_entity(entity) {
5377 ++ __bfq_activate_entity(entity);
5378 ++
5379 ++ sd = entity->sched_data;
5380 ++ if (!bfq_update_next_active(sd))
5381 ++ break;
5382 ++ }
5383 ++}
5384 ++
5385 ++/**
5386 ++ * bfq_update_vtime - update vtime if necessary.
5387 ++ * @st: the service tree to act upon.
5388 ++ *
5389 ++ * If necessary update the service tree vtime to have at least one
5390 ++ * eligible entity, skipping to its start time. Assumes that the
5391 ++ * active tree of the device is not empty.
5392 ++ *
5393 ++ * NOTE: this hierarchical implementation updates vtimes quite often, so
5394 ++ * we may end up with reactivated tasks getting timestamps after a
5395 ++ * vtime skip done because we needed a ->first_active entity on some
5396 ++ * intermediate node.
5397 ++ */
5398 ++static void bfq_update_vtime(struct bfq_service_tree *st)
5399 ++{
5400 ++ struct bfq_entity *entry;
5401 ++ struct rb_node *node = st->active.rb_node;
5402 ++
5403 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
5404 ++ if (bfq_gt(entry->min_start, st->vtime)) {
5405 ++ st->vtime = entry->min_start;
5406 ++ bfq_forget_idle(st);
5407 ++ }
5408 ++}
5409 ++
5410 ++/**
5411 ++ * bfq_first_active_entity - find the eligible entity with the smallest finish time.
5412 ++ * @st: the service tree to select from.
5413 ++ *
5414 ++ * This function searches for the first schedulable entity, starting from
5415 ++ * the root of the tree and going left whenever the left subtree contains
5416 ++ * at least one eligible (start <= vtime) entity. The path on the right is
5417 ++ * followed only if a) the left subtree contains no eligible entities and
5418 ++ * b) no eligible entity has been found yet.
5419 ++ */
5420 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5421 ++{
5422 ++ struct bfq_entity *entry, *first = NULL;
5423 ++ struct rb_node *node = st->active.rb_node;
5424 ++
5425 ++ while (node != NULL) {
5426 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
5427 ++left:
5428 ++ if (!bfq_gt(entry->start, st->vtime))
5429 ++ first = entry;
5430 ++
5431 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5432 ++
5433 ++ if (node->rb_left != NULL) {
5434 ++ entry = rb_entry(node->rb_left,
5435 ++ struct bfq_entity, rb_node);
5436 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
5437 ++ node = node->rb_left;
5438 ++ goto left;
5439 ++ }
5440 ++ }
5441 ++ if (first != NULL)
5442 ++ break;
5443 ++ node = node->rb_right;
5444 ++ }
5445 ++
5446 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5447 ++ return first;
5448 ++}
5449 ++
5450 ++/**
5451 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5452 ++ * @st: the service tree.
5453 ++ *
5454 ++ * Update the virtual time in @st and return the first eligible entity
5455 ++ * it contains.
5456 ++ */
5457 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5458 ++ bool force)
5459 ++{
5460 ++ struct bfq_entity *entity, *new_next_active = NULL;
5461 ++
5462 ++ if (RB_EMPTY_ROOT(&st->active))
5463 ++ return NULL;
5464 ++
5465 ++ bfq_update_vtime(st);
5466 ++ entity = bfq_first_active_entity(st);
5467 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
5468 ++
5469 ++ /*
5470 ++ * If the chosen entity does not match with the sched_data's
5471 ++ * next_active and we are forcedly serving the IDLE priority
5472 ++ * class tree, bubble up budget update.
5473 ++ */
5474 ++ if (unlikely(force && entity != entity->sched_data->next_active)) {
5475 ++ new_next_active = entity;
5476 ++ for_each_entity(new_next_active)
5477 ++ bfq_update_budget(new_next_active);
5478 ++ }
5479 ++
5480 ++ return entity;
5481 ++}
5482 ++
5483 ++/**
5484 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5485 ++ * @sd: the sched_data.
5486 ++ * @extract: if true the returned entity will be also extracted from @sd.
5487 ++ *
5488 ++ * NOTE: since we cache the next_active entity at each level of the
5489 ++ * hierarchy, the complexity of the lookup can be decreased with
5490 ++ * absolutely no effort just returning the cached next_active value;
5491 ++ * we prefer to do full lookups to test the consistency of the data
5492 ++ * structures.
5493 ++ */
5494 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5495 ++ int extract,
5496 ++ struct bfq_data *bfqd)
5497 ++{
5498 ++ struct bfq_service_tree *st = sd->service_tree;
5499 ++ struct bfq_entity *entity;
5500 ++ int i = 0;
5501 ++
5502 ++ BUG_ON(sd->active_entity != NULL);
5503 ++
5504 ++ if (bfqd != NULL &&
5505 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5506 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
5507 ++ true);
5508 ++ if (entity != NULL) {
5509 ++ i = BFQ_IOPRIO_CLASSES - 1;
5510 ++ bfqd->bfq_class_idle_last_service = jiffies;
5511 ++ sd->next_active = entity;
5512 ++ }
5513 ++ }
5514 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5515 ++ entity = __bfq_lookup_next_entity(st + i, false);
5516 ++ if (entity != NULL) {
5517 ++ if (extract) {
5518 ++ bfq_check_next_active(sd, entity);
5519 ++ bfq_active_extract(st + i, entity);
5520 ++ sd->active_entity = entity;
5521 ++ sd->next_active = NULL;
5522 ++ }
5523 ++ break;
5524 ++ }
5525 ++ }
5526 ++
5527 ++ return entity;
5528 ++}
5529 ++
5530 ++/*
5531 ++ * Get next queue for service.
5532 ++ */
5533 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5534 ++{
5535 ++ struct bfq_entity *entity = NULL;
5536 ++ struct bfq_sched_data *sd;
5537 ++ struct bfq_queue *bfqq;
5538 ++
5539 ++ BUG_ON(bfqd->in_service_queue != NULL);
5540 ++
5541 ++ if (bfqd->busy_queues == 0)
5542 ++ return NULL;
5543 ++
5544 ++ sd = &bfqd->root_group->sched_data;
5545 ++ for (; sd != NULL; sd = entity->my_sched_data) {
5546 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5547 ++ BUG_ON(entity == NULL);
5548 ++ entity->service = 0;
5549 ++ }
5550 ++
5551 ++ bfqq = bfq_entity_to_bfqq(entity);
5552 ++ BUG_ON(bfqq == NULL);
5553 ++
5554 ++ return bfqq;
5555 ++}
5556 ++
5557 ++/*
5558 ++ * Forced extraction of the given queue.
5559 ++ */
5560 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5561 ++ struct bfq_queue *bfqq)
5562 ++{
5563 ++ struct bfq_entity *entity;
5564 ++ struct bfq_sched_data *sd;
5565 ++
5566 ++ BUG_ON(bfqd->in_service_queue != NULL);
5567 ++
5568 ++ entity = &bfqq->entity;
5569 ++ /*
5570 ++ * Bubble up extraction/update from the leaf to the root.
5571 ++ */
5572 ++ for_each_entity(entity) {
5573 ++ sd = entity->sched_data;
5574 ++ bfq_update_budget(entity);
5575 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
5576 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5577 ++ sd->active_entity = entity;
5578 ++ sd->next_active = NULL;
5579 ++ entity->service = 0;
5580 ++ }
5581 ++
5582 ++ return;
5583 ++}
5584 ++
5585 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
5586 ++{
5587 ++ if (bfqd->in_service_bic != NULL) {
5588 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
5589 ++ bfqd->in_service_bic = NULL;
5590 ++ }
5591 ++
5592 ++ bfqd->in_service_queue = NULL;
5593 ++ del_timer(&bfqd->idle_slice_timer);
5594 ++}
5595 ++
5596 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5597 ++ int requeue)
5598 ++{
5599 ++ struct bfq_entity *entity = &bfqq->entity;
5600 ++
5601 ++ if (bfqq == bfqd->in_service_queue)
5602 ++ __bfq_bfqd_reset_in_service(bfqd);
5603 ++
5604 ++ bfq_deactivate_entity(entity, requeue);
5605 ++}
5606 ++
5607 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5608 ++{
5609 ++ struct bfq_entity *entity = &bfqq->entity;
5610 ++
5611 ++ bfq_activate_entity(entity);
5612 ++}
5613 ++
5614 ++/*
5615 ++ * Called when the bfqq no longer has requests pending, remove it from
5616 ++ * the service tree.
5617 ++ */
5618 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5619 ++ int requeue)
5620 ++{
5621 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
5622 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5623 ++
5624 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5625 ++
5626 ++ bfq_clear_bfqq_busy(bfqq);
5627 ++
5628 ++ BUG_ON(bfqd->busy_queues == 0);
5629 ++ bfqd->busy_queues--;
5630 ++ if (bfqq->raising_coeff > 1)
5631 ++ bfqd->raised_busy_queues--;
5632 ++
5633 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5634 ++}
5635 ++
5636 ++/*
5637 ++ * Called when an inactive queue receives a new request.
5638 ++ */
5639 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5640 ++{
5641 ++ BUG_ON(bfq_bfqq_busy(bfqq));
5642 ++ BUG_ON(bfqq == bfqd->in_service_queue);
5643 ++
5644 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5645 ++
5646 ++ bfq_activate_bfqq(bfqd, bfqq);
5647 ++
5648 ++ bfq_mark_bfqq_busy(bfqq);
5649 ++ bfqd->busy_queues++;
5650 ++ if (bfqq->raising_coeff > 1)
5651 ++ bfqd->raised_busy_queues++;
5652 ++}
5653 +diff --git a/block/bfq.h b/block/bfq.h
5654 +new file mode 100644
5655 +index 0000000..68b28e3
5656 +--- /dev/null
5657 ++++ b/block/bfq.h
5658 +@@ -0,0 +1,614 @@
5659 ++/*
5660 ++ * BFQ-v7 for 3.13.0: data structures and common functions prototypes.
5661 ++ *
5662 ++ * Based on ideas and code from CFQ:
5663 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5664 ++ *
5665 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5666 ++ * Paolo Valente <paolo.valente@×××××××.it>
5667 ++ *
5668 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5669 ++ */
5670 ++
5671 ++#ifndef _BFQ_H
5672 ++#define _BFQ_H
5673 ++
5674 ++#include <linux/blktrace_api.h>
5675 ++#include <linux/hrtimer.h>
5676 ++#include <linux/ioprio.h>
5677 ++#include <linux/rbtree.h>
5678 ++
5679 ++#define BFQ_IOPRIO_CLASSES 3
5680 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5681 ++
5682 ++#define BFQ_MIN_WEIGHT 1
5683 ++#define BFQ_MAX_WEIGHT 1000
5684 ++
5685 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
5686 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
5687 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5688 ++
5689 ++struct bfq_entity;
5690 ++
5691 ++/**
5692 ++ * struct bfq_service_tree - per ioprio_class service tree.
5693 ++ * @active: tree for active entities (i.e., those backlogged).
5694 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5695 ++ * @first_idle: idle entity with minimum F_i.
5696 ++ * @last_idle: idle entity with maximum F_i.
5697 ++ * @vtime: scheduler virtual time.
5698 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5699 ++ *
5700 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5701 ++ * ioprio_class has its own independent scheduler, and so its own
5702 ++ * bfq_service_tree. All the fields are protected by the queue lock
5703 ++ * of the containing bfqd.
5704 ++ */
5705 ++struct bfq_service_tree {
5706 ++ struct rb_root active;
5707 ++ struct rb_root idle;
5708 ++
5709 ++ struct bfq_entity *first_idle;
5710 ++ struct bfq_entity *last_idle;
5711 ++
5712 ++ u64 vtime;
5713 ++ unsigned long wsum;
5714 ++};
5715 ++
5716 ++/**
5717 ++ * struct bfq_sched_data - multi-class scheduler.
5718 ++ * @active_entity: entity under service.
5719 ++ * @next_active: head-of-the-line entity in the scheduler.
5720 ++ * @service_tree: array of service trees, one per ioprio_class.
5721 ++ *
5722 ++ * bfq_sched_data is the basic scheduler queue. It supports three
5723 ++ * ioprio_classes, and can be used either as a toplevel queue or as
5724 ++ * an intermediate queue on a hierarchical setup.
5725 ++ * @next_active points to the active entity of the sched_data service
5726 ++ * trees that will be scheduled next.
5727 ++ *
5728 ++ * The supported ioprio_classes are the same as in CFQ, in descending
5729 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5730 ++ * Requests from higher priority queues are served before all the
5731 ++ * requests from lower priority queues; among queues of the same
5732 ++ * class, requests are served according to B-WF2Q+.
5733 ++ * All the fields are protected by the queue lock of the containing bfqd.
5734 ++ */
5735 ++struct bfq_sched_data {
5736 ++ struct bfq_entity *active_entity;
5737 ++ struct bfq_entity *next_active;
5738 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5739 ++};
5740 ++
5741 ++/**
5742 ++ * struct bfq_entity - schedulable entity.
5743 ++ * @rb_node: service_tree member.
5744 ++ * @on_st: flag, true if the entity is on a tree (either the active or
5745 ++ * the idle one of its service_tree).
5746 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5747 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
5748 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5749 ++ * @min_start: minimum start time of the (active) subtree rooted at
5750 ++ * this entity; used for O(log N) lookups into active trees.
5751 ++ * @service: service received during the last round of service.
5752 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5753 ++ * @weight: weight of the queue
5754 ++ * @parent: parent entity, for hierarchical scheduling.
5755 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5756 ++ * associated scheduler queue, %NULL on leaf nodes.
5757 ++ * @sched_data: the scheduler queue this entity belongs to.
5758 ++ * @ioprio: the ioprio in use.
5759 ++ * @new_weight: when a weight change is requested, the new weight value.
5760 ++ * @orig_weight: original weight, used to implement weight boosting
5761 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5762 ++ * @ioprio_class: the ioprio_class in use.
5763 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
5764 ++ * ioprio_class value.
5765 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5766 ++ * ioprio_class change.
5767 ++ *
5768 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5769 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5770 ++ * entity belongs to the sched_data of the parent group in the cgroup
5771 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
5772 ++ * in @my_sched_data.
5773 ++ *
5774 ++ * Each entity stores independently its priority values; this would
5775 ++ * allow different weights on different devices, but this
5776 ++ * functionality is not exported to userspace for now. Priorities and
5777 ++ * weights are updated lazily, first storing the new values into the
5778 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
5779 ++ * there is a transition in the entity state that allows the priority
5780 ++ * update to take place the effective and the requested priority
5781 ++ * values are synchronized.
5782 ++ *
5783 ++ * Unless cgroups are used, the weight value is calculated from the
5784 ++ * ioprio to export the same interface as CFQ. When dealing with
5785 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
5786 ++ * time to consume their budget and have true sequential behavior, and
5787 ++ * when there are no external factors breaking anticipation) the
5788 ++ * relative weights at each level of the cgroups hierarchy should be
5789 ++ * guaranteed. All the fields are protected by the queue lock of the
5790 ++ * containing bfqd.
5791 ++ */
5792 ++struct bfq_entity {
5793 ++ struct rb_node rb_node;
5794 ++
5795 ++ int on_st;
5796 ++
5797 ++ u64 finish;
5798 ++ u64 start;
5799 ++
5800 ++ struct rb_root *tree;
5801 ++
5802 ++ u64 min_start;
5803 ++
5804 ++ unsigned long service, budget;
5805 ++ unsigned short weight, new_weight;
5806 ++ unsigned short orig_weight;
5807 ++
5808 ++ struct bfq_entity *parent;
5809 ++
5810 ++ struct bfq_sched_data *my_sched_data;
5811 ++ struct bfq_sched_data *sched_data;
5812 ++
5813 ++ unsigned short ioprio, new_ioprio;
5814 ++ unsigned short ioprio_class, new_ioprio_class;
5815 ++
5816 ++ int ioprio_changed;
5817 ++};
5818 ++
5819 ++struct bfq_group;
5820 ++
5821 ++/**
5822 ++ * struct bfq_queue - leaf schedulable entity.
5823 ++ * @ref: reference counter.
5824 ++ * @bfqd: parent bfq_data.
5825 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
5826 ++ * one or more other queues.
5827 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5828 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5829 ++ * @sort_list: sorted list of pending requests.
5830 ++ * @next_rq: if fifo isn't expired, next request to serve.
5831 ++ * @queued: nr of requests queued in @sort_list.
5832 ++ * @allocated: currently allocated requests.
5833 ++ * @meta_pending: pending metadata requests.
5834 ++ * @fifo: fifo list of requests in sort_list.
5835 ++ * @entity: entity representing this queue in the scheduler.
5836 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
5837 ++ * @budget_timeout: budget expiration (in jiffies).
5838 ++ * @dispatched: number of requests on the dispatch list or inside driver.
5839 ++ * @org_ioprio: saved ioprio during boosted periods.
5840 ++ * @flags: status flags.
5841 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5842 ++ * @seek_samples: number of seeks sampled
5843 ++ * @seek_total: sum of the distances of the seeks sampled
5844 ++ * @seek_mean: mean seek distance
5845 ++ * @last_request_pos: position of the last request enqueued
5846 ++ * @pid: pid of the process owning the queue, used for logging purposes.
5847 ++ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
5848 ++ * @raising_cur_max_time: current max raising time for this queue
5849 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
5850 ++ * idle to backlogged
5851 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
5852 ++ * since the last transition from idle to backlogged
5853 ++ *
5854 ++ * A bfq_queue is a leaf request queue; it can be associated with one io_context
5855 ++ * or more (if it is an async one). @cgroup holds a reference to the
5856 ++ * cgroup, to be sure that it does not disappear while a bfqq still
5857 ++ * references it (mostly to avoid races between request issuing and task
5858 ++ * migration followed by cgroup destruction).
5859 ++ * All the fields are protected by the queue lock of the containing bfqd.
5860 ++ */
5861 ++struct bfq_queue {
5862 ++ atomic_t ref;
5863 ++ struct bfq_data *bfqd;
5864 ++
5865 ++ /* fields for cooperating queues handling */
5866 ++ struct bfq_queue *new_bfqq;
5867 ++ struct rb_node pos_node;
5868 ++ struct rb_root *pos_root;
5869 ++
5870 ++ struct rb_root sort_list;
5871 ++ struct request *next_rq;
5872 ++ int queued[2];
5873 ++ int allocated[2];
5874 ++ int meta_pending;
5875 ++ struct list_head fifo;
5876 ++
5877 ++ struct bfq_entity entity;
5878 ++
5879 ++ unsigned long max_budget;
5880 ++ unsigned long budget_timeout;
5881 ++
5882 ++ int dispatched;
5883 ++
5884 ++ unsigned short org_ioprio;
5885 ++
5886 ++ unsigned int flags;
5887 ++
5888 ++ struct list_head bfqq_list;
5889 ++
5890 ++ unsigned int seek_samples;
5891 ++ u64 seek_total;
5892 ++ sector_t seek_mean;
5893 ++ sector_t last_request_pos;
5894 ++
5895 ++ pid_t pid;
5896 ++
5897 ++ /* weight-raising fields */
5898 ++ unsigned int raising_cur_max_time;
5899 ++ unsigned long soft_rt_next_start;
5900 ++ u64 last_rais_start_finish;
5901 ++ unsigned int raising_coeff;
5902 ++ u64 last_idle_bklogged;
5903 ++ unsigned long service_from_backlogged;
5904 ++};
5905 ++
5906 ++/**
5907 ++ * struct bfq_ttime - per process thinktime stats.
5908 ++ * @ttime_total: total process thinktime
5909 ++ * @ttime_samples: number of thinktime samples
5910 ++ * @ttime_mean: average process thinktime
5911 ++ */
5912 ++struct bfq_ttime {
5913 ++ unsigned long last_end_request;
5914 ++
5915 ++ unsigned long ttime_total;
5916 ++ unsigned long ttime_samples;
5917 ++ unsigned long ttime_mean;
5918 ++};
5919 ++
5920 ++/**
5921 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
5922 ++ * @icq: associated io_cq structure
5923 ++ * @bfqq: array of two process queues, the sync and the async
5924 ++ * @ttime: associated @bfq_ttime struct
5925 ++ */
5926 ++struct bfq_io_cq {
5927 ++ struct io_cq icq; /* must be the first member */
5928 ++ struct bfq_queue *bfqq[2];
5929 ++ struct bfq_ttime ttime;
5930 ++ int ioprio;
5931 ++};
5932 ++
5933 ++/**
5934 ++ * struct bfq_data - per device data structure.
5935 ++ * @queue: request queue for the managed device.
5936 ++ * @root_group: root bfq_group for the device.
5937 ++ * @rq_pos_tree: rbtree sorted by next_request position,
5938 ++ * used when determining if two or more queues
5939 ++ * have interleaving requests (see bfq_close_cooperator).
5940 ++ * @busy_queues: number of bfq_queues containing requests (including the
5941 ++ * queue under service, even if it is idling).
5942 ++ * @raised_busy_queues: number of weight-raised busy bfq_queues.
5943 ++ * @queued: number of queued requests.
5944 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
5945 ++ * @sync_flight: number of sync requests in the driver.
5946 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5947 ++ * completed requests.
5948 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5949 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5950 ++ * @budgets_assigned: number of budgets assigned.
5951 ++ * @idle_slice_timer: timer set when idling for the next sequential request
5952 ++ * from the queue under service.
5953 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
5954 ++ * @in_service_queue: bfq_queue under service.
5955 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
5956 ++ * @last_position: on-disk position of the last served request.
5957 ++ * @last_budget_start: beginning of the last budget.
5958 ++ * @last_idling_start: beginning of the last idle slice.
5959 ++ * @peak_rate: peak transfer rate observed for a budget.
5960 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5961 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5962 ++ * @group_list: list of all the bfq_groups active on the device.
5963 ++ * @active_list: list of all the bfq_queues active on the device.
5964 ++ * @idle_list: list of all the bfq_queues idle on the device.
5965 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
5966 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5967 ++ * requests are served in fifo order.
5968 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5969 ++ * @bfq_back_max: maximum allowed backward seek.
5970 ++ * @bfq_slice_idle: maximum idling time.
5971 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5972 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5973 ++ * async queues.
5974 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used
5975 ++ * to prevent seeky queues from imposing long latencies on
5976 ++ * well-behaved ones (this also implies that seeky queues cannot
5977 ++ * receive guarantees in the service domain; after a timeout
5978 ++ * they are charged for the whole allocated budget, to try
5979 ++ * to preserve a behavior reasonably fair among them, but
5980 ++ * without service-domain guarantees).
5981 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5982 ++ * queue is multiplied
5983 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5984 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5985 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5986 ++ * may be reactivated for a queue (in jiffies)
5987 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5988 ++ * after which weight-raising may be
5989 ++ * reactivated for an already busy queue
5990 ++ * (in jiffies)
5991 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5992 ++ * in sectors per second
5993 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
5994 ++ * duration of the weight raising automatically
5995 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5996 ++ *
5997 ++ * All the fields are protected by the @queue lock.
5998 ++ */
5999 ++struct bfq_data {
6000 ++ struct request_queue *queue;
6001 ++
6002 ++ struct bfq_group *root_group;
6003 ++
6004 ++ struct rb_root rq_pos_tree;
6005 ++
6006 ++ int busy_queues;
6007 ++ int raised_busy_queues;
6008 ++ int queued;
6009 ++ int rq_in_driver;
6010 ++ int sync_flight;
6011 ++
6012 ++ int max_rq_in_driver;
6013 ++ int hw_tag_samples;
6014 ++ int hw_tag;
6015 ++
6016 ++ int budgets_assigned;
6017 ++
6018 ++ struct timer_list idle_slice_timer;
6019 ++ struct work_struct unplug_work;
6020 ++
6021 ++ struct bfq_queue *in_service_queue;
6022 ++ struct bfq_io_cq *in_service_bic;
6023 ++
6024 ++ sector_t last_position;
6025 ++
6026 ++ ktime_t last_budget_start;
6027 ++ ktime_t last_idling_start;
6028 ++ int peak_rate_samples;
6029 ++ u64 peak_rate;
6030 ++ unsigned long bfq_max_budget;
6031 ++
6032 ++ struct hlist_head group_list;
6033 ++ struct list_head active_list;
6034 ++ struct list_head idle_list;
6035 ++
6036 ++ unsigned int bfq_quantum;
6037 ++ unsigned int bfq_fifo_expire[2];
6038 ++ unsigned int bfq_back_penalty;
6039 ++ unsigned int bfq_back_max;
6040 ++ unsigned int bfq_slice_idle;
6041 ++ u64 bfq_class_idle_last_service;
6042 ++
6043 ++ unsigned int bfq_user_max_budget;
6044 ++ unsigned int bfq_max_budget_async_rq;
6045 ++ unsigned int bfq_timeout[2];
6046 ++
6047 ++ bool low_latency;
6048 ++
6049 ++ /* parameters of the low_latency heuristics */
6050 ++ unsigned int bfq_raising_coeff;
6051 ++ unsigned int bfq_raising_max_time;
6052 ++ unsigned int bfq_raising_rt_max_time;
6053 ++ unsigned int bfq_raising_min_idle_time;
6054 ++ unsigned long bfq_raising_min_inter_arr_async;
6055 ++ unsigned int bfq_raising_max_softrt_rate;
6056 ++ u64 RT_prod;
6057 ++
6058 ++ struct bfq_queue oom_bfqq;
6059 ++};
6060 ++
6061 ++enum bfqq_state_flags {
6062 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
6063 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
6064 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
6065 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
6066 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
6067 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
6068 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
6069 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
6070 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6071 ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
6072 ++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
6073 ++};
6074 ++
6075 ++#define BFQ_BFQQ_FNS(name) \
6076 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
6077 ++{ \
6078 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
6079 ++} \
6080 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
6081 ++{ \
6082 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
6083 ++} \
6084 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
6085 ++{ \
6086 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
6087 ++}
6088 ++
6089 ++BFQ_BFQQ_FNS(busy);
6090 ++BFQ_BFQQ_FNS(wait_request);
6091 ++BFQ_BFQQ_FNS(must_alloc);
6092 ++BFQ_BFQQ_FNS(fifo_expire);
6093 ++BFQ_BFQQ_FNS(idle_window);
6094 ++BFQ_BFQQ_FNS(prio_changed);
6095 ++BFQ_BFQQ_FNS(sync);
6096 ++BFQ_BFQQ_FNS(budget_new);
6097 ++BFQ_BFQQ_FNS(coop);
6098 ++BFQ_BFQQ_FNS(split_coop);
6099 ++BFQ_BFQQ_FNS(softrt_update);
6100 ++#undef BFQ_BFQQ_FNS
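/*
 * For illustration, BFQ_BFQQ_FNS(busy) above expands roughly to:
 *
 *	static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
 *	{
 *		bfqq->flags |= (1 << BFQ_BFQQ_FLAG_busy);
 *	}
 *
 * plus the matching bfq_clear_bfqq_busy() and bfq_bfqq_busy() helpers,
 * which the scheduler uses to set, clear and test the per-queue state
 * bits declared in enum bfqq_state_flags.
 */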
6101 ++
6102 ++/* Logging facilities. */
6103 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
6104 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
6105 ++
6106 ++#define bfq_log(bfqd, fmt, args...) \
6107 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
6108 ++
6109 ++/* Expiration reasons. */
6110 ++enum bfqq_expiration {
6111 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
6112 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
6113 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
6114 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
6115 ++};
6116 ++
6117 ++#ifdef CONFIG_CGROUP_BFQIO
6118 ++/**
6119 ++ * struct bfq_group - per (device, cgroup) data structure.
6120 ++ * @entity: schedulable entity to insert into the parent group sched_data.
6121 ++ * @sched_data: own sched_data, to contain child entities (they may be
6122 ++ * both bfq_queues and bfq_groups).
6123 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
6124 ++ * list of the containing cgroup's bfqio_cgroup.
6125 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
6126 ++ * of the groups active on the same device; used for cleanup.
6127 ++ * @bfqd: the bfq_data for the device this group acts upon.
6128 ++ * @async_bfqq: array of async queues for all the tasks belonging to
6129 ++ * the group, one queue per ioprio value per ioprio_class,
6130 ++ * except for the idle class that has only one queue.
6131 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
6132 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
6133 ++ * to avoid too many special cases during group creation/migration.
6134 ++ *
6135 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
6136 ++ * there is a set of bfq_groups, each one collecting the lower-level
6137 ++ * entities belonging to the group that are acting on the same device.
6138 ++ *
6139 ++ * Locking works as follows:
6140 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
6141 ++ * via RCU from its readers.
6142 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
6143 ++ * from the readers.
6144 ++ * o All the other fields are protected by the @bfqd queue lock.
6145 ++ */
6146 ++struct bfq_group {
6147 ++ struct bfq_entity entity;
6148 ++ struct bfq_sched_data sched_data;
6149 ++
6150 ++ struct hlist_node group_node;
6151 ++ struct hlist_node bfqd_node;
6152 ++
6153 ++ void *bfqd;
6154 ++
6155 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6156 ++ struct bfq_queue *async_idle_bfqq;
6157 ++
6158 ++ struct bfq_entity *my_entity;
6159 ++};
6160 ++
6161 ++/**
6162 ++ * struct bfqio_cgroup - bfq cgroup data structure.
6163 ++ * @css: subsystem state for bfq in the containing cgroup.
6164 ++ * @online: flag marked when the subsystem is inserted.
6165 ++ * @weight: cgroup weight.
6166 ++ * @ioprio: cgroup ioprio.
6167 ++ * @ioprio_class: cgroup ioprio_class.
6168 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
6169 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
6170 ++ *
6171 ++ * @group_data is accessed using RCU, with @lock protecting the updates,
6172 ++ * @ioprio and @ioprio_class are protected by @lock.
6173 ++ */
6174 ++struct bfqio_cgroup {
6175 ++ struct cgroup_subsys_state css;
6176 ++ bool online;
6177 ++
6178 ++ unsigned short weight, ioprio, ioprio_class;
6179 ++
6180 ++ spinlock_t lock;
6181 ++ struct hlist_head group_data;
6182 ++};
6183 ++#else
6184 ++struct bfq_group {
6185 ++ struct bfq_sched_data sched_data;
6186 ++
6187 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6188 ++ struct bfq_queue *async_idle_bfqq;
6189 ++};
6190 ++#endif
6191 ++
6192 ++static inline struct bfq_service_tree *
6193 ++bfq_entity_service_tree(struct bfq_entity *entity)
6194 ++{
6195 ++ struct bfq_sched_data *sched_data = entity->sched_data;
6196 ++ unsigned int idx = entity->ioprio_class - 1;
6197 ++
6198 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
6199 ++ BUG_ON(sched_data == NULL);
6200 ++
6201 ++ return sched_data->service_tree + idx;
6202 ++}
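/*
 * Index sketch (with the standard Linux ioprio classes, where
 * IOPRIO_CLASS_RT, IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE are 1, 2 and 3):
 * RT entities map to service_tree[0], BE entities to service_tree[1] and
 * IDLE entities to service_tree[2], matching BFQ_IOPRIO_CLASSES == 3.
 */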
6203 ++
6204 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
6205 ++ int is_sync)
6206 ++{
6207 ++ return bic->bfqq[!!is_sync];
6208 ++}
6209 ++
6210 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
6211 ++ struct bfq_queue *bfqq, int is_sync)
6212 ++{
6213 ++ bic->bfqq[!!is_sync] = bfqq;
6214 ++}
6215 ++
6216 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
6217 ++{
6218 ++ return bic->icq.q->elevator->elevator_data;
6219 ++}
6220 ++
6221 ++/**
6222 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
6223 ++ * @ptr: a pointer to a bfqd.
6224 ++ * @flags: storage for the flags to be saved.
6225 ++ *
6226 ++ * This function allows bfqg->bfqd to be protected by the
6227 ++ * queue lock of the bfqd it references; the pointer is dereferenced
6228 ++ * under RCU, so the storage for bfqd is assured to be safe as long
6229 ++ * as the RCU read side critical section does not end. After the
6230 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
6231 ++ * sure that no other writer accessed it. If we raced with a writer,
6232 ++ * the function returns NULL, with the queue unlocked, otherwise it
6233 ++ * returns the dereferenced pointer, with the queue locked.
6234 ++ */
6235 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
6236 ++ unsigned long *flags)
6237 ++{
6238 ++ struct bfq_data *bfqd;
6239 ++
6240 ++ rcu_read_lock();
6241 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
6242 ++
6243 ++ if (bfqd != NULL) {
6244 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
6245 ++ if (*ptr == bfqd)
6246 ++ goto out;
6247 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6248 ++ }
6249 ++
6250 ++ bfqd = NULL;
6251 ++out:
6252 ++ rcu_read_unlock();
6253 ++ return bfqd;
6254 ++}
6255 ++
6256 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6257 ++ unsigned long *flags)
6258 ++{
6259 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6260 ++}
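/*
 * Typical usage sketch (hypothetical caller, for illustration): the two
 * helpers above are meant to bracket any access to fields protected by
 * the queue lock, e.g.:
 *
 *	unsigned long flags;
 *	struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
 *
 *	if (bfqd != NULL) {
 *		... access bfqd under bfqd->queue->queue_lock ...
 *		bfq_put_bfqd_unlock(bfqd, &flags);
 *	}
 *
 * A %NULL return means the lookup raced with a writer, and the caller
 * must simply back off without touching bfqd.
 */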
6261 ++
6262 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6263 ++static void bfq_put_queue(struct bfq_queue *bfqq);
6264 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6265 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6266 ++ struct bfq_group *bfqg, int is_sync,
6267 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
6268 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6269 ++ struct bfq_group *bfqg);
6270 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6271 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6272 ++#endif
6273 +--
6274 +1.8.5.2
6275 +
6276
6277 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
6278 ===================================================================
6279 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2014-01-29 14:41:45 UTC (rev 2660)
6280 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2014-01-30 16:49:47 UTC (rev 2661)
6281 @@ -1,5773 +0,0 @@
6282 -From 009b78bafe1763f71e6bdbb4f536b564a73b7db5 Mon Sep 17 00:00:00 2001
6283 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
6284 -Date: Thu, 9 May 2013 19:10:02 +0200
6285 -Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.11
6286 -
6287 -Add the BFQ-v6r2 I/O scheduler to 3.11.
6288 -The general structure is borrowed from CFQ, as is much of the code. A (bfq_)queue
6289 -is associated to each task doing I/O on a device, and each time a
6290 -scheduling decision has to be made a queue is selected and served until
6291 -it expires.
6292 -
6293 - - Slices are given in the service domain: tasks are assigned
6294 - budgets, measured in number of sectors. Once got the disk, a task
6295 - must however consume its assigned budget within a configurable
6296 - maximum time (by default, the maximum possible value of the
6297 - budgets is automatically computed to comply with this timeout).
6298 - This allows the desired latency vs "throughput boosting" tradeoff
6299 - to be set.
6300 -
6301 - - Budgets are scheduled according to a variant of WF2Q+, implemented
6302 - using an augmented rb-tree to take eligibility into account while
6303 - preserving an O(log N) overall complexity.
6304 -
6305 - - A low-latency tunable is provided; if enabled, both interactive
6306 - and soft real-time applications are guaranteed very low latency.
6307 -
6308 - - Latency guarantees are preserved also in presence of NCQ.
6309 -
6310 - - Also with flash-based devices, a high throughput is achieved while
6311 - still preserving latency guarantees.
6312 -
6313 - - Useful features borrowed from CFQ: cooperating-queues merging (with
6314 - some additional optimizations with respect to the original CFQ version),
6315 - static fallback queue for OOM.
6316 -
6317 - - BFQ supports full hierarchical scheduling, exporting a cgroups
6318 - interface. Each node has a full scheduler, so each group can
6319 - be assigned its own ioprio (mapped to a weight, see next point)
6320 - and an ioprio_class.
6321 -
6322 - - If the cgroups interface is used, weights can be explicitly
6323 - assigned, otherwise ioprio values are mapped to weights using the
6324 - relation weight = IOPRIO_BE_NR - ioprio (illustrated below).
6325 -
6326 - - ioprio classes are served in strict priority order, i.e., lower
6327 - priority queues are not served as long as there are higher
6328 - priority queues. Among queues in the same class the bandwidth is
6329 - distributed in proportion to the weight of each queue. A very
6330 - thin extra bandwidth is however guaranteed to the Idle class, to
6331 - prevent it from starving.
6332 -
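   As a rough illustration of the ioprio-to-weight mapping above (a minimal
   userspace sketch, assuming IOPRIO_BE_NR == 8 as in the mainline headers;
   not taken from the patch):

       #include <stdio.h>

       #define IOPRIO_BE_NR 8  /* assumption: mirrors the kernel definition */

       /* weight = IOPRIO_BE_NR - ioprio, as stated in the list above */
       static int ioprio_to_weight(int ioprio)
       {
               return IOPRIO_BE_NR - ioprio;
       }

       int main(void)
       {
               int prio;

               for (prio = 0; prio < IOPRIO_BE_NR; prio++)
                       printf("ioprio %d -> weight %d\n",
                              prio, ioprio_to_weight(prio));
               return 0;
       }
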
6333 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
6334 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6335 ----
6336 - block/bfq-cgroup.c | 881 +++++++++++++++
6337 - block/bfq-ioc.c | 36 +
6338 - block/bfq-iosched.c | 3082 +++++++++++++++++++++++++++++++++++++++++++++++++++
6339 - block/bfq-sched.c | 1072 ++++++++++++++++++
6340 - block/bfq.h | 603 ++++++++++
6341 - 5 files changed, 5674 insertions(+)
6342 - create mode 100644 block/bfq-cgroup.c
6343 - create mode 100644 block/bfq-ioc.c
6344 - create mode 100644 block/bfq-iosched.c
6345 - create mode 100644 block/bfq-sched.c
6346 - create mode 100644 block/bfq.h
6347 -
6348 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
6349 -new file mode 100644
6350 -index 0000000..bb9b851
6351 ---- /dev/null
6352 -+++ b/block/bfq-cgroup.c
6353 -@@ -0,0 +1,881 @@
6354 -+/*
6355 -+ * BFQ: CGROUPS support.
6356 -+ *
6357 -+ * Based on ideas and code from CFQ:
6358 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6359 -+ *
6360 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6361 -+ * Paolo Valente <paolo.valente@×××××××.it>
6362 -+ *
6363 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6364 -+ *
6365 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
6366 -+ */
6367 -+
6368 -+#ifdef CONFIG_CGROUP_BFQIO
6369 -+
6370 -+static DEFINE_MUTEX(bfqio_mutex);
6371 -+
6372 -+static bool bfqio_is_removed(struct cgroup *cgroup)
6373 -+{
6374 -+ return test_bit(CGRP_DEAD, &cgroup->flags);
6375 -+}
6376 -+
6377 -+static struct bfqio_cgroup bfqio_root_cgroup = {
6378 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
6379 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
6380 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
6381 -+};
6382 -+
6383 -+static inline void bfq_init_entity(struct bfq_entity *entity,
6384 -+ struct bfq_group *bfqg)
6385 -+{
6386 -+ entity->weight = entity->new_weight;
6387 -+ entity->orig_weight = entity->new_weight;
6388 -+ entity->ioprio = entity->new_ioprio;
6389 -+ entity->ioprio_class = entity->new_ioprio_class;
6390 -+ entity->parent = bfqg->my_entity;
6391 -+ entity->sched_data = &bfqg->sched_data;
6392 -+}
6393 -+
6394 -+static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
6395 -+{
6396 -+ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
6397 -+ struct bfqio_cgroup, css);
6398 -+}
6399 -+
6400 -+/*
6401 -+ * Search the bfq_group for bfqd in the hash table (for now only a list)
6402 -+ * of bgrp. Must be called under rcu_read_lock().
6403 -+ */
6404 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
6405 -+ struct bfq_data *bfqd)
6406 -+{
6407 -+ struct bfq_group *bfqg;
6408 -+ void *key;
6409 -+
6410 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
6411 -+ key = rcu_dereference(bfqg->bfqd);
6412 -+ if (key == bfqd)
6413 -+ return bfqg;
6414 -+ }
6415 -+
6416 -+ return NULL;
6417 -+}
6418 -+
6419 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
6420 -+ struct bfq_group *bfqg)
6421 -+{
6422 -+ struct bfq_entity *entity = &bfqg->entity;
6423 -+
6424 -+ /*
6425 -+ * If the weight of the entity has never been set via the sysfs
6426 -+ * interface, then bgrp->weight == 0. In this case we initialize
6427 -+ * the weight from the current ioprio value. Otherwise, the group
6428 -+ * weight, if set, has priority over the ioprio value.
6429 -+ */
6430 -+ if (bgrp->weight == 0) {
6431 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
6432 -+ entity->new_ioprio = bgrp->ioprio;
6433 -+ } else {
6434 -+ entity->new_weight = bgrp->weight;
6435 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
6436 -+ }
6437 -+ entity->orig_weight = entity->weight = entity->new_weight;
6438 -+ entity->ioprio = entity->new_ioprio;
6439 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
6440 -+ entity->my_sched_data = &bfqg->sched_data;
6441 -+}
6442 -+
6443 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
6444 -+ struct bfq_group *parent)
6445 -+{
6446 -+ struct bfq_entity *entity;
6447 -+
6448 -+ BUG_ON(parent == NULL);
6449 -+ BUG_ON(bfqg == NULL);
6450 -+
6451 -+ entity = &bfqg->entity;
6452 -+ entity->parent = parent->my_entity;
6453 -+ entity->sched_data = &parent->sched_data;
6454 -+}
6455 -+
6456 -+/**
6457 -+ * bfq_group_chain_alloc - allocate a chain of groups.
6458 -+ * @bfqd: queue descriptor.
6459 -+ * @cgroup: the leaf cgroup this chain starts from.
6460 -+ *
6461 -+ * Allocate a chain of groups starting from the one belonging to
6462 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
6463 -+ * to the root has already an allocated group on @bfqd.
6464 -+ */
6465 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
6466 -+ struct cgroup *cgroup)
6467 -+{
6468 -+ struct bfqio_cgroup *bgrp;
6469 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
6470 -+
6471 -+ for (; cgroup != NULL; cgroup = cgroup->parent) {
6472 -+ bgrp = cgroup_to_bfqio(cgroup);
6473 -+
6474 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6475 -+ if (bfqg != NULL) {
6476 -+ /*
6477 -+ * All the cgroups in the path from there to the
6478 -+ * root must have a bfq_group for bfqd, so we don't
6479 -+ * need any more allocations.
6480 -+ */
6481 -+ break;
6482 -+ }
6483 -+
6484 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
6485 -+ if (bfqg == NULL)
6486 -+ goto cleanup;
6487 -+
6488 -+ bfq_group_init_entity(bgrp, bfqg);
6489 -+ bfqg->my_entity = &bfqg->entity;
6490 -+
6491 -+ if (leaf == NULL) {
6492 -+ leaf = bfqg;
6493 -+ prev = leaf;
6494 -+ } else {
6495 -+ bfq_group_set_parent(prev, bfqg);
6496 -+ /*
6497 -+ * Build a list of allocated nodes using the bfqd
6498 -+ * field, which is still unused and will be initialized
6499 -+ * only after the node is connected.
6500 -+ */
6501 -+ prev->bfqd = bfqg;
6502 -+ prev = bfqg;
6503 -+ }
6504 -+ }
6505 -+
6506 -+ return leaf;
6507 -+
6508 -+cleanup:
6509 -+ while (leaf != NULL) {
6510 -+ prev = leaf;
6511 -+ leaf = leaf->bfqd;
6512 -+ kfree(prev);
6513 -+ }
6514 -+
6515 -+ return NULL;
6516 -+}
6517 -+
6518 -+/**
6519 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
6520 -+ * @bfqd: the queue descriptor.
6521 -+ * @cgroup: the leaf cgroup to start from.
6522 -+ * @leaf: the leaf group (to be associated to @cgroup).
6523 -+ *
6524 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
6525 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
6526 -+ * hierarchy that already has a group associated to @bfqd, all the nodes
6527 -+ * in the path to the root cgroup have one too.
6528 -+ *
6529 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
6530 -+ * per device) while the bfqio_cgroup lock protects the list of groups
6531 -+ * belonging to the same cgroup.
6532 -+ */
6533 -+static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
6534 -+ struct bfq_group *leaf)
6535 -+{
6536 -+ struct bfqio_cgroup *bgrp;
6537 -+ struct bfq_group *bfqg, *next, *prev = NULL;
6538 -+ unsigned long flags;
6539 -+
6540 -+ assert_spin_locked(bfqd->queue->queue_lock);
6541 -+
6542 -+ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
6543 -+ bgrp = cgroup_to_bfqio(cgroup);
6544 -+ next = leaf->bfqd;
6545 -+
6546 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6547 -+ BUG_ON(bfqg != NULL);
6548 -+
6549 -+ spin_lock_irqsave(&bgrp->lock, flags);
6550 -+
6551 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
6552 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
6553 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
6554 -+
6555 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
6556 -+
6557 -+ prev = leaf;
6558 -+ leaf = next;
6559 -+ }
6560 -+
6561 -+ BUG_ON(cgroup == NULL && leaf != NULL);
6562 -+ if (cgroup != NULL && prev != NULL) {
6563 -+ bgrp = cgroup_to_bfqio(cgroup);
6564 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6565 -+ bfq_group_set_parent(prev, bfqg);
6566 -+ }
6567 -+}
6568 -+
6569 -+/**
6570 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
6571 -+ * @bfqd: queue descriptor.
6572 -+ * @cgroup: cgroup being searched for.
6573 -+ *
6574 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
6575 -+ * necessary. When a group is returned all the cgroups in the path
6576 -+ * to the root have a group associated to @bfqd.
6577 -+ *
6578 -+ * If the allocation fails, return the root group: this breaks guarantees
6579 -+ * but is a safe fallback. If this loss becomes a problem it can be
6580 -+ * mitigated using the equivalent weight (given by the product of the
6581 -+ * weights of the groups in the path from @group to the root) in the
6582 -+ * root scheduler.
6583 -+ *
6584 -+ * We allocate all the missing nodes in the path from the leaf cgroup
6585 -+ * to the root and we connect the nodes only after all the allocations
6586 -+ * have been successful.
6587 -+ */
6588 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
6589 -+ struct cgroup *cgroup)
6590 -+{
6591 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
6592 -+ struct bfq_group *bfqg;
6593 -+
6594 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6595 -+ if (bfqg != NULL)
6596 -+ return bfqg;
6597 -+
6598 -+ bfqg = bfq_group_chain_alloc(bfqd, cgroup);
6599 -+ if (bfqg != NULL)
6600 -+ bfq_group_chain_link(bfqd, cgroup, bfqg);
6601 -+ else
6602 -+ bfqg = bfqd->root_group;
6603 -+
6604 -+ return bfqg;
6605 -+}
6606 -+
6607 -+/**
6608 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
6609 -+ * @bfqd: queue descriptor.
6610 -+ * @bfqq: the queue to move.
6611 -+ * @entity: @bfqq's entity.
6612 -+ * @bfqg: the group to move to.
6613 -+ *
6614 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
6615 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
6616 -+ *
6617 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
6618 -+ * not disappear (for now this just means that we are called under
6619 -+ * rcu_read_lock()).
6620 -+ */
6621 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6622 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
6623 -+{
6624 -+ int busy, resume;
6625 -+
6626 -+ busy = bfq_bfqq_busy(bfqq);
6627 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
6628 -+
6629 -+ BUG_ON(resume && !entity->on_st);
6630 -+ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
6631 -+
6632 -+ if (busy) {
6633 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
6634 -+
6635 -+ if (!resume)
6636 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
6637 -+ else
6638 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
6639 -+ } else if (entity->on_st)
6640 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
6641 -+
6642 -+ /*
6643 -+ * Here we use a reference to bfqg. We don't need a refcounter
6644 -+ * as the cgroup reference will not be dropped, so that its
6645 -+ * destroy() callback will not be invoked.
6646 -+ */
6647 -+ entity->parent = bfqg->my_entity;
6648 -+ entity->sched_data = &bfqg->sched_data;
6649 -+
6650 -+ if (busy && resume)
6651 -+ bfq_activate_bfqq(bfqd, bfqq);
6652 -+
6653 -+ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver)
6654 -+ bfq_schedule_dispatch(bfqd);
6655 -+}
6656 -+
6657 -+/**
6658 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
6659 -+ * @bfqd: the queue descriptor.
6660 -+ * @bic: the bic to move.
6661 -+ * @cgroup: the cgroup to move to.
6662 -+ *
6663 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
6664 -+ * has to make sure that the reference to cgroup is valid across the call.
6665 -+ *
6666 -+ * NOTE: an alternative approach might have been to store the current
6667 -+ * cgroup in bfqq and get a reference to it, reducing the lookup
6668 -+ * time here, at the price of slightly more complex code.
6669 -+ */
6670 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
6671 -+ struct bfq_io_cq *bic,
6672 -+ struct cgroup *cgroup)
6673 -+{
6674 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
6675 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
6676 -+ struct bfq_entity *entity;
6677 -+ struct bfq_group *bfqg;
6678 -+ struct bfqio_cgroup *bgrp;
6679 -+
6680 -+ bgrp = cgroup_to_bfqio(cgroup);
6681 -+
6682 -+ bfqg = bfq_find_alloc_group(bfqd, cgroup);
6683 -+ if (async_bfqq != NULL) {
6684 -+ entity = &async_bfqq->entity;
6685 -+
6686 -+ if (entity->sched_data != &bfqg->sched_data) {
6687 -+ bic_set_bfqq(bic, NULL, 0);
6688 -+ bfq_log_bfqq(bfqd, async_bfqq,
6689 -+ "bic_change_group: %p %d",
6690 -+ async_bfqq, atomic_read(&async_bfqq->ref));
6691 -+ bfq_put_queue(async_bfqq);
6692 -+ }
6693 -+ }
6694 -+
6695 -+ if (sync_bfqq != NULL) {
6696 -+ entity = &sync_bfqq->entity;
6697 -+ if (entity->sched_data != &bfqg->sched_data)
6698 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
6699 -+ }
6700 -+
6701 -+ return bfqg;
6702 -+}
6703 -+
6704 -+/**
6705 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
6706 -+ * @bic: the bic being migrated.
6707 -+ * @cgroup: the destination cgroup.
6708 -+ *
6709 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
6710 -+ * moved into its new parent group.
6711 -+ */
6712 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
6713 -+ struct cgroup *cgroup)
6714 -+{
6715 -+ struct bfq_data *bfqd;
6716 -+ unsigned long uninitialized_var(flags);
6717 -+
6718 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
6719 -+ if (bfqd != NULL) {
6720 -+ __bfq_bic_change_cgroup(bfqd, bic, cgroup);
6721 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6722 -+ }
6723 -+}
6724 -+
6725 -+/**
6726 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
6727 -+ * @bic: the @bic to update.
6728 -+ *
6729 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
6730 -+ * We need this in addition to moving bics during the cgroup attach
6731 -+ * phase because the task owning @bic could be at its first disk
6732 -+ * access or we may end up in the root cgroup as the result of a
6733 -+ * memory allocation failure and here we try to move to the right
6734 -+ * group.
6735 -+ *
6736 -+ * Must be called under the queue lock. It is safe to use the returned
6737 -+ * value even after the rcu_read_unlock() as the migration/destruction
6738 -+ * paths act under the queue lock too. IOW it is impossible to race with
6739 -+ * group migration/destruction and end up with an invalid group as:
6740 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
6741 -+ * has started execution, as current holds a reference to it,
6742 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
6743 -+ * migrated to a different cgroup] its attach() callback will have
6744 -+ * taken care of removing all the references to the old cgroup data.
6745 -+ */
6746 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
6747 -+{
6748 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
6749 -+ struct bfq_group *bfqg;
6750 -+ struct cgroup *cgroup;
6751 -+
6752 -+ BUG_ON(bfqd == NULL);
6753 -+
6754 -+ rcu_read_lock();
6755 -+ cgroup = task_cgroup(current, bfqio_subsys_id);
6756 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
6757 -+ rcu_read_unlock();
6758 -+
6759 -+ return bfqg;
6760 -+}
6761 -+
6762 -+/**
6763 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
6764 -+ * @st: the service tree being flushed.
6765 -+ */
6766 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
6767 -+{
6768 -+ struct bfq_entity *entity = st->first_idle;
6769 -+
6770 -+ for (; entity != NULL; entity = st->first_idle)
6771 -+ __bfq_deactivate_entity(entity, 0);
6772 -+}
6773 -+
6774 -+/**
6775 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
6776 -+ * @bfqd: the device data structure with the root group.
6777 -+ * @entity: the entity to move.
6778 -+ */
6779 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
6780 -+ struct bfq_entity *entity)
6781 -+{
6782 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
6783 -+
6784 -+ BUG_ON(bfqq == NULL);
6785 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
6786 -+ return;
6787 -+}
6788 -+
6789 -+/**
6790 -+ * bfq_reparent_active_entities - move to the root group all active entities.
6791 -+ * @bfqd: the device data structure with the root group.
6792 -+ * @bfqg: the group to move from.
6793 -+ * @st: the service tree with the entities.
6794 -+ *
6795 -+ * Needs queue_lock to be taken and reference to be valid over the call.
6796 -+ */
6797 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
6798 -+ struct bfq_group *bfqg,
6799 -+ struct bfq_service_tree *st)
6800 -+{
6801 -+ struct rb_root *active = &st->active;
6802 -+ struct bfq_entity *entity = NULL;
6803 -+
6804 -+ if (!RB_EMPTY_ROOT(&st->active))
6805 -+ entity = bfq_entity_of(rb_first(active));
6806 -+
6807 -+ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active)))
6808 -+ bfq_reparent_leaf_entity(bfqd, entity);
6809 -+
6810 -+ if (bfqg->sched_data.active_entity != NULL)
6811 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
6812 -+
6813 -+ return;
6814 -+}
6815 -+
6816 -+/**
6817 -+ * bfq_destroy_group - destroy @bfqg.
6818 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
6819 -+ * @bfqg: the group being destroyed.
6820 -+ *
6821 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
6822 -+ */
6823 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
6824 -+{
6825 -+ struct bfq_data *bfqd;
6826 -+ struct bfq_service_tree *st;
6827 -+ struct bfq_entity *entity = bfqg->my_entity;
6828 -+ unsigned long uninitialized_var(flags);
6829 -+ int i;
6830 -+
6831 -+ hlist_del(&bfqg->group_node);
6832 -+
6833 -+ /*
6834 -+ * Empty all service_trees belonging to this group before deactivating
6835 -+ * the group itself.
6836 -+ */
6837 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
6838 -+ st = bfqg->sched_data.service_tree + i;
6839 -+
6840 -+ /*
6841 -+ * The idle tree may still contain bfq_queues belonging
6842 -+ * to exited tasks because they never migrated to a different
6843 -+ * cgroup from the one being destroyed now. No one else
6844 -+ * can access them so it's safe to act without any lock.
6845 -+ */
6846 -+ bfq_flush_idle_tree(st);
6847 -+
6848 -+ /*
6849 -+ * It may happen that some queues are still active
6850 -+ * (busy) upon group destruction (if the corresponding
6851 -+ * processes have been forced to terminate). We move
6852 -+ * all the leaf entities corresponding to these queues
6853 -+ * to the root_group.
6854 -+ * Also, it may happen that the group has an entity
6855 -+ * under service, which is disconnected from the active
6856 -+ * tree: it must be moved, too.
6857 -+ * There is no need to put the sync queues, as the
6858 -+ * scheduler has taken no reference.
6859 -+ */
6860 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6861 -+ if (bfqd != NULL) {
6862 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
6863 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6864 -+ }
6865 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
6866 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
6867 -+ }
6868 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
6869 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
6870 -+
6871 -+ /*
6872 -+ * We may race with device destruction, take extra care when
6873 -+ * dereferencing bfqg->bfqd.
6874 -+ */
6875 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6876 -+ if (bfqd != NULL) {
6877 -+ hlist_del(&bfqg->bfqd_node);
6878 -+ __bfq_deactivate_entity(entity, 0);
6879 -+ bfq_put_async_queues(bfqd, bfqg);
6880 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6881 -+ }
6882 -+ BUG_ON(entity->tree != NULL);
6883 -+
6884 -+ /*
6885 -+ * No need to defer the kfree() to the end of the RCU grace
6886 -+ * period: we are called from the destroy() callback of our
6887 -+ * cgroup, so we can be sure that no one is a) still using
6888 -+ * this cgroup or b) doing lookups in it.
6889 -+ */
6890 -+ kfree(bfqg);
6891 -+}
6892 -+
6893 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
6894 -+{
6895 -+ struct hlist_node *tmp;
6896 -+ struct bfq_group *bfqg;
6897 -+
6898 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
6899 -+ bfq_end_raising_async_queues(bfqd, bfqg);
6900 -+}
6901 -+
6902 -+/**
6903 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
6904 -+ * @bfqd: the device descriptor being exited.
6905 -+ *
6906 -+ * When the device exits we just make sure that no lookup can return
6907 -+ * the now unused group structures. They will be deallocated on cgroup
6908 -+ * destruction.
6909 -+ */
6910 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
6911 -+{
6912 -+ struct hlist_node *tmp;
6913 -+ struct bfq_group *bfqg;
6914 -+
6915 -+ bfq_log(bfqd, "disconnect_groups beginning");
6916 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
6917 -+ hlist_del(&bfqg->bfqd_node);
6918 -+
6919 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
6920 -+
6921 -+ /*
6922 -+ * Don't remove from the group hash, just set an
6923 -+ * invalid key. No lookups can race with the
6924 -+ * assignment as bfqd is being destroyed; this
6925 -+ * implies also that new elements cannot be added
6926 -+ * to the list.
6927 -+ */
6928 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
6929 -+
6930 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
6931 -+ bfqg);
6932 -+ bfq_put_async_queues(bfqd, bfqg);
6933 -+ }
6934 -+}
6935 -+
6936 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
6937 -+{
6938 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
6939 -+ struct bfq_group *bfqg = bfqd->root_group;
6940 -+
6941 -+ bfq_put_async_queues(bfqd, bfqg);
6942 -+
6943 -+ spin_lock_irq(&bgrp->lock);
6944 -+ hlist_del_rcu(&bfqg->group_node);
6945 -+ spin_unlock_irq(&bgrp->lock);
6946 -+
6947 -+ /*
6948 -+ * No need to synchronize_rcu() here: since the device is gone
6949 -+ * there cannot be any read-side access to its root_group.
6950 -+ */
6951 -+ kfree(bfqg);
6952 -+}
6953 -+
6954 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
6955 -+{
6956 -+ struct bfq_group *bfqg;
6957 -+ struct bfqio_cgroup *bgrp;
6958 -+ int i;
6959 -+
6960 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
6961 -+ if (bfqg == NULL)
6962 -+ return NULL;
6963 -+
6964 -+ bfqg->entity.parent = NULL;
6965 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6966 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6967 -+
6968 -+ bgrp = &bfqio_root_cgroup;
6969 -+ spin_lock_irq(&bgrp->lock);
6970 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
6971 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
6972 -+ spin_unlock_irq(&bgrp->lock);
6973 -+
6974 -+ return bfqg;
6975 -+}
6976 -+
6977 -+#define SHOW_FUNCTION(__VAR) \
6978 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
6979 -+ struct cftype *cftype) \
6980 -+{ \
6981 -+ struct bfqio_cgroup *bgrp; \
6982 -+ u64 ret = -ENODEV; \
6983 -+ \
6984 -+ mutex_lock(&bfqio_mutex); \
6985 -+ if (bfqio_is_removed(cgroup)) \
6986 -+ goto out_unlock; \
6987 -+ \
6988 -+ bgrp = cgroup_to_bfqio(cgroup); \
6989 -+ spin_lock_irq(&bgrp->lock); \
6990 -+ ret = bgrp->__VAR; \
6991 -+ spin_unlock_irq(&bgrp->lock); \
6992 -+ \
6993 -+out_unlock: \
6994 -+ mutex_unlock(&bfqio_mutex); \
6995 -+ return ret; \
6996 -+}
6997 -+
6998 -+SHOW_FUNCTION(weight);
6999 -+SHOW_FUNCTION(ioprio);
7000 -+SHOW_FUNCTION(ioprio_class);
7001 -+#undef SHOW_FUNCTION
7002 -+
7003 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
7004 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
7005 -+ struct cftype *cftype, \
7006 -+ u64 val) \
7007 -+{ \
7008 -+ struct bfqio_cgroup *bgrp; \
7009 -+ struct bfq_group *bfqg; \
7010 -+ int ret = -EINVAL; \
7011 -+ \
7012 -+ if (val < (__MIN) || val > (__MAX)) \
7013 -+ return ret; \
7014 -+ \
7015 -+ ret = -ENODEV; \
7016 -+ mutex_lock(&bfqio_mutex); \
7017 -+ if (bfqio_is_removed(cgroup)) \
7018 -+ goto out_unlock; \
7019 -+ ret = 0; \
7020 -+ \
7021 -+ bgrp = cgroup_to_bfqio(cgroup); \
7022 -+ \
7023 -+ spin_lock_irq(&bgrp->lock); \
7024 -+ bgrp->__VAR = (unsigned short)val; \
7025 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
7026 -+ /* \
7027 -+ * Setting the ioprio_changed flag of the entity \
7028 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
7029 -+ * the value of the weight to its ioprio mapping. \
7030 -+ * Set the flag only if necessary. \
7031 -+ */ \
7032 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
7033 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
7034 -+ smp_wmb(); \
7035 -+ bfqg->entity.ioprio_changed = 1; \
7036 -+ } \
7037 -+ } \
7038 -+ spin_unlock_irq(&bgrp->lock); \
7039 -+ \
7040 -+out_unlock: \
7041 -+ mutex_unlock(&bfqio_mutex); \
7042 -+ return ret; \
7043 -+}
7044 -+
7045 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
7046 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
7047 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
7048 -+#undef STORE_FUNCTION
7049 -+
7050 -+static struct cftype bfqio_files[] = {
7051 -+ {
7052 -+ .name = "weight",
7053 -+ .read_u64 = bfqio_cgroup_weight_read,
7054 -+ .write_u64 = bfqio_cgroup_weight_write,
7055 -+ },
7056 -+ {
7057 -+ .name = "ioprio",
7058 -+ .read_u64 = bfqio_cgroup_ioprio_read,
7059 -+ .write_u64 = bfqio_cgroup_ioprio_write,
7060 -+ },
7061 -+ {
7062 -+ .name = "ioprio_class",
7063 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
7064 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
7065 -+ },
7066 -+ { }, /* terminate */
7067 -+};
7068 -+
7069 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
7070 -+{
7071 -+ struct bfqio_cgroup *bgrp;
7072 -+
7073 -+ if (cgroup->parent != NULL) {
7074 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
7075 -+ if (bgrp == NULL)
7076 -+ return ERR_PTR(-ENOMEM);
7077 -+ } else
7078 -+ bgrp = &bfqio_root_cgroup;
7079 -+
7080 -+ spin_lock_init(&bgrp->lock);
7081 -+ INIT_HLIST_HEAD(&bgrp->group_data);
7082 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
7083 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
7084 -+
7085 -+ return &bgrp->css;
7086 -+}
7087 -+
7088 -+/*
7089 -+ * We cannot support shared io contexts, as we have no means to support
7090 -+ * two tasks with the same ioc in two different groups without major rework
7091 -+ * of the main bic/bfqq data structures. For now we allow a task to change
7092 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
7093 -+ * behavior is that a group containing a task that forked using CLONE_IO
7094 -+ * will not be destroyed until the tasks sharing the ioc die.
7095 -+ */
7096 -+static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
7097 -+{
7098 -+ struct task_struct *task;
7099 -+ struct io_context *ioc;
7100 -+ int ret = 0;
7101 -+
7102 -+ cgroup_taskset_for_each(task, cgroup, tset) {
7103 -+ /* task_lock() is needed to avoid races with exit_io_context() */
7104 -+ task_lock(task);
7105 -+ ioc = task->io_context;
7106 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
7107 -+ /*
7108 -+ * ioc == NULL means that the task is either too young or
7109 -+ * exiting: if it has still no ioc the ioc can't be shared,
7110 -+ * if the task is exiting the attach will fail anyway, no
7111 -+ * matter what we return here.
7112 -+ */
7113 -+ ret = -EINVAL;
7114 -+ task_unlock(task);
7115 -+ if (ret)
7116 -+ break;
7117 -+ }
7118 -+
7119 -+ return ret;
7120 -+}
7121 -+
7122 -+static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
7123 -+{
7124 -+ struct task_struct *task;
7125 -+ struct io_context *ioc;
7126 -+ struct io_cq *icq;
7127 -+
7128 -+ /*
7129 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
7130 -+ * new group has not yet been tested.
7131 -+ */
7132 -+ cgroup_taskset_for_each(task, cgroup, tset) {
7133 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
7134 -+ if (ioc) {
7135 -+ /*
7136 -+ * Handle cgroup change here.
7137 -+ */
7138 -+ rcu_read_lock();
7139 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
7140 -+ if (!strncmp(icq->q->elevator->type->elevator_name,
7141 -+ "bfq", ELV_NAME_MAX))
7142 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
7143 -+ cgroup);
7144 -+ rcu_read_unlock();
7145 -+ put_io_context(ioc);
7146 -+ }
7147 -+ }
7148 -+}
7149 -+
7150 -+static void bfqio_destroy(struct cgroup *cgroup)
7151 -+{
7152 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
7153 -+ struct hlist_node *tmp;
7154 -+ struct bfq_group *bfqg;
7155 -+
7156 -+ /*
7157 -+ * Since we are destroying the cgroup, there are no more tasks
7158 -+ * referencing it, and all the RCU grace periods that may have
7159 -+ * referenced it are ended (as the destruction of the parent
7160 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
7161 -+ * anything else and we don't need any synchronization.
7162 -+ */
7163 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
7164 -+ bfq_destroy_group(bgrp, bfqg);
7165 -+
7166 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
7167 -+
7168 -+ kfree(bgrp);
7169 -+}
7170 -+
7171 -+struct cgroup_subsys bfqio_subsys = {
7172 -+ .name = "bfqio",
7173 -+ .css_alloc = bfqio_create,
7174 -+ .can_attach = bfqio_can_attach,
7175 -+ .attach = bfqio_attach,
7176 -+ .css_free = bfqio_destroy,
7177 -+ .subsys_id = bfqio_subsys_id,
7178 -+ .base_cftypes = bfqio_files,
7179 -+};
7180 -+#else
7181 -+static inline void bfq_init_entity(struct bfq_entity *entity,
7182 -+ struct bfq_group *bfqg)
7183 -+{
7184 -+ entity->weight = entity->new_weight;
7185 -+ entity->orig_weight = entity->new_weight;
7186 -+ entity->ioprio = entity->new_ioprio;
7187 -+ entity->ioprio_class = entity->new_ioprio_class;
7188 -+ entity->sched_data = &bfqg->sched_data;
7189 -+}
7190 -+
7191 -+static inline struct bfq_group *
7192 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7193 -+{
7194 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
7195 -+ return bfqd->root_group;
7196 -+}
7197 -+
7198 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
7199 -+ struct bfq_queue *bfqq,
7200 -+ struct bfq_entity *entity,
7201 -+ struct bfq_group *bfqg)
7202 -+{
7203 -+}
7204 -+
7205 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
7206 -+{
7207 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
7208 -+}
7209 -+
7210 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
7211 -+{
7212 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
7213 -+}
7214 -+
7215 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
7216 -+{
7217 -+ kfree(bfqd->root_group);
7218 -+}
7219 -+
7220 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7221 -+{
7222 -+ struct bfq_group *bfqg;
7223 -+ int i;
7224 -+
7225 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7226 -+ if (bfqg == NULL)
7227 -+ return NULL;
7228 -+
7229 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7230 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7231 -+
7232 -+ return bfqg;
7233 -+}
7234 -+#endif
7235 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
7236 -new file mode 100644
7237 -index 0000000..326e3ec
7238 ---- /dev/null
7239 -+++ b/block/bfq-ioc.c
7240 -@@ -0,0 +1,36 @@
7241 -+/*
7242 -+ * BFQ: I/O context handling.
7243 -+ *
7244 -+ * Based on ideas and code from CFQ:
7245 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7246 -+ *
7247 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7248 -+ * Paolo Valente <paolo.valente@×××××××.it>
7249 -+ *
7250 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7251 -+ */
7252 -+
7253 -+/**
7254 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
7255 -+ * @icq: the iocontext queue.
7256 -+ */
7257 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
7258 -+{
7259 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
7260 -+ return container_of(icq, struct bfq_io_cq, icq);
7261 -+}
7262 -+
7263 -+/**
7264 -+ * bfq_bic_lookup - search @ioc for a bic associated to @bfqd.
7265 -+ * @bfqd: the lookup key.
7266 -+ * @ioc: the io_context of the process doing I/O.
7267 -+ *
7268 -+ * Queue lock must be held.
7269 -+ */
7270 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
7271 -+ struct io_context *ioc)
7272 -+{
7273 -+ if (ioc)
7274 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
7275 -+ return NULL;
7276 -+}
7277 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7278 -new file mode 100644
7279 -index 0000000..0ed2746
7280 ---- /dev/null
7281 -+++ b/block/bfq-iosched.c
7282 -@@ -0,0 +1,3082 @@
7283 -+/*
7284 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
7285 -+ *
7286 -+ * Based on ideas and code from CFQ:
7287 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7288 -+ *
7289 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7290 -+ * Paolo Valente <paolo.valente@×××××××.it>
7291 -+ *
7292 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7293 -+ *
7294 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7295 -+ *
7296 -+ * BFQ is a proportional share disk scheduling algorithm based on the
7297 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
7298 -+ * measured in number of sectors, to tasks instead of time slices.
7299 -+ * The disk is not granted to the active task for a given time slice,
7300 -+ * but until it has exhausted its assigned budget. This change from
7301 -+ * the time to the service domain allows BFQ to distribute the disk
7302 -+ * bandwidth among tasks as desired, without any distortion due to
7303 -+ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
7304 -+ * internal scheduler, called B-WF2Q+, to schedule tasks according to
7305 -+ * their budgets. Thanks to this accurate scheduler, BFQ can afford
7306 -+ * to assign high budgets to disk-bound non-seeky tasks (to boost the
7307 -+ * throughput), and yet guarantee low latencies to interactive and
7308 -+ * soft real-time applications.
7309 -+ *
7310 -+ * BFQ has been introduced in [1], where the interested reader can
7311 -+ * find an accurate description of the algorithm, the bandwidth
7312 -+ * distribution and latency guarantees it provides, plus formal proofs
7313 -+ * of all the properties. With respect to the algorithm presented in
7314 -+ * the paper, this implementation adds several little heuristics, and
7315 -+ * a hierarchical extension, based on H-WF2Q+.
7316 -+ *
7317 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
7318 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
7319 -+ * complexity derives from the one introduced with EEVDF in [3].
7320 -+ *
7321 -+ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
7322 -+ * with Deterministic Guarantees on Bandwidth Distribution,'',
7323 -+ * IEEE Transactions on Computers, May 2010.
7324 -+ *
7325 -+ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
7326 -+ *
7327 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
7328 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
7329 -+ * Oct 1997.
7330 -+ *
7331 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
7332 -+ *
7333 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
7334 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
7335 -+ * Resource Allocation,'' technical report.
7336 -+ *
7337 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
7338 -+ */
7339 -+#include <linux/module.h>
7340 -+#include <linux/slab.h>
7341 -+#include <linux/blkdev.h>
7342 -+#include <linux/cgroup.h>
7343 -+#include <linux/elevator.h>
7344 -+#include <linux/jiffies.h>
7345 -+#include <linux/rbtree.h>
7346 -+#include <linux/ioprio.h>
7347 -+#include "bfq.h"
7348 -+#include "blk.h"
7349 -+
7350 -+/* Max number of dispatches in one round of service. */
7351 -+static const int bfq_quantum = 4;
7352 -+
7353 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
7354 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
7355 -+
7356 -+/* Maximum backwards seek, in KiB. */
7357 -+static const int bfq_back_max = 16 * 1024;
7358 -+
7359 -+/* Penalty of a backwards seek, in number of sectors. */
7360 -+static const int bfq_back_penalty = 2;
7361 -+
7362 -+/* Idling period duration, in jiffies. */
7363 -+static int bfq_slice_idle = HZ / 125;
7364 -+
7365 -+/* Default maximum budget values, in sectors and number of requests. */
7366 -+static const int bfq_default_max_budget = 16 * 1024;
7367 -+static const int bfq_max_budget_async_rq = 4;
7368 -+
7369 -+/*
7370 -+ * Async to sync throughput distribution is controlled as follows:
7371 -+ * when an async request is served, the entity is charged the number
7372 -+ * of sectors of the request, multiplied by the factor below
7373 -+ */
7374 -+static const int bfq_async_charge_factor = 10;
7375 -+
7376 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
7377 -+static const int bfq_timeout_sync = HZ / 8;
7378 -+static int bfq_timeout_async = HZ / 25;
7379 -+
7380 -+struct kmem_cache *bfq_pool;
7381 -+
7382 -+/* Below this threshold (in ms), we consider thinktime immediate. */
7383 -+#define BFQ_MIN_TT 2
7384 -+
7385 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
7386 -+#define BFQ_HW_QUEUE_THRESHOLD 4
7387 -+#define BFQ_HW_QUEUE_SAMPLES 32
7388 -+
7389 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
7390 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
7391 -+
7392 -+/* Min samples used for peak rate estimation (for autotuning). */
7393 -+#define BFQ_PEAK_RATE_SAMPLES 32
7394 -+
7395 -+/* Shift used for peak rate fixed precision calculations. */
7396 -+#define BFQ_RATE_SHIFT 16
7397 -+
7398 -+/*
7399 -+ * The duration of the weight raising for interactive applications is
7400 -+ * computed automatically (as default behaviour), using the following
7401 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
7402 -+ * disk, and R and T are two reference parameters. In particular, R is
7403 -+ * the peak rate of a reference disk, and T is about the maximum time
7404 -+ * for starting popular large applications on that disk, under BFQ and
7405 -+ * while reading two files in parallel. Finally, BFQ uses two
7406 -+ * different pairs (R, T) depending on whether the disk is rotational
7407 -+ * or non-rotational.
7408 -+ */
7409 -+#define T_rot (msecs_to_jiffies(5500))
7410 -+#define T_nonrot (msecs_to_jiffies(2000))
7411 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
7412 -+#define R_rot 17415
7413 -+#define R_nonrot 34791
7414 -+
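   As a rough sketch of the duration = (R / r) * T rule above (standalone and
   illustrative only; the peak rates below are made-up values, and
   bfq_wrais_duration() later in this file computes the same quotient from
   RT_prod, presumably the precomputed R * T, divided by the measured
   peak_rate):

       #include <stdio.h>

       #define R_ROT    17415ULL  /* reference rate from above */
       #define T_ROT_MS 5500ULL   /* reference time from above, in ms */

       /* duration = (R / r) * T, evaluated as R * T / r to keep precision */
       static unsigned long long raising_duration_ms(unsigned long long peak_rate)
       {
               return R_ROT * T_ROT_MS / peak_rate;
       }

       int main(void)
       {
               /* a disk half as fast as the reference gets twice the duration */
               printf("r = R/2: %llu ms\n", raising_duration_ms(R_ROT / 2));
               printf("r = R:   %llu ms\n", raising_duration_ms(R_ROT));
               return 0;
       }
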
7415 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
7416 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
7417 -+
7418 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
7419 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
7420 -+
7421 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
7422 -+
7423 -+#include "bfq-ioc.c"
7424 -+#include "bfq-sched.c"
7425 -+#include "bfq-cgroup.c"
7426 -+
7427 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
7428 -+ IOPRIO_CLASS_IDLE)
7429 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
7430 -+ IOPRIO_CLASS_RT)
7431 -+
7432 -+#define bfq_sample_valid(samples) ((samples) > 80)
7433 -+
7434 -+/*
7435 -+ * We regard a request as SYNC if it is either a read or has the SYNC bit
7436 -+ * set (in which case it could also be a direct WRITE).
7437 -+ */
7438 -+static inline int bfq_bio_sync(struct bio *bio)
7439 -+{
7440 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
7441 -+ return 1;
7442 -+
7443 -+ return 0;
7444 -+}
7445 -+
7446 -+/*
7447 -+ * Scheduler run of queue, if there are requests pending and no one in the
7448 -+ * driver that will restart queueing.
7449 -+ */
7450 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
7451 -+{
7452 -+ if (bfqd->queued != 0) {
7453 -+ bfq_log(bfqd, "schedule dispatch");
7454 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
7455 -+ }
7456 -+}
7457 -+
7458 -+/*
7459 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
7460 -+ * We choose the request that is closest to the head right now. Distance
7461 -+ * behind the head is penalized and only allowed to a certain extent.
7462 -+ */
7463 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
7464 -+ struct request *rq1,
7465 -+ struct request *rq2,
7466 -+ sector_t last)
7467 -+{
7468 -+ sector_t s1, s2, d1 = 0, d2 = 0;
7469 -+ unsigned long back_max;
7470 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
7471 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
7472 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
7473 -+
7474 -+ if (rq1 == NULL || rq1 == rq2)
7475 -+ return rq2;
7476 -+ if (rq2 == NULL)
7477 -+ return rq1;
7478 -+
7479 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
7480 -+ return rq1;
7481 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
7482 -+ return rq2;
7483 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
7484 -+ return rq1;
7485 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
7486 -+ return rq2;
7487 -+
7488 -+ s1 = blk_rq_pos(rq1);
7489 -+ s2 = blk_rq_pos(rq2);
7490 -+
7491 -+ /*
7492 -+ * By definition, 1KiB is 2 sectors.
7493 -+ */
7494 -+ back_max = bfqd->bfq_back_max * 2;
7495 -+
7496 -+ /*
7497 -+ * Strict one way elevator _except_ in the case where we allow
7498 -+ * short backward seeks which are biased as twice the cost of a
7499 -+ * similar forward seek.
7500 -+ */
7501 -+ if (s1 >= last)
7502 -+ d1 = s1 - last;
7503 -+ else if (s1 + back_max >= last)
7504 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
7505 -+ else
7506 -+ wrap |= BFQ_RQ1_WRAP;
7507 -+
7508 -+ if (s2 >= last)
7509 -+ d2 = s2 - last;
7510 -+ else if (s2 + back_max >= last)
7511 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
7512 -+ else
7513 -+ wrap |= BFQ_RQ2_WRAP;
7514 -+
7515 -+ /* Found required data */
7516 -+
7517 -+ /*
7518 -+ * By doing switch() on the bit mask "wrap" we avoid having to
7519 -+ * check two variables for all permutations: --> faster!
7520 -+ */
7521 -+ switch (wrap) {
7522 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
7523 -+ if (d1 < d2)
7524 -+ return rq1;
7525 -+ else if (d2 < d1)
7526 -+ return rq2;
7527 -+ else {
7528 -+ if (s1 >= s2)
7529 -+ return rq1;
7530 -+ else
7531 -+ return rq2;
7532 -+ }
7533 -+
7534 -+ case BFQ_RQ2_WRAP:
7535 -+ return rq1;
7536 -+ case BFQ_RQ1_WRAP:
7537 -+ return rq2;
7538 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
7539 -+ default:
7540 -+ /*
7541 -+ * Since both rqs are wrapped,
7542 -+ * start with the one that's further behind head
7543 -+ * (--> only *one* back seek required),
7544 -+ * since back seek takes more time than forward.
7545 -+ */
7546 -+ if (s1 <= s2)
7547 -+ return rq1;
7548 -+ else
7549 -+ return rq2;
7550 -+ }
7551 -+}
7552 -+
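   A rough, standalone illustration of the backward-seek penalty applied by
   bfq_choose_req() above, using the default bfq_back_penalty of 2 (the
   positions are made-up numbers, both within the default bfq_back_max):

       #include <stdio.h>

       int main(void)
       {
               unsigned long long last = 100000;        /* head position, sectors */
               unsigned long long s_fwd = last + 1000;  /* 1000 sectors ahead */
               unsigned long long s_back = last - 600;  /* 600 sectors behind */
               unsigned long back_penalty = 2;          /* default bfq_back_penalty */

               unsigned long long d_fwd = s_fwd - last;
               unsigned long long d_back = (last - s_back) * back_penalty;

               /* 1000 < 1200, so the forward request would be chosen */
               printf("forward distance %llu, penalized backward distance %llu\n",
                      d_fwd, d_back);
               return 0;
       }
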
7553 -+static struct bfq_queue *
7554 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
7555 -+ sector_t sector, struct rb_node **ret_parent,
7556 -+ struct rb_node ***rb_link)
7557 -+{
7558 -+ struct rb_node **p, *parent;
7559 -+ struct bfq_queue *bfqq = NULL;
7560 -+
7561 -+ parent = NULL;
7562 -+ p = &root->rb_node;
7563 -+ while (*p) {
7564 -+ struct rb_node **n;
7565 -+
7566 -+ parent = *p;
7567 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7568 -+
7569 -+ /*
7570 -+ * Sort strictly based on sector. Smallest to the left,
7571 -+ * largest to the right.
7572 -+ */
7573 -+ if (sector > blk_rq_pos(bfqq->next_rq))
7574 -+ n = &(*p)->rb_right;
7575 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
7576 -+ n = &(*p)->rb_left;
7577 -+ else
7578 -+ break;
7579 -+ p = n;
7580 -+ bfqq = NULL;
7581 -+ }
7582 -+
7583 -+ *ret_parent = parent;
7584 -+ if (rb_link)
7585 -+ *rb_link = p;
7586 -+
7587 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
7588 -+ (long long unsigned)sector,
7589 -+ bfqq != NULL ? bfqq->pid : 0);
7590 -+
7591 -+ return bfqq;
7592 -+}
7593 -+
7594 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7595 -+{
7596 -+ struct rb_node **p, *parent;
7597 -+ struct bfq_queue *__bfqq;
7598 -+
7599 -+ if (bfqq->pos_root != NULL) {
7600 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7601 -+ bfqq->pos_root = NULL;
7602 -+ }
7603 -+
7604 -+ if (bfq_class_idle(bfqq))
7605 -+ return;
7606 -+ if (!bfqq->next_rq)
7607 -+ return;
7608 -+
7609 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
7610 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
7611 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
7612 -+ if (__bfqq == NULL) {
7613 -+ rb_link_node(&bfqq->pos_node, parent, p);
7614 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
7615 -+ } else
7616 -+ bfqq->pos_root = NULL;
7617 -+}
7618 -+
7619 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
7620 -+ struct bfq_queue *bfqq,
7621 -+ struct request *last)
7622 -+{
7623 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
7624 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
7625 -+ struct request *next = NULL, *prev = NULL;
7626 -+
7627 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
7628 -+
7629 -+ if (rbprev != NULL)
7630 -+ prev = rb_entry_rq(rbprev);
7631 -+
7632 -+ if (rbnext != NULL)
7633 -+ next = rb_entry_rq(rbnext);
7634 -+ else {
7635 -+ rbnext = rb_first(&bfqq->sort_list);
7636 -+ if (rbnext && rbnext != &last->rb_node)
7637 -+ next = rb_entry_rq(rbnext);
7638 -+ }
7639 -+
7640 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
7641 -+}
7642 -+
7643 -+static void bfq_del_rq_rb(struct request *rq)
7644 -+{
7645 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7646 -+ struct bfq_data *bfqd = bfqq->bfqd;
7647 -+ const int sync = rq_is_sync(rq);
7648 -+
7649 -+ BUG_ON(bfqq->queued[sync] == 0);
7650 -+ bfqq->queued[sync]--;
7651 -+ bfqd->queued--;
7652 -+
7653 -+ elv_rb_del(&bfqq->sort_list, rq);
7654 -+
7655 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
7656 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
7657 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
7658 -+ /*
7659 -+ * Remove queue from request-position tree as it is empty.
7660 -+ */
7661 -+ if (bfqq->pos_root != NULL) {
7662 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7663 -+ bfqq->pos_root = NULL;
7664 -+ }
7665 -+ }
7666 -+}
7667 -+
7668 -+/* see the definition of bfq_async_charge_factor for details */
7669 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
7670 -+ struct bfq_queue *bfqq)
7671 -+{
7672 -+ return blk_rq_sectors(rq) *
7673 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
7674 -+ bfq_async_charge_factor));
7675 -+}
7676 -+
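   A rough restatement of the charge computed by bfq_serv_to_charge() above,
   as a standalone sketch using the default bfq_async_charge_factor of 10
   (the request sizes are arbitrary examples):

       #include <stdio.h>

       static unsigned long serv_to_charge(unsigned long sectors, int sync,
                                           int raising_coeff)
       {
               const int async_charge_factor = 10;  /* default from above */

               /* async requests of non-weight-raised queues cost 11x their size */
               return sectors * (1 + (!sync) * (raising_coeff == 1) *
                                 async_charge_factor);
       }

       int main(void)
       {
               printf("sync,  16 sectors: charge %lu\n", serv_to_charge(16, 1, 1));
               printf("async, 16 sectors: charge %lu\n", serv_to_charge(16, 0, 1));
               return 0;
       }
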
7677 -+/**
7678 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
7679 -+ * @bfqd: the device data the queue belongs to.
7680 -+ * @bfqq: the queue to update.
7681 -+ *
7682 -+ * If the first request of a queue changes we make sure that the queue
7683 -+ * has enough budget to serve at least its first request (if the
7684 -+ * request has grown). We do this because if the queue has not enough
7685 -+ * budget for its first request, it has to go through two dispatch
7686 -+ * rounds to actually get it dispatched.
7687 -+ */
7688 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
7689 -+ struct bfq_queue *bfqq)
7690 -+{
7691 -+ struct bfq_entity *entity = &bfqq->entity;
7692 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
7693 -+ struct request *next_rq = bfqq->next_rq;
7694 -+ unsigned long new_budget;
7695 -+
7696 -+ if (next_rq == NULL)
7697 -+ return;
7698 -+
7699 -+ if (bfqq == bfqd->active_queue)
7700 -+ /*
7701 -+ * In order not to break guarantees, budgets cannot be
7702 -+ * changed after an entity has been selected.
7703 -+ */
7704 -+ return;
7705 -+
7706 -+ BUG_ON(entity->tree != &st->active);
7707 -+ BUG_ON(entity == entity->sched_data->active_entity);
7708 -+
7709 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
7710 -+ bfq_serv_to_charge(next_rq, bfqq));
7711 -+ entity->budget = new_budget;
7712 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
7713 -+ bfq_activate_bfqq(bfqd, bfqq);
7714 -+}
7715 -+
7716 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
7717 -+{
7718 -+ u64 dur;
7719 -+
7720 -+ if (bfqd->bfq_raising_max_time > 0)
7721 -+ return bfqd->bfq_raising_max_time;
7722 -+
7723 -+ dur = bfqd->RT_prod;
7724 -+ do_div(dur, bfqd->peak_rate);
7725 -+
7726 -+ return dur;
7727 -+}
7728 -+
7729 -+static void bfq_add_rq_rb(struct request *rq)
7730 -+{
7731 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7732 -+ struct bfq_entity *entity = &bfqq->entity;
7733 -+ struct bfq_data *bfqd = bfqq->bfqd;
7734 -+ struct request *next_rq, *prev;
7735 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
7736 -+ int idle_for_long_time = bfqq->budget_timeout +
7737 -+ bfqd->bfq_raising_min_idle_time < jiffies;
7738 -+
7739 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
7740 -+ bfqq->queued[rq_is_sync(rq)]++;
7741 -+ bfqd->queued++;
7742 -+
7743 -+ elv_rb_add(&bfqq->sort_list, rq);
7744 -+
7745 -+ /*
7746 -+ * Check if this request is a better next-serve candidate.
7747 -+ */
7748 -+ prev = bfqq->next_rq;
7749 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
7750 -+ BUG_ON(next_rq == NULL);
7751 -+ bfqq->next_rq = next_rq;
7752 -+
7753 -+ /*
7754 -+ * Adjust priority tree position, if next_rq changes.
7755 -+ */
7756 -+ if (prev != bfqq->next_rq)
7757 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
7758 -+
7759 -+ if (!bfq_bfqq_busy(bfqq)) {
7760 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
7761 -+ bfqq->soft_rt_next_start < jiffies;
7762 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
7763 -+ bfq_serv_to_charge(next_rq, bfqq));
7764 -+
7765 -+ if (!bfqd->low_latency)
7766 -+ goto add_bfqq_busy;
7767 -+
7768 -+ /*
7769 -+ * If the queue is not being boosted and has been idle
7770 -+ * for enough time, start a weight-raising period
7771 -+ */
7772 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
7773 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7774 -+ if (idle_for_long_time)
7775 -+ bfqq->raising_cur_max_time =
7776 -+ bfq_wrais_duration(bfqd);
7777 -+ else
7778 -+ bfqq->raising_cur_max_time =
7779 -+ bfqd->bfq_raising_rt_max_time;
7780 -+ bfq_log_bfqq(bfqd, bfqq,
7781 -+ "wrais starting at %llu msec,"
7782 -+ "rais_max_time %u",
7783 -+ bfqq->last_rais_start_finish,
7784 -+ jiffies_to_msecs(bfqq->
7785 -+ raising_cur_max_time));
7786 -+ } else if (old_raising_coeff > 1) {
7787 -+ if (idle_for_long_time)
7788 -+ bfqq->raising_cur_max_time =
7789 -+ bfq_wrais_duration(bfqd);
7790 -+ else if (bfqq->raising_cur_max_time ==
7791 -+ bfqd->bfq_raising_rt_max_time &&
7792 -+ !soft_rt) {
7793 -+ bfqq->raising_coeff = 1;
7794 -+ bfq_log_bfqq(bfqd, bfqq,
7795 -+ "wrais ending at %llu msec,"
7796 -+ "rais_max_time %u",
7797 -+ bfqq->last_rais_start_finish,
7798 -+ jiffies_to_msecs(bfqq->
7799 -+ raising_cur_max_time));
7800 -+ }
7801 -+ }
7802 -+ if (old_raising_coeff != bfqq->raising_coeff)
7803 -+ entity->ioprio_changed = 1;
7804 -+add_bfqq_busy:
7805 -+ bfq_add_bfqq_busy(bfqd, bfqq);
7806 -+ } else {
7807 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
7808 -+ !rq_is_sync(rq) &&
7809 -+ bfqq->last_rais_start_finish +
7810 -+ bfqd->bfq_raising_min_inter_arr_async < jiffies) {
7811 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7812 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
7813 -+
7814 -+ entity->ioprio_changed = 1;
7815 -+ bfq_log_bfqq(bfqd, bfqq,
7816 -+ "non-idle wrais starting at %llu msec,"
7817 -+ "rais_max_time %u",
7818 -+ bfqq->last_rais_start_finish,
7819 -+ jiffies_to_msecs(bfqq->
7820 -+ raising_cur_max_time));
7821 -+ }
7822 -+ bfq_updated_next_req(bfqd, bfqq);
7823 -+ }
7824 -+
7825 -+ if (bfqd->low_latency &&
7826 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
7827 -+ idle_for_long_time))
7828 -+ bfqq->last_rais_start_finish = jiffies;
7829 -+}
7830 -+
7831 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
7832 -+{
7833 -+ elv_rb_del(&bfqq->sort_list, rq);
7834 -+ bfqq->queued[rq_is_sync(rq)]--;
7835 -+ bfqq->bfqd->queued--;
7836 -+ bfq_add_rq_rb(rq);
7837 -+}
7838 -+
7839 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
7840 -+ struct bio *bio)
7841 -+{
7842 -+ struct task_struct *tsk = current;
7843 -+ struct bfq_io_cq *bic;
7844 -+ struct bfq_queue *bfqq;
7845 -+
7846 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
7847 -+ if (bic == NULL)
7848 -+ return NULL;
7849 -+
7850 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7851 -+ if (bfqq != NULL) {
7852 -+ sector_t sector = bio->bi_sector + bio_sectors(bio);
7853 -+
7854 -+ return elv_rb_find(&bfqq->sort_list, sector);
7855 -+ }
7856 -+
7857 -+ return NULL;
7858 -+}
7859 -+
7860 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
7861 -+{
7862 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7863 -+
7864 -+ bfqd->rq_in_driver++;
7865 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
7866 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
7867 -+ (long long unsigned)bfqd->last_position);
7868 -+}
7869 -+
7870 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
7871 -+{
7872 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7873 -+
7874 -+ WARN_ON(bfqd->rq_in_driver == 0);
7875 -+ bfqd->rq_in_driver--;
7876 -+}
7877 -+
7878 -+static void bfq_remove_request(struct request *rq)
7879 -+{
7880 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7881 -+ struct bfq_data *bfqd = bfqq->bfqd;
7882 -+
7883 -+ if (bfqq->next_rq == rq) {
7884 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
7885 -+ bfq_updated_next_req(bfqd, bfqq);
7886 -+ }
7887 -+
7888 -+ list_del_init(&rq->queuelist);
7889 -+ bfq_del_rq_rb(rq);
7890 -+
7891 -+ if (rq->cmd_flags & REQ_META) {
7892 -+ WARN_ON(bfqq->meta_pending == 0);
7893 -+ bfqq->meta_pending--;
7894 -+ }
7895 -+}
7896 -+
7897 -+static int bfq_merge(struct request_queue *q, struct request **req,
7898 -+ struct bio *bio)
7899 -+{
7900 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7901 -+ struct request *__rq;
7902 -+
7903 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
7904 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
7905 -+ *req = __rq;
7906 -+ return ELEVATOR_FRONT_MERGE;
7907 -+ }
7908 -+
7909 -+ return ELEVATOR_NO_MERGE;
7910 -+}
7911 -+
7912 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
7913 -+ int type)
7914 -+{
7915 -+ if (type == ELEVATOR_FRONT_MERGE) {
7916 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
7917 -+
7918 -+ bfq_reposition_rq_rb(bfqq, req);
7919 -+ }
7920 -+}
7921 -+
7922 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
7923 -+ struct request *next)
7924 -+{
7925 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7926 -+
7927 -+ /*
7928 -+ * Reposition in fifo if next is older than rq.
7929 -+ */
7930 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
7931 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
7932 -+ list_move(&rq->queuelist, &next->queuelist);
7933 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
7934 -+ }
7935 -+
7936 -+ if (bfqq->next_rq == next)
7937 -+ bfqq->next_rq = rq;
7938 -+
7939 -+ bfq_remove_request(next);
7940 -+}
7941 -+
7942 -+/* Must be called with bfqq != NULL */
7943 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
7944 -+{
7945 -+ BUG_ON(bfqq == NULL);
7946 -+ bfqq->raising_coeff = 1;
7947 -+ bfqq->raising_cur_max_time = 0;
7948 -+ /* Trigger a weight change on the next activation of the queue */
7949 -+ bfqq->entity.ioprio_changed = 1;
7950 -+}
7951 -+
7952 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
7953 -+ struct bfq_group *bfqg)
7954 -+{
7955 -+ int i, j;
7956 -+
7957 -+ for (i = 0; i < 2; i++)
7958 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
7959 -+ if (bfqg->async_bfqq[i][j] != NULL)
7960 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
7961 -+ if (bfqg->async_idle_bfqq != NULL)
7962 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
7963 -+}
7964 -+
7965 -+static void bfq_end_raising(struct bfq_data *bfqd)
7966 -+{
7967 -+ struct bfq_queue *bfqq;
7968 -+
7969 -+ spin_lock_irq(bfqd->queue->queue_lock);
7970 -+
7971 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
7972 -+ bfq_bfqq_end_raising(bfqq);
7973 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
7974 -+ bfq_bfqq_end_raising(bfqq);
7975 -+ bfq_end_raising_async(bfqd);
7976 -+
7977 -+ spin_unlock_irq(bfqd->queue->queue_lock);
7978 -+}
7979 -+
7980 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
7981 -+ struct bio *bio)
7982 -+{
7983 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7984 -+ struct bfq_io_cq *bic;
7985 -+ struct bfq_queue *bfqq;
7986 -+
7987 -+ /*
7988 -+ * Disallow merge of a sync bio into an async request.
7989 -+ */
7990 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
7991 -+ return 0;
7992 -+
7993 -+ /*
7994 -+ * Lookup the bfqq that this bio will be queued with. Allow
7995 -+ * merge only if rq is queued there.
7996 -+ * Queue lock is held here.
7997 -+ */
7998 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
7999 -+ if (bic == NULL)
8000 -+ return 0;
8001 -+
8002 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8003 -+ return bfqq == RQ_BFQQ(rq);
8004 -+}
8005 -+
8006 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
8007 -+ struct bfq_queue *bfqq)
8008 -+{
8009 -+ if (bfqq != NULL) {
8010 -+ bfq_mark_bfqq_must_alloc(bfqq);
8011 -+ bfq_mark_bfqq_budget_new(bfqq);
8012 -+ bfq_clear_bfqq_fifo_expire(bfqq);
8013 -+
8014 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
8015 -+
8016 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
8017 -+ bfqq->entity.budget);
8018 -+ }
8019 -+
8020 -+ bfqd->active_queue = bfqq;
8021 -+}
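The budgets_assigned update above, (budgets_assigned*7 + 256) / 8, is a saturating counter that converges toward 256; bfq_max_budget() and bfq_min_budget() further down only trust the autotuned figures once it reaches 194, i.e. after enough queue activations have been observed. A standalone sketch of how quickly that threshold is reached (illustrative only, not part of the patch):

#include <stdio.h>

/* Same update as in __bfq_set_active_queue() above:
 * x <- (7*x + 256) / 8 converges toward 256; values below 194 are
 * treated as "too few samples" by bfq_max_budget()/bfq_min_budget(). */
int main(void)
{
	int budgets_assigned = 0, activations = 0;

	while (budgets_assigned < 194) {
		budgets_assigned = (budgets_assigned * 7 + 256) / 8;
		activations++;
	}
	printf("194 reached after %d activations (value now %d)\n",
	       activations, budgets_assigned); /* 11 activations with these constants */
	return 0;
}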
8022 -+
8023 -+/*
8024 -+ * Get and set a new active queue for service.
8025 -+ */
8026 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
8027 -+ struct bfq_queue *bfqq)
8028 -+{
8029 -+ if (!bfqq)
8030 -+ bfqq = bfq_get_next_queue(bfqd);
8031 -+ else
8032 -+ bfq_get_next_queue_forced(bfqd, bfqq);
8033 -+
8034 -+ __bfq_set_active_queue(bfqd, bfqq);
8035 -+ return bfqq;
8036 -+}
8037 -+
8038 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
8039 -+ struct request *rq)
8040 -+{
8041 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
8042 -+ return blk_rq_pos(rq) - bfqd->last_position;
8043 -+ else
8044 -+ return bfqd->last_position - blk_rq_pos(rq);
8045 -+}
8046 -+
8047 -+/*
8048 -+ * Return true if bfqq has no request pending and rq is close enough to
8049 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
8050 -+ * bfqq->next_rq
8051 -+ */
8052 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
8053 -+{
8054 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
8055 -+}
8056 -+
8057 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
8058 -+{
8059 -+ struct rb_root *root = &bfqd->rq_pos_tree;
8060 -+ struct rb_node *parent, *node;
8061 -+ struct bfq_queue *__bfqq;
8062 -+ sector_t sector = bfqd->last_position;
8063 -+
8064 -+ if (RB_EMPTY_ROOT(root))
8065 -+ return NULL;
8066 -+
8067 -+ /*
8068 -+ * First, if we find a request starting at the end of the last
8069 -+ * request, choose it.
8070 -+ */
8071 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8072 -+ if (__bfqq != NULL)
8073 -+ return __bfqq;
8074 -+
8075 -+ /*
8076 -+ * If the exact sector wasn't found, the parent of the NULL leaf
8077 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
8078 -+ * position).
8079 -+ */
8080 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8081 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8082 -+ return __bfqq;
8083 -+
8084 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
8085 -+ node = rb_next(&__bfqq->pos_node);
8086 -+ else
8087 -+ node = rb_prev(&__bfqq->pos_node);
8088 -+ if (node == NULL)
8089 -+ return NULL;
8090 -+
8091 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8092 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8093 -+ return __bfqq;
8094 -+
8095 -+ return NULL;
8096 -+}
8097 -+
8098 -+/*
8099 -+ * bfqd - obvious
8100 -+ * cur_bfqq - passed in so that we don't decide that the current queue
8101 -+ * is closely cooperating with itself.
8102 -+ *
8103 -+ * We are assuming that cur_bfqq has dispatched at least one request,
8104 -+ * and that bfqd->last_position reflects a position on the disk associated
8105 -+ * with the I/O issued by cur_bfqq.
8106 -+ */
8107 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
8108 -+ struct bfq_queue *cur_bfqq)
8109 -+{
8110 -+ struct bfq_queue *bfqq;
8111 -+
8112 -+ if (bfq_class_idle(cur_bfqq))
8113 -+ return NULL;
8114 -+ if (!bfq_bfqq_sync(cur_bfqq))
8115 -+ return NULL;
8116 -+ if (BFQQ_SEEKY(cur_bfqq))
8117 -+ return NULL;
8118 -+
8119 -+ /* If device has only one backlogged bfq_queue, don't search. */
8120 -+ if (bfqd->busy_queues == 1)
8121 -+ return NULL;
8122 -+
8123 -+ /*
8124 -+ * We should notice if some of the queues are cooperating, e.g.
8125 -+ * working closely on the same area of the disk. In that case,
8126 -+ * we can group them together and avoid wasting time idling.
8127 -+ */
8128 -+ bfqq = bfqq_close(bfqd);
8129 -+ if (bfqq == NULL || bfqq == cur_bfqq)
8130 -+ return NULL;
8131 -+
8132 -+ /*
8133 -+ * Do not merge queues from different bfq_groups.
8134 -+ */
8135 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
8136 -+ return NULL;
8137 -+
8138 -+ /*
8139 -+ * It only makes sense to merge sync queues.
8140 -+ */
8141 -+ if (!bfq_bfqq_sync(bfqq))
8142 -+ return NULL;
8143 -+ if (BFQQ_SEEKY(bfqq))
8144 -+ return NULL;
8145 -+
8146 -+ /*
8147 -+ * Do not merge queues of different priority classes.
8148 -+ */
8149 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
8150 -+ return NULL;
8151 -+
8152 -+ return bfqq;
8153 -+}
8154 -+
8155 -+/*
8156 -+ * If enough samples have been computed, return the current max budget
8157 -+ * stored in bfqd, which is dynamically updated according to the
8158 -+ * estimated disk peak rate; otherwise return the default max budget
8159 -+ */
8160 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
8161 -+{
8162 -+ if (bfqd->budgets_assigned < 194)
8163 -+ return bfq_default_max_budget;
8164 -+ else
8165 -+ return bfqd->bfq_max_budget;
8166 -+}
8167 -+
8168 -+/*
8169 -+ * Return min budget, which is a fraction of the current or default
8170 -+ * max budget (trying with 1/32)
8171 -+ */
8172 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
8173 -+{
8174 -+ if (bfqd->budgets_assigned < 194)
8175 -+ return bfq_default_max_budget / 32;
8176 -+ else
8177 -+ return bfqd->bfq_max_budget / 32;
8178 -+}
8179 -+
8180 -+/*
8181 -+ * Decides whether idling should be done for given device and
8182 -+ * given active queue.
8183 -+ */
8184 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
8185 -+ struct bfq_queue *active_bfqq)
8186 -+{
8187 -+ if (active_bfqq == NULL)
8188 -+ return false;
8189 -+ /*
8190 -+ * If device is SSD it has no seek penalty, disable idling; but
8191 -+ * do so only if:
8192 -+ * - the device supports queueing (NCQ); otherwise we would still have
8193 -+ * a problem with sync vs async workloads;
8194 -+ * - the queue is not weight-raised, to preserve guarantees.
8195 -+ */
8196 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
8197 -+ active_bfqq->raising_coeff == 1);
8198 -+}
8199 -+
8200 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
8201 -+{
8202 -+ struct bfq_queue *bfqq = bfqd->active_queue;
8203 -+ struct bfq_io_cq *bic;
8204 -+ unsigned long sl;
8205 -+
8206 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
8207 -+
8208 -+ /* Tasks have exited, don't wait. */
8209 -+ bic = bfqd->active_bic;
8210 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
8211 -+ return;
8212 -+
8213 -+ bfq_mark_bfqq_wait_request(bfqq);
8214 -+
8215 -+ /*
8216 -+ * We don't want to idle for seeks, but we do want to allow
8217 -+ * fair distribution of slice time for a process doing back-to-back
8218 -+ * seeks. So allow a little bit of time for it to submit a new rq.
8219 -+ *
8220 -+ * To prevent processes with (partly) seeky workloads from
8221 -+ * being too ill-treated, grant them a small fraction of the
8222 -+ * assigned budget before reducing the waiting time to
8223 -+ * BFQ_MIN_TT. This happened to help reduce latency.
8224 -+ */
8225 -+ sl = bfqd->bfq_slice_idle;
8226 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
8227 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
8228 -+ bfqq->raising_coeff == 1)
8229 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
8230 -+ else if (bfqq->raising_coeff > 1)
8231 -+ sl = sl * 3;
8232 -+ bfqd->last_idling_start = ktime_get();
8233 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
8234 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
8235 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
8236 -+}
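The idle-slice length armed above follows a small rule: start from bfq_slice_idle, cut it to BFQ_MIN_TT for a seeky, non-raised queue that has already consumed a noticeable share of its budget, and triple it for a weight-raised queue. A minimal user-space restatement of that decision, with the parameters standing in for the fields and macros tested above (a sketch, not the in-kernel code):

/* Sketch of the slice choice in bfq_arm_slice_timer(); the arguments
 * stand in for bfqd->bfq_slice_idle, BFQ_MIN_TT and the per-queue
 * state checked above. */
static unsigned long pick_idle_slice(unsigned long slice_idle,
				     unsigned long min_tt,
				     int seeky, int consumed_enough_budget,
				     int raising_coeff)
{
	unsigned long sl = slice_idle;

	if (seeky && consumed_enough_budget && raising_coeff == 1)
		sl = sl < min_tt ? sl : min_tt;	/* min(sl, BFQ_MIN_TT) */
	else if (raising_coeff > 1)
		sl *= 3;			/* wait longer for raised queues */
	return sl;
}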
8237 -+
8238 -+/*
8239 -+ * Set the maximum time for the active queue to consume its
8240 -+ * budget. This prevents seeky processes from lowering the disk
8241 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
8242 -+ */
8243 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
8244 -+{
8245 -+ struct bfq_queue *bfqq = bfqd->active_queue;
8246 -+ unsigned int timeout_coeff;
8247 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
8248 -+ timeout_coeff = 1;
8249 -+ else
8250 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
8251 -+
8252 -+ bfqd->last_budget_start = ktime_get();
8253 -+
8254 -+ bfq_clear_bfqq_budget_new(bfqq);
8255 -+ bfqq->budget_timeout = jiffies +
8256 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
8257 -+
8258 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
8259 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
8260 -+ timeout_coeff));
8261 -+}
8262 -+
8263 -+/*
8264 -+ * Move request from internal lists to the request queue dispatch list.
8265 -+ */
8266 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
8267 -+{
8268 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
8269 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8270 -+
8271 -+ bfq_remove_request(rq);
8272 -+ bfqq->dispatched++;
8273 -+ elv_dispatch_sort(q, rq);
8274 -+
8275 -+ if (bfq_bfqq_sync(bfqq))
8276 -+ bfqd->sync_flight++;
8277 -+}
8278 -+
8279 -+/*
8280 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
8281 -+ */
8282 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
8283 -+{
8284 -+ struct request *rq = NULL;
8285 -+
8286 -+ if (bfq_bfqq_fifo_expire(bfqq))
8287 -+ return NULL;
8288 -+
8289 -+ bfq_mark_bfqq_fifo_expire(bfqq);
8290 -+
8291 -+ if (list_empty(&bfqq->fifo))
8292 -+ return NULL;
8293 -+
8294 -+ rq = rq_entry_fifo(bfqq->fifo.next);
8295 -+
8296 -+ if (time_before(jiffies, rq_fifo_time(rq)))
8297 -+ return NULL;
8298 -+
8299 -+ return rq;
8300 -+}
8301 -+
8302 -+/*
8303 -+ * Must be called with the queue_lock held.
8304 -+ */
8305 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
8306 -+{
8307 -+ int process_refs, io_refs;
8308 -+
8309 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
8310 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
8311 -+ BUG_ON(process_refs < 0);
8312 -+ return process_refs;
8313 -+}
8314 -+
8315 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
8316 -+{
8317 -+ int process_refs, new_process_refs;
8318 -+ struct bfq_queue *__bfqq;
8319 -+
8320 -+ /*
8321 -+ * If there are no process references on the new_bfqq, then it is
8322 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
8323 -+ * may have dropped their last reference (not just their last process
8324 -+ * reference).
8325 -+ */
8326 -+ if (!bfqq_process_refs(new_bfqq))
8327 -+ return;
8328 -+
8329 -+ /* Avoid a circular list and skip interim queue merges. */
8330 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
8331 -+ if (__bfqq == bfqq)
8332 -+ return;
8333 -+ new_bfqq = __bfqq;
8334 -+ }
8335 -+
8336 -+ process_refs = bfqq_process_refs(bfqq);
8337 -+ new_process_refs = bfqq_process_refs(new_bfqq);
8338 -+ /*
8339 -+ * If the process for the bfqq has gone away, there is no
8340 -+ * sense in merging the queues.
8341 -+ */
8342 -+ if (process_refs == 0 || new_process_refs == 0)
8343 -+ return;
8344 -+
8345 -+ /*
8346 -+ * Merge in the direction of the lesser amount of work.
8347 -+ */
8348 -+ if (new_process_refs >= process_refs) {
8349 -+ bfqq->new_bfqq = new_bfqq;
8350 -+ atomic_add(process_refs, &new_bfqq->ref);
8351 -+ } else {
8352 -+ new_bfqq->new_bfqq = bfqq;
8353 -+ atomic_add(new_process_refs, &bfqq->ref);
8354 -+ }
8355 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
8356 -+ new_bfqq->pid);
8357 -+}
8358 -+
8359 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
8360 -+{
8361 -+ struct bfq_entity *entity = &bfqq->entity;
8362 -+ return entity->budget - entity->service;
8363 -+}
8364 -+
8365 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8366 -+{
8367 -+ BUG_ON(bfqq != bfqd->active_queue);
8368 -+
8369 -+ __bfq_bfqd_reset_active(bfqd);
8370 -+
8371 -+ /*
8372 -+ * If this bfqq is shared between multiple processes, check
8373 -+ * to make sure that those processes are still issuing I/Os
8374 -+ * within the mean seek distance. If not, it may be time to
8375 -+ * break the queues apart again.
8376 -+ */
8377 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
8378 -+ bfq_mark_bfqq_split_coop(bfqq);
8379 -+
8380 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8381 -+ /*
8382 -+ * overloading budget_timeout field to store when
8383 -+ * the queue remains with no backlog, used by
8384 -+ * the weight-raising mechanism
8385 -+ */
8386 -+ bfqq->budget_timeout = jiffies;
8387 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8388 -+ } else {
8389 -+ bfq_activate_bfqq(bfqd, bfqq);
8390 -+ /*
8391 -+ * Resort priority tree of potential close cooperators.
8392 -+ */
8393 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
8394 -+ }
8395 -+}
8396 -+
8397 -+/**
8398 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
8399 -+ * @bfqd: device data.
8400 -+ * @bfqq: queue to update.
8401 -+ * @reason: reason for expiration.
8402 -+ *
8403 -+ * Handle the feedback on @bfqq budget. See the body for detailed
8404 -+ * comments.
8405 -+ */
8406 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
8407 -+ struct bfq_queue *bfqq,
8408 -+ enum bfqq_expiration reason)
8409 -+{
8410 -+ struct request *next_rq;
8411 -+ unsigned long budget, min_budget;
8412 -+
8413 -+ budget = bfqq->max_budget;
8414 -+ min_budget = bfq_min_budget(bfqd);
8415 -+
8416 -+ BUG_ON(bfqq != bfqd->active_queue);
8417 -+
8418 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
8419 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
8420 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
8421 -+ budget, bfq_min_budget(bfqd));
8422 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
8423 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
8424 -+
8425 -+ if (bfq_bfqq_sync(bfqq)) {
8426 -+ switch (reason) {
8427 -+ /*
8428 -+ * Caveat: in all the following cases we trade latency
8429 -+ * for throughput.
8430 -+ */
8431 -+ case BFQ_BFQQ_TOO_IDLE:
8432 -+ /*
8433 -+ * This is the only case where we may reduce
8434 -+ * the budget: if there is no request of the
8435 -+ * process still waiting for completion, then
8436 -+ * we assume (tentatively) that the timer has
8437 -+ * expired because the batch of requests of
8438 -+ * the process could have been served with a
8439 -+ * smaller budget. Hence, betting that
8440 -+ * the process will behave in the same way when it
8441 -+ * becomes backlogged again, we reduce its
8442 -+ * next budget. As long as we guess right,
8443 -+ * this budget cut reduces the latency
8444 -+ * experienced by the process.
8445 -+ *
8446 -+ * However, if there are still outstanding
8447 -+ * requests, then the process may have not yet
8448 -+ * issued its next request just because it is
8449 -+ * still waiting for the completion of some of
8450 -+ * the still outstanding ones. So in this
8451 -+ * subcase we do not reduce its budget, on the
8452 -+ * contrary we increase it to possibly boost
8453 -+ * the throughput, as discussed in the
8454 -+ * comments to the BUDGET_TIMEOUT case.
8455 -+ */
8456 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
8457 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
8458 -+ else {
8459 -+ if (budget > 5 * min_budget)
8460 -+ budget -= 4 * min_budget;
8461 -+ else
8462 -+ budget = min_budget;
8463 -+ }
8464 -+ break;
8465 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
8466 -+ /*
8467 -+ * We double the budget here because: 1) it
8468 -+ * gives the chance to boost the throughput if
8469 -+ * this is not a seeky process (which may have
8470 -+ * bumped into this timeout because of, e.g.,
8471 -+ * ZBR), 2) together with charge_full_budget
8472 -+ * it helps give seeky processes higher
8473 -+ * timestamps, and hence be served less
8474 -+ * frequently.
8475 -+ */
8476 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
8477 -+ break;
8478 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
8479 -+ /*
8480 -+ * The process still has backlog, and did not
8481 -+ * let either the budget timeout or the disk
8482 -+ * idling timeout expire. Hence it is not
8483 -+ * seeky, has a short thinktime and may be
8484 -+ * happy with a higher budget too. So
8485 -+ * definitely increase the budget of this good
8486 -+ * candidate to boost the disk throughput.
8487 -+ */
8488 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
8489 -+ break;
8490 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
8491 -+ /*
8492 -+ * Leave the budget unchanged.
8493 -+ */
8494 -+ default:
8495 -+ return;
8496 -+ }
8497 -+ } else /* async queue */
8498 -+ /* async queues always get the maximum possible budget
8499 -+ * (their ability to dispatch is limited by
8500 -+ * @bfqd->bfq_max_budget_async_rq).
8501 -+ */
8502 -+ budget = bfqd->bfq_max_budget;
8503 -+
8504 -+ bfqq->max_budget = budget;
8505 -+
8506 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
8507 -+ bfqq->max_budget > bfqd->bfq_max_budget)
8508 -+ bfqq->max_budget = bfqd->bfq_max_budget;
8509 -+
8510 -+ /*
8511 -+ * Make sure that we have enough budget for the next request.
8512 -+ * Since the finish time of the bfqq must be kept in sync with
8513 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
8514 -+ * update.
8515 -+ */
8516 -+ next_rq = bfqq->next_rq;
8517 -+ if (next_rq != NULL)
8518 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
8519 -+ bfq_serv_to_charge(next_rq, bfqq));
8520 -+ else
8521 -+ bfqq->entity.budget = bfqq->max_budget;
8522 -+
8523 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
8524 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
8525 -+ bfqq->entity.budget);
8526 -+}
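For sync queues, the long comments above boil down to a handful of multiplicative rules keyed on the expiration reason. A condensed restatement in user-space C (a sketch that mirrors, but is not, the code above; the enum names are shortened for brevity):

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

/* Budget feedback for sync queues, as implemented in
 * __bfq_bfqq_recalc_budget() above. */
static unsigned long next_budget(unsigned long budget, unsigned long max_budget,
				 unsigned long min_budget, enum reason r,
				 int still_dispatched)
{
	switch (r) {
	case TOO_IDLE:
		if (still_dispatched)	/* outstanding requests: be generous */
			return budget * 2 > max_budget ? max_budget : budget * 2;
		/* otherwise shrink, betting the process needs less */
		return budget > 5 * min_budget ? budget - 4 * min_budget : min_budget;
	case BUDGET_TIMEOUT:
		return budget * 2 > max_budget ? max_budget : budget * 2;
	case BUDGET_EXHAUSTED:
		return budget * 4 > max_budget ? max_budget : budget * 4;
	case NO_MORE_REQUESTS:
	default:
		return budget;		/* leave it unchanged */
	}
}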
8527 -+
8528 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
8529 -+{
8530 -+ unsigned long max_budget;
8531 -+
8532 -+ /*
8533 -+ * The max_budget calculated when autotuning is equal to the
8534 -+ * number of sectors transferred in timeout_sync at the
8535 -+ * estimated peak rate.
8536 -+ */
8537 -+ max_budget = (unsigned long)(peak_rate * 1000 *
8538 -+ timeout >> BFQ_RATE_SHIFT);
8539 -+
8540 -+ return max_budget;
8541 -+}
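The autotuned budget above is plain fixed-point arithmetic: peak_rate is kept in sectors per microsecond scaled left by BFQ_RATE_SHIFT, timeout is in milliseconds, so multiplying by 1000 and shifting right again yields sectors. A worked example; BFQ_RATE_SHIFT is defined elsewhere in the patch, and the value 16 below, like the sample rate and timeout, is assumed purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define RATE_SHIFT 16	/* assumed stand-in for BFQ_RATE_SHIFT */

int main(void)
{
	/* Suppose the estimated peak rate is 0.2 sectors/usec (roughly
	 * 100 MB/s with 512-byte sectors) and the sync budget timeout
	 * is 125 ms. */
	uint64_t peak_rate = (uint64_t)(0.2 * (1 << RATE_SHIFT));
	uint64_t timeout_ms = 125;
	uint64_t max_budget = peak_rate * 1000 * timeout_ms >> RATE_SHIFT;

	printf("autotuned max budget: %llu sectors\n",
	       (unsigned long long)max_budget);	/* ~25000 sectors, ~12 MB */
	return 0;
}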
8542 -+
8543 -+/*
8544 -+ * In addition to updating the peak rate, checks whether the process
8545 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
8546 -+ * to the budget timeout, to reduce the amount of service provided to
8547 -+ * seeky processes, and hence reduce their chances to lower the
8548 -+ * throughput. See the code for more details.
8549 -+ */
8550 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8551 -+ int compensate, enum bfqq_expiration reason)
8552 -+{
8553 -+ u64 bw, usecs, expected, timeout;
8554 -+ ktime_t delta;
8555 -+ int update = 0;
8556 -+
8557 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
8558 -+ return 0;
8559 -+
8560 -+ if (compensate)
8561 -+ delta = bfqd->last_idling_start;
8562 -+ else
8563 -+ delta = ktime_get();
8564 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
8565 -+ usecs = ktime_to_us(delta);
8566 -+
8567 -+ /* Don't trust short/unrealistic values. */
8568 -+ if (usecs < 100 || usecs >= LONG_MAX)
8569 -+ return 0;
8570 -+
8571 -+ /*
8572 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
8573 -+ * value to store the peak rate, in sectors per usec in fixed
8574 -+ * point math. We do so to have enough precision in the estimate
8575 -+ * and to avoid overflows.
8576 -+ */
8577 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
8578 -+ do_div(bw, (unsigned long)usecs);
8579 -+
8580 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
8581 -+
8582 -+ /*
8583 -+ * Use only long (> 20ms) intervals to filter out spikes for
8584 -+ * the peak rate estimation.
8585 -+ */
8586 -+ if (usecs > 20000) {
8587 -+ if (bw > bfqd->peak_rate ||
8588 -+ (!BFQQ_SEEKY(bfqq) &&
8589 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
8590 -+ bfq_log(bfqd, "measured bw =%llu", bw);
8591 -+ /*
8592 -+ * To smooth oscillations use a low-pass filter with
8593 -+ * alpha=7/8, i.e.,
8594 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
8595 -+ */
8596 -+ do_div(bw, 8);
8597 -+ if (bw == 0)
8598 -+ return 0;
8599 -+ bfqd->peak_rate *= 7;
8600 -+ do_div(bfqd->peak_rate, 8);
8601 -+ bfqd->peak_rate += bw;
8602 -+ update = 1;
8603 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
8604 -+ }
8605 -+
8606 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
8607 -+
8608 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
8609 -+ bfqd->peak_rate_samples++;
8610 -+
8611 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
8612 -+ update && bfqd->bfq_user_max_budget == 0) {
8613 -+ bfqd->bfq_max_budget =
8614 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
8615 -+ bfq_log(bfqd, "new max_budget=%lu",
8616 -+ bfqd->bfq_max_budget);
8617 -+ }
8618 -+ }
8619 -+
8620 -+ /*
8621 -+ * If the process has been served for a too short time
8622 -+ * interval to let its possible sequential accesses prevail over
8623 -+ * the initial seek time needed to move the disk head on the
8624 -+ * first sector it requested, then give the process a chance
8625 -+ * and for the moment return false.
8626 -+ */
8627 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
8628 -+ return 0;
8629 -+
8630 -+ /*
8631 -+ * A process is considered ``slow'' (i.e., seeky, so that we
8632 -+ * cannot treat it fairly in the service domain, as it would
8633 -+ * slow down too much the other processes) if, when a slice
8634 -+ * ends for whatever reason, it has received service at a
8635 -+ * rate that would not be high enough to complete the budget
8636 -+ * before the budget timeout expiration.
8637 -+ */
8638 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
8639 -+
8640 -+ /*
8641 -+ * Caveat: processes doing IO in the slower disk zones will
8642 -+ * tend to be slow(er) even if not seeky. And the estimated
8643 -+ * peak rate will actually be an average over the disk
8644 -+ * surface. Hence, to not be too harsh with unlucky processes,
8645 -+ * we keep a budget/3 margin of safety before declaring a
8646 -+ * process slow.
8647 -+ */
8648 -+ return expected > (4 * bfqq->entity.budget) / 3;
8649 -+}
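Two pieces of arithmetic above are easy to lose in the surrounding control flow: the alpha = 7/8 low-pass filter applied to the peak-rate estimate, and the "slow process" test that compares the transfer expected at the estimated rate within the budget timeout against the assigned budget, with a 1/3 safety margin. A compact sketch of both, using plain 64-bit integers instead of the kernel's do_div() helper:

#include <stdint.h>

/* Low-pass filter used for the peak-rate estimate above:
 * new_rate = (7/8) * old_rate + (1/8) * measured_bw */
static uint64_t filter_peak_rate(uint64_t old_rate, uint64_t measured_bw)
{
	return old_rate * 7 / 8 + measured_bw / 8;
}

/* "Slow process" test: the queue is deemed slow if, at the measured
 * bandwidth, it could not have consumed its budget within the timeout,
 * keeping a budget/3 margin for bad luck (slower disk zones, etc.). */
static int process_is_slow(uint64_t bw_fixed_point, uint64_t timeout_ms,
			   uint64_t budget, unsigned int rate_shift)
{
	uint64_t expected = bw_fixed_point * 1000 * timeout_ms >> rate_shift;

	return expected > 4 * budget / 3;
}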
8650 -+
8651 -+/**
8652 -+ * bfq_bfqq_expire - expire a queue.
8653 -+ * @bfqd: device owning the queue.
8654 -+ * @bfqq: the queue to expire.
8655 -+ * @compensate: if true, compensate for the time spent idling.
8656 -+ * @reason: the reason causing the expiration.
8657 -+ *
8658 -+ *
8659 -+ * If the process associated to the queue is slow (i.e., seeky), or in
8660 -+ * case of budget timeout, or, finally, if it is async, we
8661 -+ * artificially charge it an entire budget (independently of the
8662 -+ * actual service it received). As a consequence, the queue will get
8663 -+ * higher timestamps than the correct ones upon reactivation, and
8664 -+ * hence it will be rescheduled as if it had received more service
8665 -+ * than what it actually received. In the end, this class of processes
8666 -+ * will receive less service in proportion to how slowly they consume
8667 -+ * their budgets (and hence how seriously they tend to lower the
8668 -+ * throughput).
8669 -+ *
8670 -+ * In contrast, when a queue expires because it has been idling for
8671 -+ * too long or because it exhausted its budget, we do not touch the
8672 -+ * amount of service it has received. Hence when the queue will be
8673 -+ * reactivated and its timestamps updated, the latter will be in sync
8674 -+ * with the actual service received by the queue until expiration.
8675 -+ *
8676 -+ * Charging a full budget to the first type of queues and the exact
8677 -+ * service to the others has the effect of using the WF2Q+ policy to
8678 -+ * schedule the former on a timeslice basis, without violating the
8679 -+ * service domain guarantees of the latter.
8680 -+ */
8681 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
8682 -+ struct bfq_queue *bfqq,
8683 -+ int compensate,
8684 -+ enum bfqq_expiration reason)
8685 -+{
8686 -+ int slow;
8687 -+ BUG_ON(bfqq != bfqd->active_queue);
8688 -+
8689 -+ /* Update disk peak rate for autotuning and check whether the
8690 -+ * process is slow (see bfq_update_peak_rate).
8691 -+ */
8692 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
8693 -+
8694 -+ /*
8695 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
8696 -+ * and async queues, to favor sequential sync workloads.
8697 -+ *
8698 -+ * Processes doing IO in the slower disk zones will tend to be
8699 -+ * slow(er) even if not seeky. Hence, since the estimated peak
8700 -+ * rate is actually an average over the disk surface, these
8701 -+ * processes may timeout just for bad luck. To avoid punishing
8702 -+ * them we do not charge a full budget to a process that
8703 -+ * succeeded in consuming at least 2/3 of its budget.
8704 -+ */
8705 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
8706 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
8707 -+ bfq_bfqq_charge_full_budget(bfqq);
8708 -+
8709 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
8710 -+ bfqq->last_rais_start_finish = jiffies;
8711 -+
8712 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
8713 -+ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
8714 -+ bfqq->soft_rt_next_start =
8715 -+ jiffies +
8716 -+ HZ * bfqq->entity.service /
8717 -+ bfqd->bfq_raising_max_softrt_rate;
8718 -+ else
8719 -+ bfqq->soft_rt_next_start = -1; /* infinity */
8720 -+ }
8721 -+ bfq_log_bfqq(bfqd, bfqq,
8722 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
8723 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
8724 -+
8725 -+ /* Increase, decrease or leave budget unchanged according to reason */
8726 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
8727 -+ __bfq_bfqq_expire(bfqd, bfqq);
8728 -+}
8729 -+
8730 -+/*
8731 -+ * Budget timeout is not implemented through a dedicated timer, but
8732 -+ * just checked on request arrivals and completions, as well as on
8733 -+ * idle timer expirations.
8734 -+ */
8735 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
8736 -+{
8737 -+ if (bfq_bfqq_budget_new(bfqq))
8738 -+ return 0;
8739 -+
8740 -+ if (time_before(jiffies, bfqq->budget_timeout))
8741 -+ return 0;
8742 -+
8743 -+ return 1;
8744 -+}
8745 -+
8746 -+/*
8747 -+ * If we expire a queue that is waiting for the arrival of a new
8748 -+ * request, we may prevent the fictitious timestamp backshifting that
8749 -+ * allows the guarantees of the queue to be preserved (see [1] for
8750 -+ * this tricky aspect). Hence we return true only if this condition
8751 -+ * does not hold, or if the queue is slow enough to deserve only to be
8752 -+ * kicked off for preserving a high throughput.
8753 -+ */
8754 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
8755 -+{
8756 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
8757 -+ "may_budget_timeout: wr %d left %d timeout %d",
8758 -+ bfq_bfqq_wait_request(bfqq),
8759 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
8760 -+ bfq_bfqq_budget_timeout(bfqq));
8761 -+
8762 -+ return (!bfq_bfqq_wait_request(bfqq) ||
8763 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
8764 -+ &&
8765 -+ bfq_bfqq_budget_timeout(bfqq);
8766 -+}
8767 -+
8768 -+/*
8769 -+ * If the active queue is empty, but it is sync and either of the following
8770 -+ * conditions holds, then: 1) the queue must remain active and cannot be
8771 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
8772 -+ * of a new request for the queue. The conditions are:
8773 -+ * - the device is rotational and not performing NCQ, and the queue has its
8774 -+ * idle window set (in this case, waiting for a new request for the queue
8775 -+ * is likely to boost the disk throughput);
8776 -+ * - the queue is weight-raised (waiting for the request is necessary for
8777 -+ * providing the queue with fairness and latency guarantees).
8778 -+ */
8779 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
8780 -+ int budg_timeout)
8781 -+{
8782 -+ struct bfq_data *bfqd = bfqq->bfqd;
8783 -+
8784 -+ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
8785 -+ bfqd->bfq_slice_idle != 0 &&
8786 -+ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
8787 -+ !blk_queue_nonrot(bfqd->queue))
8788 -+ || bfqq->raising_coeff > 1) &&
8789 -+ (bfqd->rq_in_driver == 0 ||
8790 -+ budg_timeout ||
8791 -+ bfqq->raising_coeff > 1) &&
8792 -+ !bfq_close_cooperator(bfqd, bfqq) &&
8793 -+ (!bfq_bfqq_coop(bfqq) ||
8794 -+ !bfq_bfqq_some_coop_idle(bfqq)) &&
8795 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
8796 -+}
8797 -+
8798 -+/*
8799 -+ * Select a queue for service. If we have a current active queue,
8800 -+ * check whether to continue servicing it, or retrieve and set a new one.
8801 -+ */
8802 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
8803 -+{
8804 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
8805 -+ struct request *next_rq;
8806 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
8807 -+ int budg_timeout;
8808 -+
8809 -+ bfqq = bfqd->active_queue;
8810 -+ if (bfqq == NULL)
8811 -+ goto new_queue;
8812 -+
8813 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
8814 -+
8815 -+ /*
8816 -+ * If another queue has a request waiting within our mean seek
8817 -+ * distance, let it run. The expire code will check for close
8818 -+ * cooperators and put the close queue at the front of the
8819 -+ * service tree. If possible, merge the expiring queue with the
8820 -+ * new bfqq.
8821 -+ */
8822 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
8823 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
8824 -+ bfq_setup_merge(bfqq, new_bfqq);
8825 -+
8826 -+ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
8827 -+ if (budg_timeout &&
8828 -+ !bfq_bfqq_must_idle(bfqq, budg_timeout))
8829 -+ goto expire;
8830 -+
8831 -+ next_rq = bfqq->next_rq;
8832 -+ /*
8833 -+ * If bfqq has requests queued and it has enough budget left to
8834 -+ * serve them, keep the queue, otherwise expire it.
8835 -+ */
8836 -+ if (next_rq != NULL) {
8837 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
8838 -+ bfq_bfqq_budget_left(bfqq)) {
8839 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
8840 -+ goto expire;
8841 -+ } else {
8842 -+ /*
8843 -+ * The idle timer may be pending because we may not
8844 -+ * disable disk idling even when a new request arrives
8845 -+ */
8846 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
8847 -+ /*
8848 -+ * If we get here: 1) at least a new request
8849 -+ * has arrived but we have not disabled the
8850 -+ * timer because the request was too small,
8851 -+ * 2) then the block layer has unplugged the
8852 -+ * device, causing the dispatch to be invoked.
8853 -+ *
8854 -+ * Since the device is unplugged, now the
8855 -+ * requests are probably large enough to
8856 -+ * provide a reasonable throughput.
8857 -+ * So we disable idling.
8858 -+ */
8859 -+ bfq_clear_bfqq_wait_request(bfqq);
8860 -+ del_timer(&bfqd->idle_slice_timer);
8861 -+ }
8862 -+ if (new_bfqq == NULL)
8863 -+ goto keep_queue;
8864 -+ else
8865 -+ goto expire;
8866 -+ }
8867 -+ }
8868 -+
8869 -+ /*
8870 -+ * No requests pending. If there is no cooperator, and the active
8871 -+ * queue still has requests in flight or is idling for a new request,
8872 -+ * then keep it.
8873 -+ */
8874 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
8875 -+ (bfqq->dispatched != 0 &&
8876 -+ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
8877 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
8878 -+ bfqq = NULL;
8879 -+ goto keep_queue;
8880 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
8881 -+ /*
8882 -+ * Expiring the queue because there is a close cooperator,
8883 -+ * cancel timer.
8884 -+ */
8885 -+ bfq_clear_bfqq_wait_request(bfqq);
8886 -+ del_timer(&bfqd->idle_slice_timer);
8887 -+ }
8888 -+
8889 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
8890 -+expire:
8891 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
8892 -+new_queue:
8893 -+ bfqq = bfq_set_active_queue(bfqd, new_bfqq);
8894 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
8895 -+ bfqq != NULL ? bfqq->pid : 0);
8896 -+keep_queue:
8897 -+ return bfqq;
8898 -+}
8899 -+
8900 -+static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8901 -+{
8902 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
8903 -+ struct bfq_entity *entity = &bfqq->entity;
8904 -+
8905 -+ bfq_log_bfqq(bfqd, bfqq,
8906 -+ "raising period dur %u/%u msec, "
8907 -+ "old raising coeff %u, w %d(%d)",
8908 -+ jiffies_to_msecs(jiffies -
8909 -+ bfqq->last_rais_start_finish),
8910 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
8911 -+ bfqq->raising_coeff,
8912 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
8913 -+
8914 -+ BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
8915 -+ entity->orig_weight * bfqq->raising_coeff);
8916 -+ if (entity->ioprio_changed)
8917 -+ bfq_log_bfqq(bfqd, bfqq,
8918 -+ "WARN: pending prio change");
8919 -+ /*
8920 -+ * If too much time has elapsed from the beginning
8921 -+ * of this weight-raising period and the process is not soft
8922 -+ * real-time, stop it.
8923 -+ */
8924 -+ if (jiffies - bfqq->last_rais_start_finish >
8925 -+ bfqq->raising_cur_max_time) {
8926 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
8927 -+ bfqq->soft_rt_next_start < jiffies;
8928 -+
8929 -+ bfqq->last_rais_start_finish = jiffies;
8930 -+ if (soft_rt)
8931 -+ bfqq->raising_cur_max_time =
8932 -+ bfqd->bfq_raising_rt_max_time;
8933 -+ else {
8934 -+ bfq_log_bfqq(bfqd, bfqq,
8935 -+ "wrais ending at %llu msec,"
8936 -+ "rais_max_time %u",
8937 -+ bfqq->last_rais_start_finish,
8938 -+ jiffies_to_msecs(bfqq->
8939 -+ raising_cur_max_time));
8940 -+ bfq_bfqq_end_raising(bfqq);
8941 -+ __bfq_entity_update_weight_prio(
8942 -+ bfq_entity_service_tree(entity),
8943 -+ entity);
8944 -+ }
8945 -+ }
8946 -+ }
8947 -+}
8948 -+
8949 -+/*
8950 -+ * Dispatch one request from bfqq, moving it to the request queue
8951 -+ * dispatch list.
8952 -+ */
8953 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
8954 -+ struct bfq_queue *bfqq)
8955 -+{
8956 -+ int dispatched = 0;
8957 -+ struct request *rq;
8958 -+ unsigned long service_to_charge;
8959 -+
8960 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
8961 -+
8962 -+ /* Follow expired path, else get first next available. */
8963 -+ rq = bfq_check_fifo(bfqq);
8964 -+ if (rq == NULL)
8965 -+ rq = bfqq->next_rq;
8966 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
8967 -+
8968 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
8969 -+ /*
8970 -+ * This may happen if the next rq is chosen
8971 -+ * in fifo order instead of sector order.
8972 -+ * The budget is properly dimensioned
8973 -+ * to be always sufficient to serve the next request
8974 -+ * only if it is chosen in sector order. The reason is
8975 -+ * that it would be quite inefficient and of little use
8976 -+ * to always make sure that the budget is large enough
8977 -+ * to serve even the possible next rq in fifo order.
8978 -+ * In fact, requests are seldom served in fifo order.
8979 -+ *
8980 -+ * Expire the queue for budget exhaustion, and
8981 -+ * make sure that the next act_budget is enough
8982 -+ * to serve the next request, even if it comes
8983 -+ * from the fifo expired path.
8984 -+ */
8985 -+ bfqq->next_rq = rq;
8986 -+ /*
8987 -+ * Since this dispatch is failed, make sure that
8988 -+ * a new one will be performed
8989 -+ */
8990 -+ if (!bfqd->rq_in_driver)
8991 -+ bfq_schedule_dispatch(bfqd);
8992 -+ goto expire;
8993 -+ }
8994 -+
8995 -+ /* Finally, insert request into driver dispatch list. */
8996 -+ bfq_bfqq_served(bfqq, service_to_charge);
8997 -+ bfq_dispatch_insert(bfqd->queue, rq);
8998 -+
8999 -+ update_raising_data(bfqd, bfqq);
9000 -+
9001 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
9002 -+ "budg left %lu",
9003 -+ blk_rq_sectors(rq),
9004 -+ (long long unsigned)blk_rq_pos(rq),
9005 -+ bfq_bfqq_budget_left(bfqq));
9006 -+
9007 -+ dispatched++;
9008 -+
9009 -+ if (bfqd->active_bic == NULL) {
9010 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9011 -+ bfqd->active_bic = RQ_BIC(rq);
9012 -+ }
9013 -+
9014 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9015 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9016 -+ bfq_class_idle(bfqq)))
9017 -+ goto expire;
9018 -+
9019 -+ return dispatched;
9020 -+
9021 -+expire:
9022 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9023 -+ return dispatched;
9024 -+}
9025 -+
9026 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9027 -+{
9028 -+ int dispatched = 0;
9029 -+
9030 -+ while (bfqq->next_rq != NULL) {
9031 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9032 -+ dispatched++;
9033 -+ }
9034 -+
9035 -+ BUG_ON(!list_empty(&bfqq->fifo));
9036 -+ return dispatched;
9037 -+}
9038 -+
9039 -+/*
9040 -+ * Drain our current requests. Used for barriers and when switching
9041 -+ * io schedulers on-the-fly.
9042 -+ */
9043 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
9044 -+{
9045 -+ struct bfq_queue *bfqq, *n;
9046 -+ struct bfq_service_tree *st;
9047 -+ int dispatched = 0;
9048 -+
9049 -+ bfqq = bfqd->active_queue;
9050 -+ if (bfqq != NULL)
9051 -+ __bfq_bfqq_expire(bfqd, bfqq);
9052 -+
9053 -+ /*
9054 -+ * Loop through classes, and be careful to leave the scheduler
9055 -+ * in a consistent state, as feedback mechanisms and vtime
9056 -+ * updates cannot be disabled during the process.
9057 -+ */
9058 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9059 -+ st = bfq_entity_service_tree(&bfqq->entity);
9060 -+
9061 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9062 -+ bfqq->max_budget = bfq_max_budget(bfqd);
9063 -+
9064 -+ bfq_forget_idle(st);
9065 -+ }
9066 -+
9067 -+ BUG_ON(bfqd->busy_queues != 0);
9068 -+
9069 -+ return dispatched;
9070 -+}
9071 -+
9072 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
9073 -+{
9074 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9075 -+ struct bfq_queue *bfqq;
9076 -+ int max_dispatch;
9077 -+
9078 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9079 -+ if (bfqd->busy_queues == 0)
9080 -+ return 0;
9081 -+
9082 -+ if (unlikely(force))
9083 -+ return bfq_forced_dispatch(bfqd);
9084 -+
9085 -+ if ((bfqq = bfq_select_queue(bfqd)) == NULL)
9086 -+ return 0;
9087 -+
9088 -+ max_dispatch = bfqd->bfq_quantum;
9089 -+ if (bfq_class_idle(bfqq))
9090 -+ max_dispatch = 1;
9091 -+
9092 -+ if (!bfq_bfqq_sync(bfqq))
9093 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
9094 -+
9095 -+ if (bfqq->dispatched >= max_dispatch) {
9096 -+ if (bfqd->busy_queues > 1)
9097 -+ return 0;
9098 -+ if (bfqq->dispatched >= 4 * max_dispatch)
9099 -+ return 0;
9100 -+ }
9101 -+
9102 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
9103 -+ return 0;
9104 -+
9105 -+ bfq_clear_bfqq_wait_request(bfqq);
9106 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9107 -+
9108 -+ if (!bfq_dispatch_request(bfqd, bfqq))
9109 -+ return 0;
9110 -+
9111 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
9112 -+ "(max_disp %d)", bfqq->pid, max_dispatch);
9113 -+
9114 -+ return 1;
9115 -+}
9116 -+
9117 -+/*
9118 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
9119 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
9120 -+ *
9121 -+ * Queue lock must be held here.
9122 -+ */
9123 -+static void bfq_put_queue(struct bfq_queue *bfqq)
9124 -+{
9125 -+ struct bfq_data *bfqd = bfqq->bfqd;
9126 -+
9127 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
9128 -+
9129 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
9130 -+ atomic_read(&bfqq->ref));
9131 -+ if (!atomic_dec_and_test(&bfqq->ref))
9132 -+ return;
9133 -+
9134 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
9135 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
9136 -+ BUG_ON(bfqq->entity.tree != NULL);
9137 -+ BUG_ON(bfq_bfqq_busy(bfqq));
9138 -+ BUG_ON(bfqd->active_queue == bfqq);
9139 -+
9140 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
9141 -+
9142 -+ kmem_cache_free(bfq_pool, bfqq);
9143 -+}
9144 -+
9145 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
9146 -+{
9147 -+ struct bfq_queue *__bfqq, *next;
9148 -+
9149 -+ /*
9150 -+ * If this queue was scheduled to merge with another queue, be
9151 -+ * sure to drop the reference taken on that queue (and others in
9152 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
9153 -+ */
9154 -+ __bfqq = bfqq->new_bfqq;
9155 -+ while (__bfqq) {
9156 -+ if (__bfqq == bfqq) {
9157 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
9158 -+ break;
9159 -+ }
9160 -+ next = __bfqq->new_bfqq;
9161 -+ bfq_put_queue(__bfqq);
9162 -+ __bfqq = next;
9163 -+ }
9164 -+}
9165 -+
9166 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9167 -+{
9168 -+ if (bfqq == bfqd->active_queue) {
9169 -+ __bfq_bfqq_expire(bfqd, bfqq);
9170 -+ bfq_schedule_dispatch(bfqd);
9171 -+ }
9172 -+
9173 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
9174 -+ atomic_read(&bfqq->ref));
9175 -+
9176 -+ bfq_put_cooperator(bfqq);
9177 -+
9178 -+ bfq_put_queue(bfqq);
9179 -+}
9180 -+
9181 -+static void bfq_init_icq(struct io_cq *icq)
9182 -+{
9183 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
9184 -+
9185 -+ bic->ttime.last_end_request = jiffies;
9186 -+}
9187 -+
9188 -+static void bfq_exit_icq(struct io_cq *icq)
9189 -+{
9190 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
9191 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
9192 -+
9193 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
9194 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
9195 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
9196 -+ }
9197 -+
9198 -+ if (bic->bfqq[BLK_RW_SYNC]) {
9199 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
9200 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
9201 -+ }
9202 -+}
9203 -+
9204 -+/*
9205 -+ * Update the entity prio values; note that the new values will not
9206 -+ * be used until the next (re)activation.
9207 -+ */
9208 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
9209 -+{
9210 -+ struct task_struct *tsk = current;
9211 -+ int ioprio_class;
9212 -+
9213 -+ if (!bfq_bfqq_prio_changed(bfqq))
9214 -+ return;
9215 -+
9216 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9217 -+ switch (ioprio_class) {
9218 -+ default:
9219 -+ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
9220 -+ case IOPRIO_CLASS_NONE:
9221 -+ /*
9222 -+ * No prio set, inherit CPU scheduling settings.
9223 -+ */
9224 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
9225 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
9226 -+ break;
9227 -+ case IOPRIO_CLASS_RT:
9228 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9229 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
9230 -+ break;
9231 -+ case IOPRIO_CLASS_BE:
9232 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9233 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
9234 -+ break;
9235 -+ case IOPRIO_CLASS_IDLE:
9236 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
9237 -+ bfqq->entity.new_ioprio = 7;
9238 -+ bfq_clear_bfqq_idle_window(bfqq);
9239 -+ break;
9240 -+ }
9241 -+
9242 -+ bfqq->entity.ioprio_changed = 1;
9243 -+
9244 -+ /*
9245 -+ * Keep track of original prio settings in case we have to temporarily
9246 -+ * elevate the priority of this queue.
9247 -+ */
9248 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
9249 -+ bfq_clear_bfqq_prio_changed(bfqq);
9250 -+}
9251 -+
9252 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
9253 -+{
9254 -+ struct bfq_data *bfqd;
9255 -+ struct bfq_queue *bfqq, *new_bfqq;
9256 -+ struct bfq_group *bfqg;
9257 -+ unsigned long uninitialized_var(flags);
9258 -+ int ioprio = bic->icq.ioc->ioprio;
9259 -+
9260 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
9261 -+ /*
9262 -+ * This condition may trigger on a newly created bic; be sure to drop the
9263 -+ * lock before returning.
9264 -+ */
9265 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
9266 -+ goto out;
9267 -+
9268 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
9269 -+ if (bfqq != NULL) {
9270 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
9271 -+ sched_data);
9272 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
9273 -+ GFP_ATOMIC);
9274 -+ if (new_bfqq != NULL) {
9275 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
9276 -+ bfq_log_bfqq(bfqd, bfqq,
9277 -+ "changed_ioprio: bfqq %p %d",
9278 -+ bfqq, atomic_read(&bfqq->ref));
9279 -+ bfq_put_queue(bfqq);
9280 -+ }
9281 -+ }
9282 -+
9283 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
9284 -+ if (bfqq != NULL)
9285 -+ bfq_mark_bfqq_prio_changed(bfqq);
9286 -+
9287 -+ bic->ioprio = ioprio;
9288 -+
9289 -+out:
9290 -+ bfq_put_bfqd_unlock(bfqd, &flags);
9291 -+}
9292 -+
9293 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9294 -+ pid_t pid, int is_sync)
9295 -+{
9296 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
9297 -+ INIT_LIST_HEAD(&bfqq->fifo);
9298 -+
9299 -+ atomic_set(&bfqq->ref, 0);
9300 -+ bfqq->bfqd = bfqd;
9301 -+
9302 -+ bfq_mark_bfqq_prio_changed(bfqq);
9303 -+
9304 -+ if (is_sync) {
9305 -+ if (!bfq_class_idle(bfqq))
9306 -+ bfq_mark_bfqq_idle_window(bfqq);
9307 -+ bfq_mark_bfqq_sync(bfqq);
9308 -+ }
9309 -+
9310 -+ /* Tentative initial value to trade off between thr and lat */
9311 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
9312 -+ bfqq->pid = pid;
9313 -+
9314 -+ bfqq->raising_coeff = 1;
9315 -+ bfqq->last_rais_start_finish = 0;
9316 -+ bfqq->soft_rt_next_start = -1;
9317 -+}
9318 -+
9319 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
9320 -+ struct bfq_group *bfqg,
9321 -+ int is_sync,
9322 -+ struct bfq_io_cq *bic,
9323 -+ gfp_t gfp_mask)
9324 -+{
9325 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
9326 -+
9327 -+retry:
9328 -+ /* bic always exists here */
9329 -+ bfqq = bic_to_bfqq(bic, is_sync);
9330 -+
9331 -+ /*
9332 -+ * Always try a new alloc if we fall back to the OOM bfqq
9333 -+ * originally, since it should just be a temporary situation.
9334 -+ */
9335 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9336 -+ bfqq = NULL;
9337 -+ if (new_bfqq != NULL) {
9338 -+ bfqq = new_bfqq;
9339 -+ new_bfqq = NULL;
9340 -+ } else if (gfp_mask & __GFP_WAIT) {
9341 -+ spin_unlock_irq(bfqd->queue->queue_lock);
9342 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
9343 -+ gfp_mask | __GFP_ZERO,
9344 -+ bfqd->queue->node);
9345 -+ spin_lock_irq(bfqd->queue->queue_lock);
9346 -+ if (new_bfqq != NULL)
9347 -+ goto retry;
9348 -+ } else {
9349 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
9350 -+ gfp_mask | __GFP_ZERO,
9351 -+ bfqd->queue->node);
9352 -+ }
9353 -+
9354 -+ if (bfqq != NULL) {
9355 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
9356 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
9357 -+ } else {
9358 -+ bfqq = &bfqd->oom_bfqq;
9359 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
9360 -+ }
9361 -+
9362 -+ bfq_init_prio_data(bfqq, bic);
9363 -+ bfq_init_entity(&bfqq->entity, bfqg);
9364 -+ }
9365 -+
9366 -+ if (new_bfqq != NULL)
9367 -+ kmem_cache_free(bfq_pool, new_bfqq);
9368 -+
9369 -+ return bfqq;
9370 -+}
9371 -+
9372 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
9373 -+ struct bfq_group *bfqg,
9374 -+ int ioprio_class, int ioprio)
9375 -+{
9376 -+ switch (ioprio_class) {
9377 -+ case IOPRIO_CLASS_RT:
9378 -+ return &bfqg->async_bfqq[0][ioprio];
9379 -+ case IOPRIO_CLASS_NONE:
9380 -+ ioprio = IOPRIO_NORM;
9381 -+ /* fall through */
9382 -+ case IOPRIO_CLASS_BE:
9383 -+ return &bfqg->async_bfqq[1][ioprio];
9384 -+ case IOPRIO_CLASS_IDLE:
9385 -+ return &bfqg->async_idle_bfqq;
9386 -+ default:
9387 -+ BUG();
9388 -+ }
9389 -+}
9390 -+
9391 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
9392 -+ struct bfq_group *bfqg, int is_sync,
9393 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
9394 -+{
9395 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9396 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9397 -+ struct bfq_queue **async_bfqq = NULL;
9398 -+ struct bfq_queue *bfqq = NULL;
9399 -+
9400 -+ if (!is_sync) {
9401 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
9402 -+ ioprio);
9403 -+ bfqq = *async_bfqq;
9404 -+ }
9405 -+
9406 -+ if (bfqq == NULL)
9407 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9408 -+
9409 -+ /*
9410 -+ * Pin the queue now that it's allocated; scheduler exit will prune it.
9411 -+ */
9412 -+ if (!is_sync && *async_bfqq == NULL) {
9413 -+ atomic_inc(&bfqq->ref);
9414 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
9415 -+ bfqq, atomic_read(&bfqq->ref));
9416 -+ *async_bfqq = bfqq;
9417 -+ }
9418 -+
9419 -+ atomic_inc(&bfqq->ref);
9420 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
9421 -+ atomic_read(&bfqq->ref));
9422 -+ return bfqq;
9423 -+}
9424 -+
9425 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
9426 -+ struct bfq_io_cq *bic)
9427 -+{
9428 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
9429 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
9430 -+
9431 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
9432 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
9433 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
9434 -+}
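The think-time statistics above use the same fixed-point idiom as the other BFQ averages: the sample count and the total are kept scaled by 256 and each observation is folded in with weight 1/8, so the mean only becomes meaningful after a few samples (which is why callers gate on bfq_sample_valid()). A standalone trace of the update, for reference only:

#include <stdio.h>

struct ttime_sketch {
	unsigned long samples;	/* scaled counter, converges toward 256 */
	unsigned long total;	/* exponentially weighted total, scaled by 256 */
	unsigned long mean;
};

/* Same arithmetic as bfq_update_io_thinktime(), outside the kernel. */
static void update_thinktime(struct ttime_sketch *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * ttime) / 8;
	t->mean    = (t->total + 128) / t->samples;	/* +128 rounds at the 256 scale */
}

int main(void)
{
	struct ttime_sketch t = { 0, 0, 0 };
	unsigned long observed[] = { 4, 4, 12, 4, 4 };	/* think times, in jiffies */

	/* Early means are biased until the scaled sample count warms up. */
	for (unsigned int i = 0; i < sizeof(observed) / sizeof(observed[0]); i++) {
		update_thinktime(&t, observed[i]);
		printf("ttime=%lu -> mean=%lu\n", observed[i], t.mean);
	}
	return 0;
}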
9435 -+
9436 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
9437 -+ struct bfq_queue *bfqq,
9438 -+ struct request *rq)
9439 -+{
9440 -+ sector_t sdist;
9441 -+ u64 total;
9442 -+
9443 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
9444 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
9445 -+ else
9446 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
9447 -+
9448 -+ /*
9449 -+ * Don't allow the seek distance to get too large from the
9450 -+ * odd fragment, pagein, etc.
9451 -+ */
9452 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
9453 -+ sdist = 0;
9454 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
9455 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
9456 -+ else
9457 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
9458 -+
9459 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
9460 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
9461 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
9462 -+ do_div(total, bfqq->seek_samples);
9463 -+ if (bfq_bfqq_coop(bfqq)) {
9464 -+ /*
9465 -+ * If the mean seektime increases for a (non-seeky) shared
9466 -+ * queue, some cooperator is likely to be idling too much.
9467 -+ * On the contrary, if it decreases, some cooperator has
9468 -+ * probably woken up.
9469 -+ *
9470 -+ */
9471 -+ if ((sector_t)total < bfqq->seek_mean)
9472 -+ bfq_mark_bfqq_some_coop_idle(bfqq);
9473 -+ else if ((sector_t)total > bfqq->seek_mean)
9474 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
9475 -+ }
9476 -+ bfqq->seek_mean = (sector_t)total;
9477 -+
9478 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
9479 -+ (u64)bfqq->seek_mean);
9480 -+}
9481 -+
9482 -+/*
9483 -+ * Disable idle window if the process thinks too long or seeks so much that
9484 -+ * it doesn't matter.
9485 -+ */
9486 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
9487 -+ struct bfq_queue *bfqq,
9488 -+ struct bfq_io_cq *bic)
9489 -+{
9490 -+ int enable_idle;
9491 -+
9492 -+ /* Don't idle for async or idle io prio class. */
9493 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
9494 -+ return;
9495 -+
9496 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
9497 -+
9498 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
9499 -+ bfqd->bfq_slice_idle == 0 ||
9500 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
9501 -+ bfqq->raising_coeff == 1))
9502 -+ enable_idle = 0;
9503 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
9504 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
9505 -+ bfqq->raising_coeff == 1)
9506 -+ enable_idle = 0;
9507 -+ else
9508 -+ enable_idle = 1;
9509 -+ }
9510 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
9511 -+ enable_idle);
9512 -+
9513 -+ if (enable_idle)
9514 -+ bfq_mark_bfqq_idle_window(bfqq);
9515 -+ else
9516 -+ bfq_clear_bfqq_idle_window(bfqq);
9517 -+}
9518 -+
9519 -+/*
9520 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
9521 -+ * something we should do about it.
9522 -+ */
9523 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9524 -+ struct request *rq)
9525 -+{
9526 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
9527 -+
9528 -+ if (rq->cmd_flags & REQ_META)
9529 -+ bfqq->meta_pending++;
9530 -+
9531 -+ bfq_update_io_thinktime(bfqd, bic);
9532 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
9533 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
9534 -+ !BFQQ_SEEKY(bfqq))
9535 -+ bfq_update_idle_window(bfqd, bfqq, bic);
9536 -+
9537 -+ bfq_log_bfqq(bfqd, bfqq,
9538 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
9539 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
9540 -+ (long long unsigned)bfqq->seek_mean);
9541 -+
9542 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
9543 -+
9544 -+ if (bfqq == bfqd->active_queue) {
9545 -+ /*
9546 -+ * If there is just this request queued and the request
9547 -+ * is small, just exit.
9548 -+ * In this way, if the disk is being idled to wait for a new
9549 -+ * request from the active queue, we avoid unplugging the
9550 -+ * device now.
9551 -+ *
9552 -+ * By doing so, we avoid committing the disk to serving
9553 -+ * just a small request. On the contrary, we wait for
9554 -+ * the block layer to decide when to unplug the device:
9555 -+ * hopefully, new requests will be merged to this
9556 -+ * one quickly, then the device will be unplugged
9557 -+ * and larger requests will be dispatched.
9558 -+ */
9559 -+ if (bfqq->queued[rq_is_sync(rq)] == 1 &&
9560 -+ blk_rq_sectors(rq) < 32) {
9561 -+ return;
9562 -+ }
9563 -+ if (bfq_bfqq_wait_request(bfqq)) {
9564 -+ /*
9565 -+ * If we are waiting for a request for this queue, let
9566 -+ * it rip immediately and flag that we must not expire
9567 -+ * this queue just now.
9568 -+ */
9569 -+ bfq_clear_bfqq_wait_request(bfqq);
9570 -+ del_timer(&bfqd->idle_slice_timer);
9571 -+ /*
9572 -+ * Here we can safely expire the queue, in
9573 -+ * case of budget timeout, without wasting
9574 -+ * guarantees
9575 -+ */
9576 -+ if (bfq_bfqq_budget_timeout(bfqq))
9577 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
9578 -+ BFQ_BFQQ_BUDGET_TIMEOUT);
9579 -+ __blk_run_queue(bfqd->queue);
9580 -+ }
9581 -+ }
9582 -+}
9583 -+
9584 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
9585 -+{
9586 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9587 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9588 -+
9589 -+ assert_spin_locked(bfqd->queue->queue_lock);
9590 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
9591 -+
9592 -+ bfq_add_rq_rb(rq);
9593 -+
9594 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
9595 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
9596 -+
9597 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
9598 -+}
9599 -+
9600 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
9601 -+{
9602 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
9603 -+ bfqd->rq_in_driver);
9604 -+
9605 -+ if (bfqd->hw_tag == 1)
9606 -+ return;
9607 -+
9608 -+ /*
9609 -+ * This sample is valid if the number of outstanding requests
9610 -+ * is large enough to allow a queueing behavior. Note that the
9611 -+ * sum is not exact, as it's not taking into account deactivated
9612 -+ * requests.
9613 -+ */
9614 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
9615 -+ return;
9616 -+
9617 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
9618 -+ return;
9619 -+
9620 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
9621 -+ bfqd->max_rq_in_driver = 0;
9622 -+ bfqd->hw_tag_samples = 0;
9623 -+}
9624 -+
9625 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
9626 -+{
9627 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9628 -+ struct bfq_data *bfqd = bfqq->bfqd;
9629 -+ const int sync = rq_is_sync(rq);
9630 -+
9631 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
9632 -+ blk_rq_sectors(rq), sync);
9633 -+
9634 -+ bfq_update_hw_tag(bfqd);
9635 -+
9636 -+ WARN_ON(!bfqd->rq_in_driver);
9637 -+ WARN_ON(!bfqq->dispatched);
9638 -+ bfqd->rq_in_driver--;
9639 -+ bfqq->dispatched--;
9640 -+
9641 -+ if (bfq_bfqq_sync(bfqq))
9642 -+ bfqd->sync_flight--;
9643 -+
9644 -+ if (sync)
9645 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
9646 -+
9647 -+ /*
9648 -+ * If this is the active queue, check if it needs to be expired,
9649 -+ * or if we want to idle in case it has no pending requests.
9650 -+ */
9651 -+ if (bfqd->active_queue == bfqq) {
9652 -+ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
9653 -+ if (bfq_bfqq_budget_new(bfqq))
9654 -+ bfq_set_budget_timeout(bfqd);
9655 -+
9656 -+ /* Idling is disabled also for cooperation issues:
9657 -+ * 1) there is a close cooperator for the queue, or
9658 -+ * 2) the queue is shared and some cooperator is likely
9659 -+ * to be idle (in this case, by not arming the idle timer,
9660 -+ * we try to slow down the queue, to prevent the zones
9661 -+ * of the disk accessed by the active cooperators from becoming
9662 -+ * too distant from the zone that will be accessed by the
9663 -+ * currently idle cooperators)
9664 -+ */
9665 -+ if (bfq_bfqq_must_idle(bfqq, budg_timeout))
9666 -+ bfq_arm_slice_timer(bfqd);
9667 -+ else if (budg_timeout)
9668 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9669 -+ }
9670 -+
9671 -+ if (!bfqd->rq_in_driver)
9672 -+ bfq_schedule_dispatch(bfqd);
9673 -+}
9674 -+
9675 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
9676 -+{
9677 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
9678 -+ bfq_clear_bfqq_must_alloc(bfqq);
9679 -+ return ELV_MQUEUE_MUST;
9680 -+ }
9681 -+
9682 -+ return ELV_MQUEUE_MAY;
9683 -+}
9684 -+
9685 -+static int bfq_may_queue(struct request_queue *q, int rw)
9686 -+{
9687 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9688 -+ struct task_struct *tsk = current;
9689 -+ struct bfq_io_cq *bic;
9690 -+ struct bfq_queue *bfqq;
9691 -+
9692 -+ /*
9693 -+ * Don't force setup of a queue from here, as a call to may_queue
9694 -+ * does not necessarily imply that a request actually will be queued.
9695 -+ * So just lookup a possibly existing queue, or return 'may queue'
9696 -+ * if that fails.
9697 -+ */
9698 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
9699 -+ if (bic == NULL)
9700 -+ return ELV_MQUEUE_MAY;
9701 -+
9702 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
9703 -+ if (bfqq != NULL) {
9704 -+ bfq_init_prio_data(bfqq, bic);
9705 -+
9706 -+ return __bfq_may_queue(bfqq);
9707 -+ }
9708 -+
9709 -+ return ELV_MQUEUE_MAY;
9710 -+}
9711 -+
9712 -+/*
9713 -+ * Queue lock held here.
9714 -+ */
9715 -+static void bfq_put_request(struct request *rq)
9716 -+{
9717 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9718 -+
9719 -+ if (bfqq != NULL) {
9720 -+ const int rw = rq_data_dir(rq);
9721 -+
9722 -+ BUG_ON(!bfqq->allocated[rw]);
9723 -+ bfqq->allocated[rw]--;
9724 -+
9725 -+ rq->elv.priv[0] = NULL;
9726 -+ rq->elv.priv[1] = NULL;
9727 -+
9728 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
9729 -+ bfqq, atomic_read(&bfqq->ref));
9730 -+ bfq_put_queue(bfqq);
9731 -+ }
9732 -+}
9733 -+
9734 -+static struct bfq_queue *
9735 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
9736 -+ struct bfq_queue *bfqq)
9737 -+{
9738 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
9739 -+ (long unsigned)bfqq->new_bfqq->pid);
9740 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
9741 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
9742 -+ bfq_put_queue(bfqq);
9743 -+ return bic_to_bfqq(bic, 1);
9744 -+}
9745 -+
9746 -+/*
9747 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
9748 -+ * was the last process referring to said bfqq.
9749 -+ */
9750 -+static struct bfq_queue *
9751 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
9752 -+{
9753 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
9754 -+ if (bfqq_process_refs(bfqq) == 1) {
9755 -+ bfqq->pid = current->pid;
9756 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
9757 -+ bfq_clear_bfqq_coop(bfqq);
9758 -+ bfq_clear_bfqq_split_coop(bfqq);
9759 -+ return bfqq;
9760 -+ }
9761 -+
9762 -+ bic_set_bfqq(bic, NULL, 1);
9763 -+
9764 -+ bfq_put_cooperator(bfqq);
9765 -+
9766 -+ bfq_put_queue(bfqq);
9767 -+ return NULL;
9768 -+}
9769 -+
9770 -+/*
9771 -+ * Allocate bfq data structures associated with this request.
9772 -+ */
9773 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
9774 -+ struct bio *bio, gfp_t gfp_mask)
9775 -+{
9776 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9777 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
9778 -+ const int rw = rq_data_dir(rq);
9779 -+ const int is_sync = rq_is_sync(rq);
9780 -+ struct bfq_queue *bfqq;
9781 -+ struct bfq_group *bfqg;
9782 -+ unsigned long flags;
9783 -+
9784 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
9785 -+
9786 -+ bfq_changed_ioprio(bic);
9787 -+
9788 -+ spin_lock_irqsave(q->queue_lock, flags);
9789 -+
9790 -+ if (bic == NULL)
9791 -+ goto queue_fail;
9792 -+
9793 -+ bfqg = bfq_bic_update_cgroup(bic);
9794 -+
9795 -+new_queue:
9796 -+ bfqq = bic_to_bfqq(bic, is_sync);
9797 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9798 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9799 -+ bic_set_bfqq(bic, bfqq, is_sync);
9800 -+ } else {
9801 -+ /*
9802 -+ * If the queue was seeky for too long, break it apart.
9803 -+ */
9804 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
9805 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
9806 -+ bfqq = bfq_split_bfqq(bic, bfqq);
9807 -+ if (!bfqq)
9808 -+ goto new_queue;
9809 -+ }
9810 -+
9811 -+ /*
9812 -+ * Check to see if this queue is scheduled to merge with
9813 -+ * another closely cooperating queue. The merging of queues
9814 -+ * happens here as it must be done in process context.
9815 -+ * The reference on new_bfqq was taken in merge_bfqqs.
9816 -+ */
9817 -+ if (bfqq->new_bfqq != NULL)
9818 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
9819 -+ }
9820 -+
9821 -+ bfqq->allocated[rw]++;
9822 -+ atomic_inc(&bfqq->ref);
9823 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
9824 -+ atomic_read(&bfqq->ref));
9825 -+
9826 -+ rq->elv.priv[0] = bic;
9827 -+ rq->elv.priv[1] = bfqq;
9828 -+
9829 -+ spin_unlock_irqrestore(q->queue_lock, flags);
9830 -+
9831 -+ return 0;
9832 -+
9833 -+queue_fail:
9834 -+ bfq_schedule_dispatch(bfqd);
9835 -+ spin_unlock_irqrestore(q->queue_lock, flags);
9836 -+
9837 -+ return 1;
9838 -+}
9839 -+
9840 -+static void bfq_kick_queue(struct work_struct *work)
9841 -+{
9842 -+ struct bfq_data *bfqd =
9843 -+ container_of(work, struct bfq_data, unplug_work);
9844 -+ struct request_queue *q = bfqd->queue;
9845 -+
9846 -+ spin_lock_irq(q->queue_lock);
9847 -+ __blk_run_queue(q);
9848 -+ spin_unlock_irq(q->queue_lock);
9849 -+}
9850 -+
9851 -+/*
9852 -+ * Handler of the expiration of the timer running if the active_queue
9853 -+ * is idling inside its time slice.
9854 -+ */
9855 -+static void bfq_idle_slice_timer(unsigned long data)
9856 -+{
9857 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
9858 -+ struct bfq_queue *bfqq;
9859 -+ unsigned long flags;
9860 -+ enum bfqq_expiration reason;
9861 -+
9862 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
9863 -+
9864 -+ bfqq = bfqd->active_queue;
9865 -+ /*
9866 -+ * Theoretical race here: active_queue can be NULL or different
9867 -+ * from the queue that was idling if the timer handler spins on
9868 -+ * the queue_lock and a new request arrives for the current
9869 -+ * queue and there is a full dispatch cycle that changes the
9870 -+ * active_queue. This can hardly happen, but in the worst case
9871 -+ * we just expire a queue too early.
9872 -+ */
9873 -+ if (bfqq != NULL) {
9874 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
9875 -+ if (bfq_bfqq_budget_timeout(bfqq))
9876 -+ /*
9877 -+ * Also here the queue can be safely expired
9878 -+ * for budget timeout without wasting
9879 -+ * guarantees
9880 -+ */
9881 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9882 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
9883 -+ /*
9884 -+ * The queue may not be empty upon timer expiration,
9885 -+ * because we may not disable the timer when the first
9886 -+ * request of the active queue arrives during
9887 -+ * disk idling
9888 -+ */
9889 -+ reason = BFQ_BFQQ_TOO_IDLE;
9890 -+ else
9891 -+ goto schedule_dispatch;
9892 -+
9893 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
9894 -+ }
9895 -+
9896 -+schedule_dispatch:
9897 -+ bfq_schedule_dispatch(bfqd);
9898 -+
9899 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
9900 -+}
9901 -+
9902 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
9903 -+{
9904 -+ del_timer_sync(&bfqd->idle_slice_timer);
9905 -+ cancel_work_sync(&bfqd->unplug_work);
9906 -+}
9907 -+
9908 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
9909 -+ struct bfq_queue **bfqq_ptr)
9910 -+{
9911 -+ struct bfq_group *root_group = bfqd->root_group;
9912 -+ struct bfq_queue *bfqq = *bfqq_ptr;
9913 -+
9914 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
9915 -+ if (bfqq != NULL) {
9916 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
9917 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
9918 -+ bfqq, atomic_read(&bfqq->ref));
9919 -+ bfq_put_queue(bfqq);
9920 -+ *bfqq_ptr = NULL;
9921 -+ }
9922 -+}
9923 -+
9924 -+/*
9925 -+ * Release all the bfqg references to its async queues. If we are
9926 -+ * deallocating the group these queues may still contain requests, so
9927 -+ * we reparent them to the root cgroup (i.e., the only one that will
9928 -+ * exist for sure until all the requests on a device are gone).
9929 -+ */
9930 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
9931 -+{
9932 -+ int i, j;
9933 -+
9934 -+ for (i = 0; i < 2; i++)
9935 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
9936 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
9937 -+
9938 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
9939 -+}
9940 -+
9941 -+static void bfq_exit_queue(struct elevator_queue *e)
9942 -+{
9943 -+ struct bfq_data *bfqd = e->elevator_data;
9944 -+ struct request_queue *q = bfqd->queue;
9945 -+ struct bfq_queue *bfqq, *n;
9946 -+
9947 -+ bfq_shutdown_timer_wq(bfqd);
9948 -+
9949 -+ spin_lock_irq(q->queue_lock);
9950 -+
9951 -+ BUG_ON(bfqd->active_queue != NULL);
9952 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
9953 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
9954 -+
9955 -+ bfq_disconnect_groups(bfqd);
9956 -+ spin_unlock_irq(q->queue_lock);
9957 -+
9958 -+ bfq_shutdown_timer_wq(bfqd);
9959 -+
9960 -+ synchronize_rcu();
9961 -+
9962 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9963 -+
9964 -+ bfq_free_root_group(bfqd);
9965 -+ kfree(bfqd);
9966 -+}
9967 -+
9968 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
9969 -+{
9970 -+ struct bfq_group *bfqg;
9971 -+ struct bfq_data *bfqd;
9972 -+ struct elevator_queue *eq;
9973 -+
9974 -+ eq = elevator_alloc(q, e);
9975 -+ if (eq == NULL)
9976 -+ return -ENOMEM;
9977 -+
9978 -+ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
9979 -+ if (bfqd == NULL) {
9980 -+ kobject_put(&eq->kobj);
9981 -+ return -ENOMEM;
9982 -+ }
9983 -+ eq->elevator_data = bfqd;
9984 -+
9985 -+ /*
9986 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
9987 -+ * Grab a permanent reference to it, so that the normal code flow
9988 -+ * will not attempt to free it.
9989 -+ */
9990 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
9991 -+ atomic_inc(&bfqd->oom_bfqq.ref);
9992 -+
9993 -+ bfqd->queue = q;
9994 -+
9995 -+ spin_lock_irq(q->queue_lock);
9996 -+ q->elevator = eq;
9997 -+ spin_unlock_irq(q->queue_lock);
9998 -+
9999 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
10000 -+ if (bfqg == NULL) {
10001 -+ kfree(bfqd);
10002 -+ kobject_put(&eq->kobj);
10003 -+ return -ENOMEM;
10004 -+ }
10005 -+
10006 -+ bfqd->root_group = bfqg;
10007 -+
10008 -+ init_timer(&bfqd->idle_slice_timer);
10009 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10010 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10011 -+
10012 -+ bfqd->rq_pos_tree = RB_ROOT;
10013 -+
10014 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10015 -+
10016 -+ INIT_LIST_HEAD(&bfqd->active_list);
10017 -+ INIT_LIST_HEAD(&bfqd->idle_list);
10018 -+
10019 -+ bfqd->hw_tag = -1;
10020 -+
10021 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
10022 -+
10023 -+ bfqd->bfq_quantum = bfq_quantum;
10024 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10025 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10026 -+ bfqd->bfq_back_max = bfq_back_max;
10027 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
10028 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
10029 -+ bfqd->bfq_class_idle_last_service = 0;
10030 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10031 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10032 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10033 -+
10034 -+ bfqd->low_latency = true;
10035 -+
10036 -+ bfqd->bfq_raising_coeff = 20;
10037 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10038 -+ bfqd->bfq_raising_max_time = 0;
10039 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10040 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10041 -+ bfqd->bfq_raising_max_softrt_rate = 7000;
10042 -+
10043 -+ /* Initially estimate the device's peak rate as the reference rate */
10044 -+ if (blk_queue_nonrot(bfqd->queue)) {
10045 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
10046 -+ bfqd->peak_rate = R_nonrot;
10047 -+ } else {
10048 -+ bfqd->RT_prod = R_rot * T_rot;
10049 -+ bfqd->peak_rate = R_rot;
10050 -+ }
10051 -+
10052 -+ return 0;
10053 -+}
10054 -+
10055 -+static void bfq_slab_kill(void)
10056 -+{
10057 -+ if (bfq_pool != NULL)
10058 -+ kmem_cache_destroy(bfq_pool);
10059 -+}
10060 -+
10061 -+static int __init bfq_slab_setup(void)
10062 -+{
10063 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10064 -+ if (bfq_pool == NULL)
10065 -+ return -ENOMEM;
10066 -+ return 0;
10067 -+}
10068 -+
10069 -+static ssize_t bfq_var_show(unsigned int var, char *page)
10070 -+{
10071 -+ return sprintf(page, "%d\n", var);
10072 -+}
10073 -+
10074 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10075 -+{
10076 -+ unsigned long new_val;
10077 -+ int ret = strict_strtoul(page, 10, &new_val);
10078 -+
10079 -+ if (ret == 0)
10080 -+ *var = new_val;
10081 -+
10082 -+ return count;
10083 -+}
10084 -+
10085 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10086 -+{
10087 -+ struct bfq_data *bfqd = e->elevator_data;
10088 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10089 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10090 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10091 -+}
10092 -+
10093 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10094 -+{
10095 -+ struct bfq_queue *bfqq;
10096 -+ struct bfq_data *bfqd = e->elevator_data;
10097 -+ ssize_t num_char = 0;
10098 -+
10099 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
10100 -+ bfqd->queued);
10101 -+
10102 -+ spin_lock_irq(bfqd->queue->queue_lock);
10103 -+
10104 -+ num_char += sprintf(page + num_char, "Active:\n");
10105 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
10106 -+ num_char += sprintf(page + num_char,
10107 -+ "pid%d: weight %hu, nr_queued %d %d,"
10108 -+ " dur %d/%u\n",
10109 -+ bfqq->pid,
10110 -+ bfqq->entity.weight,
10111 -+ bfqq->queued[0],
10112 -+ bfqq->queued[1],
10113 -+ jiffies_to_msecs(jiffies -
10114 -+ bfqq->last_rais_start_finish),
10115 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
10116 -+ }
10117 -+
10118 -+ num_char += sprintf(page + num_char, "Idle:\n");
10119 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
10120 -+ num_char += sprintf(page + num_char,
10121 -+ "pid%d: weight %hu, dur %d/%u\n",
10122 -+ bfqq->pid,
10123 -+ bfqq->entity.weight,
10124 -+ jiffies_to_msecs(jiffies -
10125 -+ bfqq->last_rais_start_finish),
10126 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
10127 -+ }
10128 -+
10129 -+ spin_unlock_irq(bfqd->queue->queue_lock);
10130 -+
10131 -+ return num_char;
10132 -+}
10133 -+
10134 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
10135 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
10136 -+{ \
10137 -+ struct bfq_data *bfqd = e->elevator_data; \
10138 -+ unsigned int __data = __VAR; \
10139 -+ if (__CONV) \
10140 -+ __data = jiffies_to_msecs(__data); \
10141 -+ return bfq_var_show(__data, (page)); \
10142 -+}
10143 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
10144 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
10145 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
10146 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
10147 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
10148 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
10149 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
10150 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
10151 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
10152 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
10153 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
10154 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
10155 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
10156 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
10157 -+ 1);
10158 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
10159 -+ bfqd->bfq_raising_min_inter_arr_async,
10160 -+ 1);
10161 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
10162 -+ bfqd->bfq_raising_max_softrt_rate, 0);
10163 -+#undef SHOW_FUNCTION
10164 -+
10165 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
10166 -+static ssize_t \
10167 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
10168 -+{ \
10169 -+ struct bfq_data *bfqd = e->elevator_data; \
10170 -+ unsigned long uninitialized_var(__data); \
10171 -+ int ret = bfq_var_store(&__data, (page), count); \
10172 -+ if (__data < (MIN)) \
10173 -+ __data = (MIN); \
10174 -+ else if (__data > (MAX)) \
10175 -+ __data = (MAX); \
10176 -+ if (__CONV) \
10177 -+ *(__PTR) = msecs_to_jiffies(__data); \
10178 -+ else \
10179 -+ *(__PTR) = __data; \
10180 -+ return ret; \
10181 -+}
10182 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
10183 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
10184 -+ INT_MAX, 1);
10185 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
10186 -+ INT_MAX, 1);
10187 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
10188 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
10189 -+ INT_MAX, 0);
10190 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
10191 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
10192 -+ 1, INT_MAX, 0);
10193 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
10194 -+ INT_MAX, 1);
10195 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
10196 -+ INT_MAX, 0);
10197 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
10198 -+ INT_MAX, 1);
10199 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
10200 -+ INT_MAX, 1);
10201 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
10202 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
10203 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
10204 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
10205 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
10206 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
10207 -+#undef STORE_FUNCTION
10208 -+
10209 -+/* do nothing for the moment */
10210 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
10211 -+ const char *page, size_t count)
10212 -+{
10213 -+ return count;
10214 -+}
10215 -+
10216 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
10217 -+{
10218 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
10219 -+
10220 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
10221 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
10222 -+ else
10223 -+ return bfq_default_max_budget;
10224 -+}
10225 -+
10226 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
10227 -+ const char *page, size_t count)
10228 -+{
10229 -+ struct bfq_data *bfqd = e->elevator_data;
10230 -+ unsigned long uninitialized_var(__data);
10231 -+ int ret = bfq_var_store(&__data, (page), count);
10232 -+
10233 -+ if (__data == 0)
10234 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10235 -+ else {
10236 -+ if (__data > INT_MAX)
10237 -+ __data = INT_MAX;
10238 -+ bfqd->bfq_max_budget = __data;
10239 -+ }
10240 -+
10241 -+ bfqd->bfq_user_max_budget = __data;
10242 -+
10243 -+ return ret;
10244 -+}
10245 -+
10246 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
10247 -+ const char *page, size_t count)
10248 -+{
10249 -+ struct bfq_data *bfqd = e->elevator_data;
10250 -+ unsigned long uninitialized_var(__data);
10251 -+ int ret = bfq_var_store(&__data, (page), count);
10252 -+
10253 -+ if (__data < 1)
10254 -+ __data = 1;
10255 -+ else if (__data > INT_MAX)
10256 -+ __data = INT_MAX;
10257 -+
10258 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
10259 -+ if (bfqd->bfq_user_max_budget == 0)
10260 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10261 -+
10262 -+ return ret;
10263 -+}
10264 -+
10265 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
10266 -+ const char *page, size_t count)
10267 -+{
10268 -+ struct bfq_data *bfqd = e->elevator_data;
10269 -+ unsigned long uninitialized_var(__data);
10270 -+ int ret = bfq_var_store(&__data, (page), count);
10271 -+
10272 -+ if (__data > 1)
10273 -+ __data = 1;
10274 -+ if (__data == 0 && bfqd->low_latency != 0)
10275 -+ bfq_end_raising(bfqd);
10276 -+ bfqd->low_latency = __data;
10277 -+
10278 -+ return ret;
10279 -+}
10280 -+
10281 -+#define BFQ_ATTR(name) \
10282 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
10283 -+
10284 -+static struct elv_fs_entry bfq_attrs[] = {
10285 -+ BFQ_ATTR(quantum),
10286 -+ BFQ_ATTR(fifo_expire_sync),
10287 -+ BFQ_ATTR(fifo_expire_async),
10288 -+ BFQ_ATTR(back_seek_max),
10289 -+ BFQ_ATTR(back_seek_penalty),
10290 -+ BFQ_ATTR(slice_idle),
10291 -+ BFQ_ATTR(max_budget),
10292 -+ BFQ_ATTR(max_budget_async_rq),
10293 -+ BFQ_ATTR(timeout_sync),
10294 -+ BFQ_ATTR(timeout_async),
10295 -+ BFQ_ATTR(low_latency),
10296 -+ BFQ_ATTR(raising_coeff),
10297 -+ BFQ_ATTR(raising_max_time),
10298 -+ BFQ_ATTR(raising_rt_max_time),
10299 -+ BFQ_ATTR(raising_min_idle_time),
10300 -+ BFQ_ATTR(raising_min_inter_arr_async),
10301 -+ BFQ_ATTR(raising_max_softrt_rate),
10302 -+ BFQ_ATTR(weights),
10303 -+ __ATTR_NULL
10304 -+};
10305 -+
10306 -+static struct elevator_type iosched_bfq = {
10307 -+ .ops = {
10308 -+ .elevator_merge_fn = bfq_merge,
10309 -+ .elevator_merged_fn = bfq_merged_request,
10310 -+ .elevator_merge_req_fn = bfq_merged_requests,
10311 -+ .elevator_allow_merge_fn = bfq_allow_merge,
10312 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
10313 -+ .elevator_add_req_fn = bfq_insert_request,
10314 -+ .elevator_activate_req_fn = bfq_activate_request,
10315 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
10316 -+ .elevator_completed_req_fn = bfq_completed_request,
10317 -+ .elevator_former_req_fn = elv_rb_former_request,
10318 -+ .elevator_latter_req_fn = elv_rb_latter_request,
10319 -+ .elevator_init_icq_fn = bfq_init_icq,
10320 -+ .elevator_exit_icq_fn = bfq_exit_icq,
10321 -+ .elevator_set_req_fn = bfq_set_request,
10322 -+ .elevator_put_req_fn = bfq_put_request,
10323 -+ .elevator_may_queue_fn = bfq_may_queue,
10324 -+ .elevator_init_fn = bfq_init_queue,
10325 -+ .elevator_exit_fn = bfq_exit_queue,
10326 -+ },
10327 -+ .icq_size = sizeof(struct bfq_io_cq),
10328 -+ .icq_align = __alignof__(struct bfq_io_cq),
10329 -+ .elevator_attrs = bfq_attrs,
10330 -+ .elevator_name = "bfq",
10331 -+ .elevator_owner = THIS_MODULE,
10332 -+};
10333 -+
10334 -+static int __init bfq_init(void)
10335 -+{
10336 -+ /*
10337 -+ * Can be 0 on HZ < 1000 setups.
10338 -+ */
10339 -+ if (bfq_slice_idle == 0)
10340 -+ bfq_slice_idle = 1;
10341 -+
10342 -+ if (bfq_timeout_async == 0)
10343 -+ bfq_timeout_async = 1;
10344 -+
10345 -+ if (bfq_slab_setup())
10346 -+ return -ENOMEM;
10347 -+
10348 -+ elv_register(&iosched_bfq);
10349 -+
10350 -+ return 0;
10351 -+}
10352 -+
10353 -+static void __exit bfq_exit(void)
10354 -+{
10355 -+ elv_unregister(&iosched_bfq);
10356 -+ bfq_slab_kill();
10357 -+}
10358 -+
10359 -+module_init(bfq_init);
10360 -+module_exit(bfq_exit);
10361 -+
10362 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
10363 -+MODULE_LICENSE("GPL");
10364 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
10365 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
10366 -new file mode 100644
10367 -index 0000000..03f8061
10368 ---- /dev/null
10369 -+++ b/block/bfq-sched.c
10370 -@@ -0,0 +1,1072 @@
10371 -+/*
10372 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
10373 -+ *
10374 -+ * Based on ideas and code from CFQ:
10375 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
10376 -+ *
10377 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
10378 -+ * Paolo Valente <paolo.valente@×××××××.it>
10379 -+ *
10380 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
10381 -+ */
10382 -+
10383 -+#ifdef CONFIG_CGROUP_BFQIO
10384 -+#define for_each_entity(entity) \
10385 -+ for (; entity != NULL; entity = entity->parent)
10386 -+
10387 -+#define for_each_entity_safe(entity, parent) \
10388 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
10389 -+
10390 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
10391 -+ int extract,
10392 -+ struct bfq_data *bfqd);
10393 -+
10394 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
10395 -+{
10396 -+ struct bfq_entity *bfqg_entity;
10397 -+ struct bfq_group *bfqg;
10398 -+ struct bfq_sched_data *group_sd;
10399 -+
10400 -+ BUG_ON(next_active == NULL);
10401 -+
10402 -+ group_sd = next_active->sched_data;
10403 -+
10404 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
10405 -+ /*
10406 -+ * bfq_group's my_entity field is not NULL only if the group
10407 -+ * is not the root group. We must not touch the root entity
10408 -+ * as it must never become an active entity.
10409 -+ */
10410 -+ bfqg_entity = bfqg->my_entity;
10411 -+ if (bfqg_entity != NULL)
10412 -+ bfqg_entity->budget = next_active->budget;
10413 -+}
10414 -+
10415 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
10416 -+{
10417 -+ struct bfq_entity *next_active;
10418 -+
10419 -+ if (sd->active_entity != NULL)
10420 -+ /* will update/requeue at the end of service */
10421 -+ return 0;
10422 -+
10423 -+ /*
10424 -+ * NOTE: this can be improved in many ways, such as returning
10425 -+ * 1 (and thus propagating upwards the update) only when the
10426 -+ * budget changes, or caching the bfqq that will be scheduled
10427 -+ * next from this subtree. For now we worry more about
10428 -+ * correctness than about performance...
10429 -+ */
10430 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
10431 -+ sd->next_active = next_active;
10432 -+
10433 -+ if (next_active != NULL)
10434 -+ bfq_update_budget(next_active);
10435 -+
10436 -+ return 1;
10437 -+}
10438 -+
10439 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
10440 -+ struct bfq_entity *entity)
10441 -+{
10442 -+ BUG_ON(sd->next_active != entity);
10443 -+}
10444 -+#else
10445 -+#define for_each_entity(entity) \
10446 -+ for (; entity != NULL; entity = NULL)
10447 -+
10448 -+#define for_each_entity_safe(entity, parent) \
10449 -+ for (parent = NULL; entity != NULL; entity = parent)
10450 -+
10451 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
10452 -+{
10453 -+ return 0;
10454 -+}
10455 -+
10456 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
10457 -+ struct bfq_entity *entity)
10458 -+{
10459 -+}
10460 -+
10461 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
10462 -+{
10463 -+}
10464 -+#endif
10465 -+
10466 -+/*
10467 -+ * Shift for timestamp calculations. This actually limits the maximum
10468 -+ * service allowed in one timestamp delta (small shift values increase it),
10469 -+ * the maximum total weight that can be used for the queues in the system
10470 -+ * (big shift values increase it), and the period of virtual time wraparounds.
10471 -+ */
10472 -+#define WFQ_SERVICE_SHIFT 22
10473 -+
10474 -+/**
10475 -+ * bfq_gt - compare two timestamps.
10476 -+ * @a: first ts.
10477 -+ * @b: second ts.
10478 -+ *
10479 -+ * Return @a > @b, dealing with wrapping correctly.
10480 -+ */
10481 -+static inline int bfq_gt(u64 a, u64 b)
10482 -+{
10483 -+ return (s64)(a - b) > 0;
10484 -+}
10485 -+
10486 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
10487 -+{
10488 -+ struct bfq_queue *bfqq = NULL;
10489 -+
10490 -+ BUG_ON(entity == NULL);
10491 -+
10492 -+ if (entity->my_sched_data == NULL)
10493 -+ bfqq = container_of(entity, struct bfq_queue, entity);
10494 -+
10495 -+ return bfqq;
10496 -+}
10497 -+
10498 -+
10499 -+/**
10500 -+ * bfq_delta - map service into the virtual time domain.
10501 -+ * @service: amount of service.
10502 -+ * @weight: scale factor (weight of an entity or weight sum).
10503 -+ */
10504 -+static inline u64 bfq_delta(unsigned long service,
10505 -+ unsigned long weight)
10506 -+{
10507 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
10508 -+
10509 -+ do_div(d, weight);
10510 -+ return d;
10511 -+}
10512 -+
10513 -+/**
10514 -+ * bfq_calc_finish - assign the finish time to an entity.
10515 -+ * @entity: the entity to act upon.
10516 -+ * @service: the service to be charged to the entity.
10517 -+ */
10518 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
10519 -+ unsigned long service)
10520 -+{
10521 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10522 -+
10523 -+ BUG_ON(entity->weight == 0);
10524 -+
10525 -+ entity->finish = entity->start +
10526 -+ bfq_delta(service, entity->weight);
10527 -+
10528 -+ if (bfqq != NULL) {
10529 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
10530 -+ "calc_finish: serv %lu, w %d",
10531 -+ service, entity->weight);
10532 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
10533 -+ "calc_finish: start %llu, finish %llu, delta %llu",
10534 -+ entity->start, entity->finish,
10535 -+ bfq_delta(service, entity->weight));
10536 -+ }
10537 -+}
10538 -+
10539 -+/**
10540 -+ * bfq_entity_of - get an entity from a node.
10541 -+ * @node: the node field of the entity.
10542 -+ *
10543 -+ * Convert a node pointer to the relative entity. This is used only
10544 -+ * to simplify the logic of some functions and not as the generic
10545 -+ * conversion mechanism because, e.g., in the tree walking functions,
10546 -+ * the check for a %NULL value would be redundant.
10547 -+ */
10548 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
10549 -+{
10550 -+ struct bfq_entity *entity = NULL;
10551 -+
10552 -+ if (node != NULL)
10553 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
10554 -+
10555 -+ return entity;
10556 -+}
10557 -+
10558 -+/**
10559 -+ * bfq_extract - remove an entity from a tree.
10560 -+ * @root: the tree root.
10561 -+ * @entity: the entity to remove.
10562 -+ */
10563 -+static inline void bfq_extract(struct rb_root *root,
10564 -+ struct bfq_entity *entity)
10565 -+{
10566 -+ BUG_ON(entity->tree != root);
10567 -+
10568 -+ entity->tree = NULL;
10569 -+ rb_erase(&entity->rb_node, root);
10570 -+}
10571 -+
10572 -+/**
10573 -+ * bfq_idle_extract - extract an entity from the idle tree.
10574 -+ * @st: the service tree of the owning @entity.
10575 -+ * @entity: the entity being removed.
10576 -+ */
10577 -+static void bfq_idle_extract(struct bfq_service_tree *st,
10578 -+ struct bfq_entity *entity)
10579 -+{
10580 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10581 -+ struct rb_node *next;
10582 -+
10583 -+ BUG_ON(entity->tree != &st->idle);
10584 -+
10585 -+ if (entity == st->first_idle) {
10586 -+ next = rb_next(&entity->rb_node);
10587 -+ st->first_idle = bfq_entity_of(next);
10588 -+ }
10589 -+
10590 -+ if (entity == st->last_idle) {
10591 -+ next = rb_prev(&entity->rb_node);
10592 -+ st->last_idle = bfq_entity_of(next);
10593 -+ }
10594 -+
10595 -+ bfq_extract(&st->idle, entity);
10596 -+
10597 -+ if (bfqq != NULL)
10598 -+ list_del(&bfqq->bfqq_list);
10599 -+}
10600 -+
10601 -+/**
10602 -+ * bfq_insert - generic tree insertion.
10603 -+ * @root: tree root.
10604 -+ * @entity: entity to insert.
10605 -+ *
10606 -+ * This is used for the idle and the active tree, since they are both
10607 -+ * ordered by finish time.
10608 -+ */
10609 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
10610 -+{
10611 -+ struct bfq_entity *entry;
10612 -+ struct rb_node **node = &root->rb_node;
10613 -+ struct rb_node *parent = NULL;
10614 -+
10615 -+ BUG_ON(entity->tree != NULL);
10616 -+
10617 -+ while (*node != NULL) {
10618 -+ parent = *node;
10619 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
10620 -+
10621 -+ if (bfq_gt(entry->finish, entity->finish))
10622 -+ node = &parent->rb_left;
10623 -+ else
10624 -+ node = &parent->rb_right;
10625 -+ }
10626 -+
10627 -+ rb_link_node(&entity->rb_node, parent, node);
10628 -+ rb_insert_color(&entity->rb_node, root);
10629 -+
10630 -+ entity->tree = root;
10631 -+}
10632 -+
10633 -+/**
10634 -+ * bfq_update_min - update the min_start field of an entity.
10635 -+ * @entity: the entity to update.
10636 -+ * @node: one of its children.
10637 -+ *
10638 -+ * This function is called when @entity may store an invalid value for
10639 -+ * min_start due to updates to the active tree. The function assumes
10640 -+ * that the subtree rooted at @node (which may be its left or its right
10641 -+ * child) has a valid min_start value.
10642 -+ */
10643 -+static inline void bfq_update_min(struct bfq_entity *entity,
10644 -+ struct rb_node *node)
10645 -+{
10646 -+ struct bfq_entity *child;
10647 -+
10648 -+ if (node != NULL) {
10649 -+ child = rb_entry(node, struct bfq_entity, rb_node);
10650 -+ if (bfq_gt(entity->min_start, child->min_start))
10651 -+ entity->min_start = child->min_start;
10652 -+ }
10653 -+}
10654 -+
10655 -+/**
10656 -+ * bfq_update_active_node - recalculate min_start.
10657 -+ * @node: the node to update.
10658 -+ *
10659 -+ * @node may have changed position or one of its children may have moved,
10660 -+ * this function updates its min_start value. The left and right subtrees
10661 -+ * are assumed to hold a correct min_start value.
10662 -+ */
10663 -+static inline void bfq_update_active_node(struct rb_node *node)
10664 -+{
10665 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
10666 -+
10667 -+ entity->min_start = entity->start;
10668 -+ bfq_update_min(entity, node->rb_right);
10669 -+ bfq_update_min(entity, node->rb_left);
10670 -+}
10671 -+
10672 -+/**
10673 -+ * bfq_update_active_tree - update min_start for the whole active tree.
10674 -+ * @node: the starting node.
10675 -+ *
10676 -+ * @node must be the deepest modified node after an update. This function
10677 -+ * updates its min_start using the values held by its children, assuming
10678 -+ * that they did not change, and then updates all the nodes that may have
10679 -+ * changed in the path to the root. The only nodes that may have changed
10680 -+ * are the ones in the path or their siblings.
10681 -+ */
10682 -+static void bfq_update_active_tree(struct rb_node *node)
10683 -+{
10684 -+ struct rb_node *parent;
10685 -+
10686 -+up:
10687 -+ bfq_update_active_node(node);
10688 -+
10689 -+ parent = rb_parent(node);
10690 -+ if (parent == NULL)
10691 -+ return;
10692 -+
10693 -+ if (node == parent->rb_left && parent->rb_right != NULL)
10694 -+ bfq_update_active_node(parent->rb_right);
10695 -+ else if (parent->rb_left != NULL)
10696 -+ bfq_update_active_node(parent->rb_left);
10697 -+
10698 -+ node = parent;
10699 -+ goto up;
10700 -+}
10701 -+
10702 -+/**
10703 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
10704 -+ * @st: the service tree of the entity.
10705 -+ * @entity: the entity being inserted.
10706 -+ *
10707 -+ * The active tree is ordered by finish time, but an extra key is kept
10708 -+ * per each node, containing the minimum value for the start times of
10709 -+ * its children (and the node itself), so it's possible to search for
10710 -+ * the eligible node with the lowest finish time in logarithmic time.
10711 -+ */
10712 -+static void bfq_active_insert(struct bfq_service_tree *st,
10713 -+ struct bfq_entity *entity)
10714 -+{
10715 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10716 -+ struct rb_node *node = &entity->rb_node;
10717 -+
10718 -+ bfq_insert(&st->active, entity);
10719 -+
10720 -+ if (node->rb_left != NULL)
10721 -+ node = node->rb_left;
10722 -+ else if (node->rb_right != NULL)
10723 -+ node = node->rb_right;
10724 -+
10725 -+ bfq_update_active_tree(node);
10726 -+
10727 -+ if (bfqq != NULL)
10728 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
10729 -+}
10730 -+
10731 -+/**
10732 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
10733 -+ * @ioprio: the ioprio value to convert.
10734 -+ */
10735 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
10736 -+{
10737 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
10738 -+ return IOPRIO_BE_NR - ioprio;
10739 -+}
10740 -+
10741 -+/**
10742 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
10743 -+ * @weight: the weight value to convert.
10744 -+ *
10745 -+ * To preserve as much as possible the old only-ioprio user interface,
10746 -+ * 0 is used as an escape ioprio value for weights (numerically) equal or
10747 -+ * larger than IOPRIO_BE_NR
10748 -+ */
10749 -+static unsigned short bfq_weight_to_ioprio(int weight)
10750 -+{
10751 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
10752 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
10753 -+}
10754 -+
10755 -+static inline void bfq_get_entity(struct bfq_entity *entity)
10756 -+{
10757 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10758 -+ struct bfq_sched_data *sd;
10759 -+
10760 -+ if (bfqq != NULL) {
10761 -+ sd = entity->sched_data;
10762 -+ atomic_inc(&bfqq->ref);
10763 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
10764 -+ bfqq, atomic_read(&bfqq->ref));
10765 -+ }
10766 -+}
10767 -+
10768 -+/**
10769 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
10770 -+ * @node: the node being removed.
10771 -+ *
10772 -+ * Do the first step of an extraction in an rb tree, looking for the
10773 -+ * node that will replace @node, and returning the deepest node that
10774 -+ * the following modifications to the tree can touch. If @node is the
10775 -+ * last node in the tree return %NULL.
10776 -+ */
10777 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
10778 -+{
10779 -+ struct rb_node *deepest;
10780 -+
10781 -+ if (node->rb_right == NULL && node->rb_left == NULL)
10782 -+ deepest = rb_parent(node);
10783 -+ else if (node->rb_right == NULL)
10784 -+ deepest = node->rb_left;
10785 -+ else if (node->rb_left == NULL)
10786 -+ deepest = node->rb_right;
10787 -+ else {
10788 -+ deepest = rb_next(node);
10789 -+ if (deepest->rb_right != NULL)
10790 -+ deepest = deepest->rb_right;
10791 -+ else if (rb_parent(deepest) != node)
10792 -+ deepest = rb_parent(deepest);
10793 -+ }
10794 -+
10795 -+ return deepest;
10796 -+}
10797 -+
10798 -+/**
10799 -+ * bfq_active_extract - remove an entity from the active tree.
10800 -+ * @st: the service_tree containing the tree.
10801 -+ * @entity: the entity being removed.
10802 -+ */
10803 -+static void bfq_active_extract(struct bfq_service_tree *st,
10804 -+ struct bfq_entity *entity)
10805 -+{
10806 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10807 -+ struct rb_node *node;
10808 -+
10809 -+ node = bfq_find_deepest(&entity->rb_node);
10810 -+ bfq_extract(&st->active, entity);
10811 -+
10812 -+ if (node != NULL)
10813 -+ bfq_update_active_tree(node);
10814 -+
10815 -+ if (bfqq != NULL)
10816 -+ list_del(&bfqq->bfqq_list);
10817 -+}
10818 -+
10819 -+/**
10820 -+ * bfq_idle_insert - insert an entity into the idle tree.
10821 -+ * @st: the service tree containing the tree.
10822 -+ * @entity: the entity to insert.
10823 -+ */
10824 -+static void bfq_idle_insert(struct bfq_service_tree *st,
10825 -+ struct bfq_entity *entity)
10826 -+{
10827 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10828 -+ struct bfq_entity *first_idle = st->first_idle;
10829 -+ struct bfq_entity *last_idle = st->last_idle;
10830 -+
10831 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
10832 -+ st->first_idle = entity;
10833 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
10834 -+ st->last_idle = entity;
10835 -+
10836 -+ bfq_insert(&st->idle, entity);
10837 -+
10838 -+ if (bfqq != NULL)
10839 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
10840 -+}
10841 -+
10842 -+/**
10843 -+ * bfq_forget_entity - remove an entity from the wfq trees.
10844 -+ * @st: the service tree.
10845 -+ * @entity: the entity being removed.
10846 -+ *
10847 -+ * Update the device status and forget everything about @entity, putting
10848 -+ * the device reference to it, if it is a queue. Entities belonging to
10849 -+ * groups are not refcounted.
10850 -+ */
10851 -+static void bfq_forget_entity(struct bfq_service_tree *st,
10852 -+ struct bfq_entity *entity)
10853 -+{
10854 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10855 -+ struct bfq_sched_data *sd;
10856 -+
10857 -+ BUG_ON(!entity->on_st);
10858 -+
10859 -+ entity->on_st = 0;
10860 -+ st->wsum -= entity->weight;
10861 -+ if (bfqq != NULL) {
10862 -+ sd = entity->sched_data;
10863 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
10864 -+ bfqq, atomic_read(&bfqq->ref));
10865 -+ bfq_put_queue(bfqq);
10866 -+ }
10867 -+}
10868 -+
10869 -+/**
10870 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
10871 -+ * @st: service tree for the entity.
10872 -+ * @entity: the entity being released.
10873 -+ */
10874 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
10875 -+ struct bfq_entity *entity)
10876 -+{
10877 -+ bfq_idle_extract(st, entity);
10878 -+ bfq_forget_entity(st, entity);
10879 -+}
10880 -+
10881 -+/**
10882 -+ * bfq_forget_idle - update the idle tree if necessary.
10883 -+ * @st: the service tree to act upon.
10884 -+ *
10885 -+ * To preserve the global O(log N) complexity we only remove one entry here;
10886 -+ * as the idle tree will not grow indefinitely this can be done safely.
10887 -+ */
10888 -+static void bfq_forget_idle(struct bfq_service_tree *st)
10889 -+{
10890 -+ struct bfq_entity *first_idle = st->first_idle;
10891 -+ struct bfq_entity *last_idle = st->last_idle;
10892 -+
10893 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
10894 -+ !bfq_gt(last_idle->finish, st->vtime)) {
10895 -+ /*
10896 -+ * Forget the whole idle tree, increasing the vtime past
10897 -+ * the last finish time of idle entities.
10898 -+ */
10899 -+ st->vtime = last_idle->finish;
10900 -+ }
10901 -+
10902 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
10903 -+ bfq_put_idle_entity(st, first_idle);
10904 -+}
10905 -+
10906 -+static struct bfq_service_tree *
10907 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
10908 -+ struct bfq_entity *entity)
10909 -+{
10910 -+ struct bfq_service_tree *new_st = old_st;
10911 -+
10912 -+ if (entity->ioprio_changed) {
10913 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10914 -+
10915 -+ BUG_ON(old_st->wsum < entity->weight);
10916 -+ old_st->wsum -= entity->weight;
10917 -+
10918 -+ if (entity->new_weight != entity->orig_weight) {
10919 -+ entity->orig_weight = entity->new_weight;
10920 -+ entity->ioprio =
10921 -+ bfq_weight_to_ioprio(entity->orig_weight);
10922 -+ } else if (entity->new_ioprio != entity->ioprio) {
10923 -+ entity->ioprio = entity->new_ioprio;
10924 -+ entity->orig_weight =
10925 -+ bfq_ioprio_to_weight(entity->ioprio);
10926 -+ } else
10927 -+ entity->new_weight = entity->orig_weight =
10928 -+ bfq_ioprio_to_weight(entity->ioprio);
10929 -+
10930 -+ entity->ioprio_class = entity->new_ioprio_class;
10931 -+ entity->ioprio_changed = 0;
10932 -+
10933 -+ /*
10934 -+ * NOTE: here we may be changing the weight too early,
10935 -+ * this will cause unfairness. The correct approach
10936 -+ * would have required additional complexity to defer
10937 -+ * weight changes to the proper time instants (i.e.,
10938 -+ * when entity->finish <= old_st->vtime).
10939 -+ */
10940 -+ new_st = bfq_entity_service_tree(entity);
10941 -+ entity->weight = entity->orig_weight *
10942 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
10943 -+ new_st->wsum += entity->weight;
10944 -+
10945 -+ if (new_st != old_st)
10946 -+ entity->start = new_st->vtime;
10947 -+ }
10948 -+
10949 -+ return new_st;
10950 -+}
10951 -+
10952 -+/**
10953 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
10954 -+ * @bfqq: the queue being served.
10955 -+ * @served: bytes to transfer.
10956 -+ *
10957 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
10958 -+ * are synchronized every time a new bfqq is selected for service. For now,
10959 -+ * we keep it to better check consistency.
10960 -+ */
10961 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
10962 -+{
10963 -+ struct bfq_entity *entity = &bfqq->entity;
10964 -+ struct bfq_service_tree *st;
10965 -+
10966 -+ for_each_entity(entity) {
10967 -+ st = bfq_entity_service_tree(entity);
10968 -+
10969 -+ entity->service += served;
10970 -+ BUG_ON(entity->service > entity->budget);
10971 -+ BUG_ON(st->wsum == 0);
10972 -+
10973 -+ st->vtime += bfq_delta(served, st->wsum);
10974 -+ bfq_forget_idle(st);
10975 -+ }
10976 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
10977 -+}
10978 -+
10979 -+/**
10980 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
10981 -+ * @bfqq: the queue that needs a service update.
10982 -+ *
10983 -+ * When it's not possible to be fair in the service domain, because
10984 -+ * a queue is not consuming its budget fast enough (the meaning of
10985 -+ * fast depends on the timeout parameter), we charge it a full
10986 -+ * budget. In this way we should obtain a sort of time-domain
10987 -+ * fairness among all the seeky/slow queues.
10988 -+ */
10989 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
10990 -+{
10991 -+ struct bfq_entity *entity = &bfqq->entity;
10992 -+
10993 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
10994 -+
10995 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
10996 -+}
10997 -+
10998 -+/**
10999 -+ * __bfq_activate_entity - activate an entity.
11000 -+ * @entity: the entity being activated.
11001 -+ *
11002 -+ * Called whenever an entity is activated, i.e., it is not active and one
11003 -+ * of its children receives a new request, or has to be reactivated due to
11004 -+ * budget exhaustion. It uses the current budget of the entity (and the
11005 -+ * service received if @entity is active) of the queue to calculate its
11006 -+ * timestamps.
11007 -+ */
11008 -+static void __bfq_activate_entity(struct bfq_entity *entity)
11009 -+{
11010 -+ struct bfq_sched_data *sd = entity->sched_data;
11011 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11012 -+
11013 -+ if (entity == sd->active_entity) {
11014 -+ BUG_ON(entity->tree != NULL);
11015 -+ /*
11016 -+ * If we are requeueing the current entity we have
11017 -+ * to take care of not charging to it service it has
11018 -+ * not received.
11019 -+ */
11020 -+ bfq_calc_finish(entity, entity->service);
11021 -+ entity->start = entity->finish;
11022 -+ sd->active_entity = NULL;
11023 -+ } else if (entity->tree == &st->active) {
11024 -+ /*
11025 -+ * Requeueing an entity due to a change of some
11026 -+ * next_active entity below it. We reuse the old
11027 -+ * start time.
11028 -+ */
11029 -+ bfq_active_extract(st, entity);
11030 -+ } else if (entity->tree == &st->idle) {
11031 -+ /*
11032 -+ * Must be on the idle tree, bfq_idle_extract() will
11033 -+ * check for that.
11034 -+ */
11035 -+ bfq_idle_extract(st, entity);
11036 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
11037 -+ st->vtime : entity->finish;
11038 -+ } else {
11039 -+ /*
11040 -+ * The finish time of the entity may be invalid, and
11041 -+ * it is in the past for sure, otherwise the queue
11042 -+ * would have been on the idle tree.
11043 -+ */
11044 -+ entity->start = st->vtime;
11045 -+ st->wsum += entity->weight;
11046 -+ bfq_get_entity(entity);
11047 -+
11048 -+ BUG_ON(entity->on_st);
11049 -+ entity->on_st = 1;
11050 -+ }
11051 -+
11052 -+ st = __bfq_entity_update_weight_prio(st, entity);
11053 -+ bfq_calc_finish(entity, entity->budget);
11054 -+ bfq_active_insert(st, entity);
11055 -+}
11056 -+
11057 -+/**
11058 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11059 -+ * @entity: the entity to activate.
11060 -+ *
11061 -+ * Activate @entity and all the entities on the path from it to the root.
11062 -+ */
11063 -+static void bfq_activate_entity(struct bfq_entity *entity)
11064 -+{
11065 -+ struct bfq_sched_data *sd;
11066 -+
11067 -+ for_each_entity(entity) {
11068 -+ __bfq_activate_entity(entity);
11069 -+
11070 -+ sd = entity->sched_data;
11071 -+ if (!bfq_update_next_active(sd))
11072 -+ /*
11073 -+ * No need to propagate the activation to the
11074 -+ * upper entities, as they will be updated when
11075 -+ * the active entity is rescheduled.
11076 -+ */
11077 -+ break;
11078 -+ }
11079 -+}
11080 -+
11081 -+/**
11082 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11083 -+ * @entity: the entity to deactivate.
11084 -+ * @requeue: if false, the entity will not be put into the idle tree.
11085 -+ *
11086 -+ * Deactivate an entity, independently from its previous state. If the
11087 -+ * entity was not on a service tree just return, otherwise if it is on
11088 -+ * any scheduler tree, extract it from that tree, and if necessary
11089 -+ * and if the caller did not specify @requeue, put it on the idle tree.
11090 -+ *
11091 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
11092 -+ * if the entity was under service or if it was the next_active for
11093 -+ * its sched_data; return %0 otherwise.
11094 -+ */
11095 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11096 -+{
11097 -+ struct bfq_sched_data *sd = entity->sched_data;
11098 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11099 -+ int was_active = entity == sd->active_entity;
11100 -+ int ret = 0;
11101 -+
11102 -+ if (!entity->on_st)
11103 -+ return 0;
11104 -+
11105 -+ BUG_ON(was_active && entity->tree != NULL);
11106 -+
11107 -+ if (was_active) {
11108 -+ bfq_calc_finish(entity, entity->service);
11109 -+ sd->active_entity = NULL;
11110 -+ } else if (entity->tree == &st->active)
11111 -+ bfq_active_extract(st, entity);
11112 -+ else if (entity->tree == &st->idle)
11113 -+ bfq_idle_extract(st, entity);
11114 -+ else if (entity->tree != NULL)
11115 -+ BUG();
11116 -+
11117 -+ if (was_active || sd->next_active == entity)
11118 -+ ret = bfq_update_next_active(sd);
11119 -+
11120 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
11121 -+ bfq_forget_entity(st, entity);
11122 -+ else
11123 -+ bfq_idle_insert(st, entity);
11124 -+
11125 -+ BUG_ON(sd->active_entity == entity);
11126 -+ BUG_ON(sd->next_active == entity);
11127 -+
11128 -+ return ret;
11129 -+}
11130 -+
11131 -+/**
11132 -+ * bfq_deactivate_entity - deactivate an entity.
11133 -+ * @entity: the entity to deactivate.
11134 -+ * @requeue: true if the entity can be put on the idle tree
11135 -+ */
11136 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11137 -+{
11138 -+ struct bfq_sched_data *sd;
11139 -+ struct bfq_entity *parent;
11140 -+
11141 -+ for_each_entity_safe(entity, parent) {
11142 -+ sd = entity->sched_data;
11143 -+
11144 -+ if (!__bfq_deactivate_entity(entity, requeue))
11145 -+ /*
11146 -+ * The parent entity is still backlogged, and
11147 -+ * we don't need to update it as it is still
11148 -+ * under service.
11149 -+ */
11150 -+ break;
11151 -+
11152 -+ if (sd->next_active != NULL)
11153 -+ /*
11154 -+ * The parent entity is still backlogged and
11155 -+ * the budgets on the path towards the root
11156 -+ * need to be updated.
11157 -+ */
11158 -+ goto update;
11159 -+
11160 -+ /*
11161 -+ * If we get here, the parent is no longer backlogged and
11162 -+ * we want to propagate the dequeue upwards.
11163 -+ */
11164 -+ requeue = 1;
11165 -+ }
11166 -+
11167 -+ return;
11168 -+
11169 -+update:
11170 -+ entity = parent;
11171 -+ for_each_entity(entity) {
11172 -+ __bfq_activate_entity(entity);
11173 -+
11174 -+ sd = entity->sched_data;
11175 -+ if (!bfq_update_next_active(sd))
11176 -+ break;
11177 -+ }
11178 -+}
11179 -+
11180 -+/**
11181 -+ * bfq_update_vtime - update vtime if necessary.
11182 -+ * @st: the service tree to act upon.
11183 -+ *
11184 -+ * If necessary update the service tree vtime to have at least one
11185 -+ * eligible entity, skipping to its start time. Assumes that the
11186 -+ * active tree of the device is not empty.
11187 -+ *
11188 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
11189 -+ * we may end up with reactivated tasks getting timestamps after a
11190 -+ * vtime skip done because we needed a ->first_active entity on some
11191 -+ * intermediate node.
11192 -+ */
11193 -+static void bfq_update_vtime(struct bfq_service_tree *st)
11194 -+{
11195 -+ struct bfq_entity *entry;
11196 -+ struct rb_node *node = st->active.rb_node;
11197 -+
11198 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
11199 -+ if (bfq_gt(entry->min_start, st->vtime)) {
11200 -+ st->vtime = entry->min_start;
11201 -+ bfq_forget_idle(st);
11202 -+ }
11203 -+}
11204 -+
11205 -+/**
11206 -+ * bfq_first_active - find the eligible entity with the smallest finish time
11207 -+ * @st: the service tree to select from.
11208 -+ *
11209 -+ * This function searches the first schedulable entity, starting from the
11210 -+ * root of the tree and going on the left every time on this side there is
11211 -+ * a subtree with at least one eligible (start >= vtime) entity. The path
11212 -+ * on the right is followed only if a) the left subtree contains no eligible
11213 -+ * entities and b) no eligible entity has been found yet.
11214 -+ */
11215 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
11216 -+{
11217 -+ struct bfq_entity *entry, *first = NULL;
11218 -+ struct rb_node *node = st->active.rb_node;
11219 -+
11220 -+ while (node != NULL) {
11221 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
11222 -+left:
11223 -+ if (!bfq_gt(entry->start, st->vtime))
11224 -+ first = entry;
11225 -+
11226 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
11227 -+
11228 -+ if (node->rb_left != NULL) {
11229 -+ entry = rb_entry(node->rb_left,
11230 -+ struct bfq_entity, rb_node);
11231 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
11232 -+ node = node->rb_left;
11233 -+ goto left;
11234 -+ }
11235 -+ }
11236 -+ if (first != NULL)
11237 -+ break;
11238 -+ node = node->rb_right;
11239 -+ }
11240 -+
11241 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
11242 -+ return first;
11243 -+}
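The search above relies on the augmented-tree property documented later in bfq.h: every node caches the minimum start time of its subtree (min_start), so a whole subtree can be skipped when its min_start already lies beyond the virtual time. The userspace sketch below reproduces only that pruning idea on a plain binary tree assumed to be ordered like the kernel rbtree; the node layout and the first_eligible() name are invented for illustration.

/*
 * Minimal sketch of the eligible-entity search: prefer the left
 * subtree whenever its cached min_start shows it holds an eligible
 * entity, otherwise keep the current node if eligible, else go right.
 */
#include <stdio.h>

struct node {
        unsigned long long start;      /* own start timestamp */
        unsigned long long min_start;  /* min start in this subtree */
        struct node *left, *right;
};

static struct node *first_eligible(struct node *n, unsigned long long vtime)
{
        struct node *first = NULL;

        while (n != NULL) {
                if (n->start <= vtime)          /* this node is eligible */
                        first = n;
                if (n->left && n->left->min_start <= vtime) {
                        n = n->left;            /* eligible node exists on the left */
                        continue;
                }
                if (first != NULL)
                        break;                  /* nothing better on the right */
                n = n->right;
        }
        return first;
}

int main(void)
{
        struct node c = { 5, 5, NULL, NULL };
        struct node a = { 2, 2, NULL, NULL };
        struct node b = { 9, 2, &a, &c };       /* root; min_start covers subtree */
        struct node *e = first_eligible(&b, 6);

        printf("eligible start = %llu\n", e ? e->start : 0);
        return 0;
}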
11244 -+
11245 -+/**
11246 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
11247 -+ * @st: the service tree.
11248 -+ *
11249 -+ * Update the virtual time in @st and return the first eligible entity
11250 -+ * it contains.
11251 -+ */
11252 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
11253 -+ bool force)
11254 -+{
11255 -+ struct bfq_entity *entity, *new_next_active = NULL;
11256 -+
11257 -+ if (RB_EMPTY_ROOT(&st->active))
11258 -+ return NULL;
11259 -+
11260 -+ bfq_update_vtime(st);
11261 -+ entity = bfq_first_active_entity(st);
11262 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
11263 -+
11264 -+ /*
11265 -+ * If the chosen entity does not match with the sched_data's
11266 -+ * next_active and we are forcedly serving the IDLE priority
11267 -+ * class tree, bubble up budget update.
11268 -+ */
11269 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
11270 -+ new_next_active = entity;
11271 -+ for_each_entity(new_next_active)
11272 -+ bfq_update_budget(new_next_active);
11273 -+ }
11274 -+
11275 -+ return entity;
11276 -+}
11277 -+
11278 -+/**
11279 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
11280 -+ * @sd: the sched_data.
11281 -+ * @extract: if true the returned entity will be also extracted from @sd.
11282 -+ *
11283 -+ * NOTE: since we cache the next_active entity at each level of the
11284 -+ * hierarchy, the complexity of the lookup can be decreased with
11285 -+ * absolutely no effort just returning the cached next_active value;
11286 -+ * we prefer to do full lookups to test the consistency of the data
11287 -+ * structures.
11288 -+ */
11289 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11290 -+ int extract,
11291 -+ struct bfq_data *bfqd)
11292 -+{
11293 -+ struct bfq_service_tree *st = sd->service_tree;
11294 -+ struct bfq_entity *entity;
11295 -+ int i = 0;
11296 -+
11297 -+ BUG_ON(sd->active_entity != NULL);
11298 -+
11299 -+ if (bfqd != NULL &&
11300 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
11301 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true);
11302 -+ if (entity != NULL) {
11303 -+ i = BFQ_IOPRIO_CLASSES - 1;
11304 -+ bfqd->bfq_class_idle_last_service = jiffies;
11305 -+ sd->next_active = entity;
11306 -+ }
11307 -+ }
11308 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
11309 -+ entity = __bfq_lookup_next_entity(st + i, false);
11310 -+ if (entity != NULL) {
11311 -+ if (extract) {
11312 -+ bfq_check_next_active(sd, entity);
11313 -+ bfq_active_extract(st + i, entity);
11314 -+ sd->active_entity = entity;
11315 -+ sd->next_active = NULL;
11316 -+ }
11317 -+ break;
11318 -+ }
11319 -+ }
11320 -+
11321 -+ return entity;
11322 -+}
11323 -+
11324 -+/*
11325 -+ * Get next queue for service.
11326 -+ */
11327 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
11328 -+{
11329 -+ struct bfq_entity *entity = NULL;
11330 -+ struct bfq_sched_data *sd;
11331 -+ struct bfq_queue *bfqq;
11332 -+
11333 -+ BUG_ON(bfqd->active_queue != NULL);
11334 -+
11335 -+ if (bfqd->busy_queues == 0)
11336 -+ return NULL;
11337 -+
11338 -+ sd = &bfqd->root_group->sched_data;
11339 -+ for (; sd != NULL; sd = entity->my_sched_data) {
11340 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
11341 -+ BUG_ON(entity == NULL);
11342 -+ entity->service = 0;
11343 -+ }
11344 -+
11345 -+ bfqq = bfq_entity_to_bfqq(entity);
11346 -+ BUG_ON(bfqq == NULL);
11347 -+
11348 -+ return bfqq;
11349 -+}
11350 -+
11351 -+/*
11352 -+ * Forced extraction of the given queue.
11353 -+ */
11354 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
11355 -+ struct bfq_queue *bfqq)
11356 -+{
11357 -+ struct bfq_entity *entity;
11358 -+ struct bfq_sched_data *sd;
11359 -+
11360 -+ BUG_ON(bfqd->active_queue != NULL);
11361 -+
11362 -+ entity = &bfqq->entity;
11363 -+ /*
11364 -+ * Bubble up extraction/update from the leaf to the root.
11365 -+ */
11366 -+ for_each_entity(entity) {
11367 -+ sd = entity->sched_data;
11368 -+ bfq_update_budget(entity);
11369 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
11370 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
11371 -+ sd->active_entity = entity;
11372 -+ sd->next_active = NULL;
11373 -+ entity->service = 0;
11374 -+ }
11375 -+
11376 -+ return;
11377 -+}
11378 -+
11379 -+static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
11380 -+{
11381 -+ if (bfqd->active_bic != NULL) {
11382 -+ put_io_context(bfqd->active_bic->icq.ioc);
11383 -+ bfqd->active_bic = NULL;
11384 -+ }
11385 -+
11386 -+ bfqd->active_queue = NULL;
11387 -+ del_timer(&bfqd->idle_slice_timer);
11388 -+}
11389 -+
11390 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11391 -+ int requeue)
11392 -+{
11393 -+ struct bfq_entity *entity = &bfqq->entity;
11394 -+
11395 -+ if (bfqq == bfqd->active_queue)
11396 -+ __bfq_bfqd_reset_active(bfqd);
11397 -+
11398 -+ bfq_deactivate_entity(entity, requeue);
11399 -+}
11400 -+
11401 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11402 -+{
11403 -+ struct bfq_entity *entity = &bfqq->entity;
11404 -+
11405 -+ bfq_activate_entity(entity);
11406 -+}
11407 -+
11408 -+/*
11409 -+ * Called when the bfqq no longer has requests pending, remove it from
11410 -+ * the service tree.
11411 -+ */
11412 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11413 -+ int requeue)
11414 -+{
11415 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
11416 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
11417 -+
11418 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
11419 -+
11420 -+ bfq_clear_bfqq_busy(bfqq);
11421 -+
11422 -+ BUG_ON(bfqd->busy_queues == 0);
11423 -+ bfqd->busy_queues--;
11424 -+
11425 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
11426 -+}
11427 -+
11428 -+/*
11429 -+ * Called when an inactive queue receives a new request.
11430 -+ */
11431 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11432 -+{
11433 -+ BUG_ON(bfq_bfqq_busy(bfqq));
11434 -+ BUG_ON(bfqq == bfqd->active_queue);
11435 -+
11436 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
11437 -+
11438 -+ bfq_activate_bfqq(bfqd, bfqq);
11439 -+
11440 -+ bfq_mark_bfqq_busy(bfqq);
11441 -+ bfqd->busy_queues++;
11442 -+}
11443 -diff --git a/block/bfq.h b/block/bfq.h
11444 -new file mode 100644
11445 -index 0000000..48ecde9
11446 ---- /dev/null
11447 -+++ b/block/bfq.h
11448 -@@ -0,0 +1,603 @@
11449 -+/*
11450 -+ * BFQ-v6r2 for 3.10.0: data structures and common functions prototypes.
11451 -+ *
11452 -+ * Based on ideas and code from CFQ:
11453 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11454 -+ *
11455 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11456 -+ * Paolo Valente <paolo.valente@×××××××.it>
11457 -+ *
11458 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11459 -+ */
11460 -+
11461 -+#ifndef _BFQ_H
11462 -+#define _BFQ_H
11463 -+
11464 -+#include <linux/blktrace_api.h>
11465 -+#include <linux/hrtimer.h>
11466 -+#include <linux/ioprio.h>
11467 -+#include <linux/rbtree.h>
11468 -+
11469 -+#define BFQ_IOPRIO_CLASSES 3
11470 -+#define BFQ_CL_IDLE_TIMEOUT HZ/5
11471 -+
11472 -+#define BFQ_MIN_WEIGHT 1
11473 -+#define BFQ_MAX_WEIGHT 1000
11474 -+
11475 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
11476 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
11477 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
11478 -+
11479 -+struct bfq_entity;
11480 -+
11481 -+/**
11482 -+ * struct bfq_service_tree - per ioprio_class service tree.
11483 -+ * @active: tree for active entities (i.e., those backlogged).
11484 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
11485 -+ * @first_idle: idle entity with minimum F_i.
11486 -+ * @last_idle: idle entity with maximum F_i.
11487 -+ * @vtime: scheduler virtual time.
11488 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
11489 -+ *
11490 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
11491 -+ * ioprio_class has its own independent scheduler, and so its own
11492 -+ * bfq_service_tree. All the fields are protected by the queue lock
11493 -+ * of the containing bfqd.
11494 -+ */
11495 -+struct bfq_service_tree {
11496 -+ struct rb_root active;
11497 -+ struct rb_root idle;
11498 -+
11499 -+ struct bfq_entity *first_idle;
11500 -+ struct bfq_entity *last_idle;
11501 -+
11502 -+ u64 vtime;
11503 -+ unsigned long wsum;
11504 -+};
11505 -+
11506 -+/**
11507 -+ * struct bfq_sched_data - multi-class scheduler.
11508 -+ * @active_entity: entity under service.
11509 -+ * @next_active: head-of-the-line entity in the scheduler.
11510 -+ * @service_tree: array of service trees, one per ioprio_class.
11511 -+ *
11512 -+ * bfq_sched_data is the basic scheduler queue. It supports three
11513 -+ * ioprio_classes, and can be used either as a toplevel queue or as
11514 -+ * an intermediate queue on a hierarchical setup.
11515 -+ * @next_active points to the active entity of the sched_data service
11516 -+ * trees that will be scheduled next.
11517 -+ *
11518 -+ * The supported ioprio_classes are the same as in CFQ, in descending
11519 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
11520 -+ * Requests from higher priority queues are served before all the
11521 -+ * requests from lower priority queues; among queues of the same
11522 -+ * class, requests are served according to B-WF2Q+.
11523 -+ * All the fields are protected by the queue lock of the containing bfqd.
11524 -+ */
11525 -+struct bfq_sched_data {
11526 -+ struct bfq_entity *active_entity;
11527 -+ struct bfq_entity *next_active;
11528 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
11529 -+};
11530 -+
11531 -+/**
11532 -+ * struct bfq_entity - schedulable entity.
11533 -+ * @rb_node: service_tree member.
11534 -+ * @on_st: flag, true if the entity is on a tree (either the active or
11535 -+ * the idle one of its service_tree).
11536 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
11537 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
11538 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
11539 -+ * @min_start: minimum start time of the (active) subtree rooted at
11540 -+ * this entity; used for O(log N) lookups into active trees.
11541 -+ * @service: service received during the last round of service.
11542 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
11543 -+ * @weight: weight of the queue
11544 -+ * @parent: parent entity, for hierarchical scheduling.
11545 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
11546 -+ * associated scheduler queue, %NULL on leaf nodes.
11547 -+ * @sched_data: the scheduler queue this entity belongs to.
11548 -+ * @ioprio: the ioprio in use.
11549 -+ * @new_weight: when a weight change is requested, the new weight value.
11550 -+ * @orig_weight: original weight, used to implement weight boosting
11551 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
11552 -+ * @ioprio_class: the ioprio_class in use.
11553 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
11554 -+ * ioprio_class value.
11555 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
11556 -+ * ioprio_class change.
11557 -+ *
11558 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
11559 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
11560 -+ * entity belongs to the sched_data of the parent group in the cgroup
11561 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
11562 -+ * in @my_sched_data.
11563 -+ *
11564 -+ * Each entity stores independently its priority values; this would
11565 -+ * allow different weights on different devices, but this
11566 -+ * functionality is not exported to userspace by now. Priorities and
11567 -+ * weights are updated lazily, first storing the new values into the
11568 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
11569 -+ * there is a transition in the entity state that allows the priority
11570 -+ * update to take place the effective and the requested priority
11571 -+ * values are synchronized.
11572 -+ *
11573 -+ * Unless cgroups are used, the weight value is calculated from the
11574 -+ * ioprio to export the same interface as CFQ. When dealing with
11575 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
11576 -+ * time consuming their budget and have true sequential behavior, and
11577 -+ * when there are no external factors breaking anticipation) the
11578 -+ * relative weights at each level of the cgroups hierarchy should be
11579 -+ * guaranteed. All the fields are protected by the queue lock of the
11580 -+ * containing bfqd.
11581 -+ */
11582 -+struct bfq_entity {
11583 -+ struct rb_node rb_node;
11584 -+
11585 -+ int on_st;
11586 -+
11587 -+ u64 finish;
11588 -+ u64 start;
11589 -+
11590 -+ struct rb_root *tree;
11591 -+
11592 -+ u64 min_start;
11593 -+
11594 -+ unsigned long service, budget;
11595 -+ unsigned short weight, new_weight;
11596 -+ unsigned short orig_weight;
11597 -+
11598 -+ struct bfq_entity *parent;
11599 -+
11600 -+ struct bfq_sched_data *my_sched_data;
11601 -+ struct bfq_sched_data *sched_data;
11602 -+
11603 -+ unsigned short ioprio, new_ioprio;
11604 -+ unsigned short ioprio_class, new_ioprio_class;
11605 -+
11606 -+ int ioprio_changed;
11607 -+};
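As a concrete reading of the comment above, the sketch below computes the finish timestamp from the quoted rule F_i = S_i + budget/weight and applies a pending new_weight lazily, as described. It is a toy userspace model: the real scheduler uses fixed-point scaling for the division and performs the update in its (re)activation paths, and the toy_* names are not kernel symbols.

/*
 * Toy version of the B-WF2Q+ timestamp rule and of the lazy "new_*"
 * weight update described in the struct bfq_entity documentation.
 */
#include <stdio.h>

struct toy_entity {
        unsigned long long start, finish;      /* S_i, F_i */
        unsigned long budget;
        unsigned short weight, new_weight;
        int ioprio_changed;                    /* set when new_* holds a pending change */
};

static void toy_update_weight(struct toy_entity *e)
{
        /* Applied lazily, e.g. on (re)activation, as the comment describes. */
        if (e->ioprio_changed) {
                e->weight = e->new_weight;
                e->ioprio_changed = 0;
        }
}

static void toy_compute_finish(struct toy_entity *e)
{
        toy_update_weight(e);
        e->finish = e->start + e->budget / e->weight;   /* F_i = S_i + budget/weight */
}

int main(void)
{
        struct toy_entity e = {
                .start = 1000, .budget = 8192,
                .weight = 100, .new_weight = 200, .ioprio_changed = 1,
        };

        toy_compute_finish(&e);
        printf("S=%llu F=%llu (weight %u)\n", e.start, e.finish, (unsigned)e.weight);
        return 0;
}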
11608 -+
11609 -+struct bfq_group;
11610 -+
11611 -+/**
11612 -+ * struct bfq_queue - leaf schedulable entity.
11613 -+ * @ref: reference counter.
11614 -+ * @bfqd: parent bfq_data.
11615 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
11616 -+ * one or more other queues.
11617 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
11618 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
11619 -+ * @sort_list: sorted list of pending requests.
11620 -+ * @next_rq: if fifo isn't expired, next request to serve.
11621 -+ * @queued: nr of requests queued in @sort_list.
11622 -+ * @allocated: currently allocated requests.
11623 -+ * @meta_pending: pending metadata requests.
11624 -+ * @fifo: fifo list of requests in sort_list.
11625 -+ * @entity: entity representing this queue in the scheduler.
11626 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
11627 -+ * @budget_timeout: budget expiration (in jiffies).
11628 -+ * @dispatched: number of requests on the dispatch list or inside driver.
11629 -+ * @org_ioprio: saved ioprio during boosted periods.
11630 -+ * @flags: status flags.
11631 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
11632 -+ * @seek_samples: number of seeks sampled
11633 -+ * @seek_total: sum of the distances of the seeks sampled
11634 -+ * @seek_mean: mean seek distance
11635 -+ * @last_request_pos: position of the last request enqueued
11636 -+ * @pid: pid of the process owning the queue, used for logging purposes.
11637 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
11638 -+ * @raising_cur_max_time: current max raising time for this queue
11639 -+ *
11640 -+ * A bfq_queue is a leaf request queue; it can be associated to an io_context
11641 -+ * A bfq_queue is a leaf request queue; it can be associated with one io_context
11642 -+ * cgroup, to be sure that it does not disappear while a bfqq still
11643 -+ * references it (mostly to avoid races between request issuing and task
11644 -+ * migration followed by cgroup distruction).
11645 -+ * migration followed by cgroup destruction).
11646 -+ */
11647 -+struct bfq_queue {
11648 -+ atomic_t ref;
11649 -+ struct bfq_data *bfqd;
11650 -+
11651 -+ /* fields for cooperating queues handling */
11652 -+ struct bfq_queue *new_bfqq;
11653 -+ struct rb_node pos_node;
11654 -+ struct rb_root *pos_root;
11655 -+
11656 -+ struct rb_root sort_list;
11657 -+ struct request *next_rq;
11658 -+ int queued[2];
11659 -+ int allocated[2];
11660 -+ int meta_pending;
11661 -+ struct list_head fifo;
11662 -+
11663 -+ struct bfq_entity entity;
11664 -+
11665 -+ unsigned long max_budget;
11666 -+ unsigned long budget_timeout;
11667 -+
11668 -+ int dispatched;
11669 -+
11670 -+ unsigned short org_ioprio;
11671 -+
11672 -+ unsigned int flags;
11673 -+
11674 -+ struct list_head bfqq_list;
11675 -+
11676 -+ unsigned int seek_samples;
11677 -+ u64 seek_total;
11678 -+ sector_t seek_mean;
11679 -+ sector_t last_request_pos;
11680 -+
11681 -+ pid_t pid;
11682 -+
11683 -+ /* weight-raising fields */
11684 -+ unsigned int raising_cur_max_time;
11685 -+ u64 last_rais_start_finish, soft_rt_next_start;
11686 -+ unsigned int raising_coeff;
11687 -+};
11688 -+
11689 -+/**
11690 -+ * struct bfq_ttime - per process thinktime stats.
11691 -+ * @ttime_total: total process thinktime
11692 -+ * @ttime_samples: number of thinktime samples
11693 -+ * @ttime_mean: average process thinktime
11694 -+ */
11695 -+struct bfq_ttime {
11696 -+ unsigned long last_end_request;
11697 -+
11698 -+ unsigned long ttime_total;
11699 -+ unsigned long ttime_samples;
11700 -+ unsigned long ttime_mean;
11701 -+};
11702 -+
11703 -+/**
11704 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
11705 -+ * @icq: associated io_cq structure
11706 -+ * @bfqq: array of two process queues, the sync and the async
11707 -+ * @ttime: associated @bfq_ttime struct
11708 -+ */
11709 -+struct bfq_io_cq {
11710 -+ struct io_cq icq; /* must be the first member */
11711 -+ struct bfq_queue *bfqq[2];
11712 -+ struct bfq_ttime ttime;
11713 -+ int ioprio;
11714 -+};
11715 -+
11716 -+/**
11717 -+ * struct bfq_data - per device data structure.
11718 -+ * @queue: request queue for the managed device.
11719 -+ * @root_group: root bfq_group for the device.
11720 -+ * @rq_pos_tree: rbtree sorted by next_request position,
11721 -+ * used when determining if two or more queues
11722 -+ * have interleaving requests (see bfq_close_cooperator).
11723 -+ * @busy_queues: number of bfq_queues containing requests (including the
11724 -+ * queue under service, even if it is idling).
11725 -+ * @queued: number of queued requests.
11726 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
11727 -+ * @sync_flight: number of sync requests in the driver.
11728 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
11729 -+ * completed requests.
11730 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
11731 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
11732 -+ * @budgets_assigned: number of budgets assigned.
11733 -+ * @idle_slice_timer: timer set when idling for the next sequential request
11734 -+ * from the queue under service.
11735 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
11736 -+ * @active_queue: bfq_queue under service.
11737 -+ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
11738 -+ * @last_position: on-disk position of the last served request.
11739 -+ * @last_budget_start: beginning of the last budget.
11740 -+ * @last_idling_start: beginning of the last idle slice.
11741 -+ * @peak_rate: peak transfer rate observed for a budget.
11742 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
11743 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
11744 -+ * @group_list: list of all the bfq_groups active on the device.
11745 -+ * @active_list: list of all the bfq_queues active on the device.
11746 -+ * @idle_list: list of all the bfq_queues idle on the device.
11747 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
11748 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
11749 -+ * requests are served in fifo order.
11750 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
11751 -+ * @bfq_back_max: maximum allowed backward seek.
11752 -+ * @bfq_slice_idle: maximum idling time.
11753 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
11754 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
11755 -+ * async queues.
11756 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
11757 -+ * prevent seeky queues from imposing long latencies on well-
11758 -+ * behaved ones (this also implies that seeky queues cannot
11759 -+ * receive guarantees in the service domain; after a timeout
11760 -+ * they are charged for the whole allocated budget, to try
11761 -+ * to preserve a behavior reasonably fair among them, but
11762 -+ * without service-domain guarantees).
11763 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
11764 -+ * queue is multiplied
11765 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
11766 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
11767 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
11768 -+ * may be reactivated for a queue (in jiffies)
11769 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
11770 -+ * after which weight-raising may be
11771 -+ * reactivated for an already busy queue
11772 -+ * (in jiffies)
11773 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
11774 -+ * in sectors per second
11775 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
11776 -+ * duration of the weight raising automatically
11777 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
11778 -+ *
11779 -+ * All the fields are protected by the @queue lock.
11780 -+ */
11781 -+struct bfq_data {
11782 -+ struct request_queue *queue;
11783 -+
11784 -+ struct bfq_group *root_group;
11785 -+
11786 -+ struct rb_root rq_pos_tree;
11787 -+
11788 -+ int busy_queues;
11789 -+ int queued;
11790 -+ int rq_in_driver;
11791 -+ int sync_flight;
11792 -+
11793 -+ int max_rq_in_driver;
11794 -+ int hw_tag_samples;
11795 -+ int hw_tag;
11796 -+
11797 -+ int budgets_assigned;
11798 -+
11799 -+ struct timer_list idle_slice_timer;
11800 -+ struct work_struct unplug_work;
11801 -+
11802 -+ struct bfq_queue *active_queue;
11803 -+ struct bfq_io_cq *active_bic;
11804 -+
11805 -+ sector_t last_position;
11806 -+
11807 -+ ktime_t last_budget_start;
11808 -+ ktime_t last_idling_start;
11809 -+ int peak_rate_samples;
11810 -+ u64 peak_rate;
11811 -+ unsigned long bfq_max_budget;
11812 -+
11813 -+ struct hlist_head group_list;
11814 -+ struct list_head active_list;
11815 -+ struct list_head idle_list;
11816 -+
11817 -+ unsigned int bfq_quantum;
11818 -+ unsigned int bfq_fifo_expire[2];
11819 -+ unsigned int bfq_back_penalty;
11820 -+ unsigned int bfq_back_max;
11821 -+ unsigned int bfq_slice_idle;
11822 -+ u64 bfq_class_idle_last_service;
11823 -+
11824 -+ unsigned int bfq_user_max_budget;
11825 -+ unsigned int bfq_max_budget_async_rq;
11826 -+ unsigned int bfq_timeout[2];
11827 -+
11828 -+ bool low_latency;
11829 -+
11830 -+ /* parameters of the low_latency heuristics */
11831 -+ unsigned int bfq_raising_coeff;
11832 -+ unsigned int bfq_raising_max_time;
11833 -+ unsigned int bfq_raising_rt_max_time;
11834 -+ unsigned int bfq_raising_min_idle_time;
11835 -+ unsigned int bfq_raising_min_inter_arr_async;
11836 -+ unsigned int bfq_raising_max_softrt_rate;
11837 -+ u64 RT_prod;
11838 -+
11839 -+ struct bfq_queue oom_bfqq;
11840 -+};
11841 -+
11842 -+enum bfqq_state_flags {
11843 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
11844 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
11845 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
11846 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
11847 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
11848 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
11849 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
11850 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
11851 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
11852 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
11853 -+ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
11854 -+};
11855 -+
11856 -+#define BFQ_BFQQ_FNS(name) \
11857 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
11858 -+{ \
11859 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
11860 -+} \
11861 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
11862 -+{ \
11863 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
11864 -+} \
11865 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
11866 -+{ \
11867 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
11868 -+}
11869 -+
11870 -+BFQ_BFQQ_FNS(busy);
11871 -+BFQ_BFQQ_FNS(wait_request);
11872 -+BFQ_BFQQ_FNS(must_alloc);
11873 -+BFQ_BFQQ_FNS(fifo_expire);
11874 -+BFQ_BFQQ_FNS(idle_window);
11875 -+BFQ_BFQQ_FNS(prio_changed);
11876 -+BFQ_BFQQ_FNS(sync);
11877 -+BFQ_BFQQ_FNS(budget_new);
11878 -+BFQ_BFQQ_FNS(coop);
11879 -+BFQ_BFQQ_FNS(split_coop);
11880 -+BFQ_BFQQ_FNS(some_coop_idle);
11881 -+#undef BFQ_BFQQ_FNS
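Each BFQ_BFQQ_FNS(name) invocation above expands into a mark/clear/test triple, e.g. bfq_mark_bfqq_busy(), bfq_clear_bfqq_busy() and bfq_bfqq_busy(), which the busy-queue bookkeeping earlier in this patch relies on. The standalone sketch below applies the same macro pattern to a toy struct so it can be compiled and run outside the kernel; the TOY_* names are invented.

/*
 * Same flag-helper pattern on a toy struct, plus a call sequence
 * mirroring bfq_add_bfqq_busy()/bfq_del_bfqq_busy().
 */
#include <stdio.h>

struct toy_queue { unsigned int flags; };

enum toy_state_flags { TOY_FLAG_busy = 0, TOY_FLAG_sync };

#define TOY_FNS(name)                                           \
static inline void toy_mark_##name(struct toy_queue *q)        \
{                                                               \
        q->flags |= (1U << TOY_FLAG_##name);                    \
}                                                               \
static inline void toy_clear_##name(struct toy_queue *q)       \
{                                                               \
        q->flags &= ~(1U << TOY_FLAG_##name);                   \
}                                                               \
static inline int toy_##name(const struct toy_queue *q)        \
{                                                               \
        return (q->flags & (1U << TOY_FLAG_##name)) != 0;       \
}

TOY_FNS(busy)
TOY_FNS(sync)
#undef TOY_FNS

int main(void)
{
        struct toy_queue q = { 0 };

        toy_mark_busy(&q);                      /* queue gets its first request */
        printf("busy=%d sync=%d\n", toy_busy(&q), toy_sync(&q));
        toy_clear_busy(&q);                     /* queue drained */
        printf("busy=%d\n", toy_busy(&q));
        return 0;
}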
11882 -+
11883 -+/* Logging facilities. */
11884 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
11885 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
11886 -+
11887 -+#define bfq_log(bfqd, fmt, args...) \
11888 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
11889 -+
11890 -+/* Expiration reasons. */
11891 -+enum bfqq_expiration {
11892 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
11893 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
11894 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
11895 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
11896 -+};
11897 -+
11898 -+#ifdef CONFIG_CGROUP_BFQIO
11899 -+/**
11900 -+ * struct bfq_group - per (device, cgroup) data structure.
11901 -+ * @entity: schedulable entity to insert into the parent group sched_data.
11902 -+ * @sched_data: own sched_data, to contain child entities (they may be
11903 -+ * both bfq_queues and bfq_groups).
11904 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
11905 -+ * list of the containing cgroup's bfqio_cgroup.
11906 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
11907 -+ * of the groups active on the same device; used for cleanup.
11908 -+ * @bfqd: the bfq_data for the device this group acts upon.
11909 -+ * @async_bfqq: array of async queues for all the tasks belonging to
11910 -+ * the group, one queue per ioprio value per ioprio_class,
11911 -+ * except for the idle class that has only one queue.
11912 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
11913 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
11914 -+ * to avoid too many special cases during group creation/migration.
11915 -+ *
11916 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
11917 -+ * there is a set of bfq_groups, each one collecting the lower-level
11918 -+ * entities belonging to the group that are acting on the same device.
11919 -+ *
11920 -+ * Locking works as follows:
11921 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
11922 -+ * via RCU from its readers.
11923 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
11924 -+ * from the readers.
11925 -+ * o All the other fields are protected by the @bfqd queue lock.
11926 -+ */
11927 -+struct bfq_group {
11928 -+ struct bfq_entity entity;
11929 -+ struct bfq_sched_data sched_data;
11930 -+
11931 -+ struct hlist_node group_node;
11932 -+ struct hlist_node bfqd_node;
11933 -+
11934 -+ void *bfqd;
11935 -+
11936 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
11937 -+ struct bfq_queue *async_idle_bfqq;
11938 -+
11939 -+ struct bfq_entity *my_entity;
11940 -+};
11941 -+
11942 -+/**
11943 -+ * struct bfqio_cgroup - bfq cgroup data structure.
11944 -+ * @css: subsystem state for bfq in the containing cgroup.
11945 -+ * @weight: cgroup weight.
11946 -+ * @ioprio: cgroup ioprio.
11947 -+ * @ioprio_class: cgroup ioprio_class.
11948 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
11949 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
11950 -+ *
11951 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
11952 -+ * @ioprio and @ioprio_class are protected by @lock.
11953 -+ */
11954 -+struct bfqio_cgroup {
11955 -+ struct cgroup_subsys_state css;
11956 -+
11957 -+ unsigned short weight, ioprio, ioprio_class;
11958 -+
11959 -+ spinlock_t lock;
11960 -+ struct hlist_head group_data;
11961 -+};
11962 -+#else
11963 -+struct bfq_group {
11964 -+ struct bfq_sched_data sched_data;
11965 -+
11966 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
11967 -+ struct bfq_queue *async_idle_bfqq;
11968 -+};
11969 -+#endif
11970 -+
11971 -+static inline struct bfq_service_tree *
11972 -+bfq_entity_service_tree(struct bfq_entity *entity)
11973 -+{
11974 -+ struct bfq_sched_data *sched_data = entity->sched_data;
11975 -+ unsigned int idx = entity->ioprio_class - 1;
11976 -+
11977 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
11978 -+ BUG_ON(sched_data == NULL);
11979 -+
11980 -+ return sched_data->service_tree + idx;
11981 -+}
11982 -+
11983 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
11984 -+ int is_sync)
11985 -+{
11986 -+ return bic->bfqq[!!is_sync];
11987 -+}
11988 -+
11989 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
11990 -+ struct bfq_queue *bfqq, int is_sync)
11991 -+{
11992 -+ bic->bfqq[!!is_sync] = bfqq;
11993 -+}
11994 -+
11995 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
11996 -+{
11997 -+ return bic->icq.q->elevator->elevator_data;
11998 -+}
11999 -+
12000 -+/**
12001 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
12002 -+ * @ptr: a pointer to a bfqd.
12003 -+ * @flags: storage for the flags to be saved.
12004 -+ *
12005 -+ * This function allows bfqg->bfqd to be protected by the
12006 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
12007 -+ * under RCU, so the storage for bfqd is assured to be safe as long
12008 -+ * as the RCU read side critical section does not end. After the
12009 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12010 -+ * sure that no other writer accessed it. If we raced with a writer,
12011 -+ * the function returns NULL, with the queue unlocked, otherwise it
12012 -+ * returns the dereferenced pointer, with the queue locked.
12013 -+ */
12014 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12015 -+ unsigned long *flags)
12016 -+{
12017 -+ struct bfq_data *bfqd;
12018 -+
12019 -+ rcu_read_lock();
12020 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12021 -+
12022 -+ if (bfqd != NULL) {
12023 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12024 -+ if (*ptr == bfqd)
12025 -+ goto out;
12026 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12027 -+ }
12028 -+
12029 -+ bfqd = NULL;
12030 -+out:
12031 -+ rcu_read_unlock();
12032 -+ return bfqd;
12033 -+}
12034 -+
12035 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12036 -+ unsigned long *flags)
12037 -+{
12038 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12039 -+}
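bfq_get_bfqd_locked() above follows a dereference-lock-recheck pattern: read the pointer under RCU, take the lock it protects, then confirm the pointer still refers to the same object before trusting it. The userspace sketch below mimics only that control flow, using C11 atomics and a pthread mutex instead of RCU and the queue lock; it does not model the grace period that keeps the object alive in the kernel version, and struct dev and get_dev_locked() are invented names.

/*
 * build: cc -pthread sketch.c
 * Dereference the published pointer, lock the object, recheck that it
 * is still the published one; back off if a writer switched it.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct dev {
        pthread_mutex_t lock;
        int value;
};

static _Atomic(struct dev *) current_dev;

static struct dev *get_dev_locked(void)
{
        struct dev *d = atomic_load(&current_dev);   /* "rcu_dereference" step */

        if (d == NULL)
                return NULL;
        pthread_mutex_lock(&d->lock);
        if (atomic_load(&current_dev) == d)          /* recheck under the lock */
                return d;                            /* still current: return it locked */
        pthread_mutex_unlock(&d->lock);              /* raced with a writer */
        return NULL;
}

int main(void)
{
        struct dev d = { PTHREAD_MUTEX_INITIALIZER, 42 };
        struct dev *got;

        atomic_store(&current_dev, &d);
        got = get_dev_locked();
        if (got) {
                printf("locked dev with value %d\n", got->value);
                pthread_mutex_unlock(&got->lock);
        }
        return 0;
}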
12040 -+
12041 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12042 -+static void bfq_put_queue(struct bfq_queue *bfqq);
12043 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12044 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12045 -+ struct bfq_group *bfqg, int is_sync,
12046 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
12047 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12048 -+ struct bfq_group *bfqg);
12049 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12050 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12051 -+#endif
12052 ---
12053 -1.8.1.4
12054 -
12055
12056 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
12057 ===================================================================
12058 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2014-01-29 14:41:45 UTC (rev 2660)
12059 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2014-01-30 16:49:47 UTC (rev 2661)
12060 @@ -1,1049 +0,0 @@
12061 -From 9acaa783ecab69925d38c6aca7252ff565a093d0 Mon Sep 17 00:00:00 2001
12062 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
12063 -Date: Fri, 14 Jun 2013 13:46:47 +0200
12064 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for
12065 - 3.11.0
12066 -
12067 -A set of processes may happen to perform interleaved reads, i.e., requests
12068 -whose union would give rise to a sequential read pattern. There are two
12069 -typical cases: in the first case, processes read fixed-size chunks of
12070 -data at a fixed distance from each other, while in the second case processes
12071 -may read variable-size chunks at variable distances. The latter case occurs
12072 -for example with KVM, which splits the I/O generated by the guest into
12073 -multiple chunks, and lets these chunks be served by a pool of cooperating
12074 -processes, iteratively assigning the next chunk of I/O to the first
12075 -available process. CFQ uses actual queue merging for the first type of
12076 -processes, whereas it uses preemption to get a sequential read pattern out
12077 -of the read requests performed by the second type of processes. In the end
12078 -it uses two different mechanisms to achieve the same goal: boosting the
12079 -throughput with interleaved I/O.
12080 -
12081 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
12082 -sequential read pattern with both types of processes. The main idea is
12083 -checking newly arrived requests against the next request of the active queue
12084 -both in case of actual request insert and in case of request merge. By doing
12085 -so, both the types of processes can be handled by just merging their queues.
12086 -EQM is then simpler and more compact than the pair of mechanisms used in
12087 -CFQ.
12088 -
12089 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
12090 -properly restoring the weight-raising state of a queue when it gets back to
12091 -a non-merged state.
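As a toy illustration of the interleaved-read case described above (not taken from the patch), the snippet below builds two request streams at a fixed chunk size: each stream alone skips every other chunk, but their union covers consecutive offsets, which is the pattern EQM detects by comparing a newly arrived request against the next request of the active queue.

/*
 * Two workers reading alternating fixed-size chunks: neither stream is
 * sequential on its own, their union is.
 */
#include <stdio.h>

int main(void)
{
        const unsigned long chunk = 128;                 /* sectors per request */
        unsigned long a[4], b[4];

        for (int i = 0; i < 4; i++) {
                a[i] = (unsigned long)(2 * i) * chunk;       /* worker A: 0, 256, 512, ... */
                b[i] = (unsigned long)(2 * i + 1) * chunk;   /* worker B: 128, 384, ...   */
        }
        for (int i = 0; i < 4; i++)
                printf("union: %lu %lu\n", a[i], b[i]);      /* 0 128 256 384 ... */
        return 0;
}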
12092 -
12093 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
12094 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
12095 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
12096 ----
12097 - block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++----------------
12098 - block/bfq-sched.c | 28 ---
12099 - block/bfq.h | 16 ++
12100 - 3 files changed, 466 insertions(+), 231 deletions(-)
12101 -
12102 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
12103 -index 0ed2746..bbe79fb 100644
12104 ---- a/block/bfq-iosched.c
12105 -+++ b/block/bfq-iosched.c
12106 -@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
12107 - return dur;
12108 - }
12109 -
12110 -+static inline void
12111 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
12112 -+{
12113 -+ if (bic->saved_idle_window)
12114 -+ bfq_mark_bfqq_idle_window(bfqq);
12115 -+ else
12116 -+ bfq_clear_bfqq_idle_window(bfqq);
12117 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
12118 -+ /*
12119 -+ * Start a weight raising period with the duration given by
12120 -+ * the raising_time_left snapshot.
12121 -+ */
12122 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
12123 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
12124 -+ bfqq->last_rais_start_finish = jiffies;
12125 -+ }
12126 -+ /*
12127 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
12128 -+ * getting confused about the queue's need of a weight-raising
12129 -+ * period.
12130 -+ */
12131 -+ bic->raising_time_left = 0;
12132 -+}
12133 -+
12134 -+/*
12135 -+ * Must be called with the queue_lock held.
12136 -+ */
12137 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
12138 -+{
12139 -+ int process_refs, io_refs;
12140 -+
12141 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12142 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12143 -+ BUG_ON(process_refs < 0);
12144 -+ return process_refs;
12145 -+}
12146 -+
12147 - static void bfq_add_rq_rb(struct request *rq)
12148 - {
12149 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
12150 -@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq)
12151 - if (! bfqd->low_latency)
12152 - goto add_bfqq_busy;
12153 -
12154 -+ if (bfq_bfqq_just_split(bfqq))
12155 -+ goto set_ioprio_changed;
12156 -+
12157 - /*
12158 -- * If the queue is not being boosted and has been idle
12159 -- * for enough time, start a weight-raising period
12160 -+ * If the queue:
12161 -+ * - is not being boosted,
12162 -+ * - has been idle for enough time,
12163 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
12164 -+ * shared "for its nature" or it is not shared and its
12165 -+ * requests have not been redirected to a shared queue)
12166 -+ * start a weight-raising period.
12167 - */
12168 -- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
12169 -+ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
12170 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
12171 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
12172 - if (idle_for_long_time)
12173 - bfqq->raising_cur_max_time =
12174 -@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq)
12175 - raising_cur_max_time));
12176 - }
12177 - }
12178 -+set_ioprio_changed:
12179 - if (old_raising_coeff != bfqq->raising_coeff)
12180 - entity->ioprio_changed = 1;
12181 - add_bfqq_busy:
12182 -@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
12183 - spin_unlock_irq(bfqd->queue->queue_lock);
12184 - }
12185 -
12186 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12187 -- struct bio *bio)
12188 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
12189 - {
12190 -- struct bfq_data *bfqd = q->elevator->elevator_data;
12191 -- struct bfq_io_cq *bic;
12192 -- struct bfq_queue *bfqq;
12193 --
12194 -- /*
12195 -- * Disallow merge of a sync bio into an async request.
12196 -- */
12197 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12198 -- return 0;
12199 --
12200 -- /*
12201 -- * Lookup the bfqq that this bio will be queued with. Allow
12202 -- * merge only if rq is queued there.
12203 -- * Queue lock is held here.
12204 -- */
12205 -- bic = bfq_bic_lookup(bfqd, current->io_context);
12206 -- if (bic == NULL)
12207 -- return 0;
12208 --
12209 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12210 -- return bfqq == RQ_BFQQ(rq);
12211 --}
12212 --
12213 --static void __bfq_set_active_queue(struct bfq_data *bfqd,
12214 -- struct bfq_queue *bfqq)
12215 --{
12216 -- if (bfqq != NULL) {
12217 -- bfq_mark_bfqq_must_alloc(bfqq);
12218 -- bfq_mark_bfqq_budget_new(bfqq);
12219 -- bfq_clear_bfqq_fifo_expire(bfqq);
12220 --
12221 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12222 --
12223 -- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
12224 -- bfqq->entity.budget);
12225 -- }
12226 --
12227 -- bfqd->active_queue = bfqq;
12228 --}
12229 --
12230 --/*
12231 -- * Get and set a new active queue for service.
12232 -- */
12233 --static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
12234 -- struct bfq_queue *bfqq)
12235 --{
12236 -- if (!bfqq)
12237 -- bfqq = bfq_get_next_queue(bfqd);
12238 -+ if (request)
12239 -+ return blk_rq_pos(io_struct);
12240 - else
12241 -- bfq_get_next_queue_forced(bfqd, bfqq);
12242 --
12243 -- __bfq_set_active_queue(bfqd, bfqq);
12244 -- return bfqq;
12245 -+ return ((struct bio *)io_struct)->bi_sector;
12246 - }
12247 -
12248 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
12249 -- struct request *rq)
12250 -+static inline sector_t bfq_dist_from(sector_t pos1,
12251 -+ sector_t pos2)
12252 - {
12253 -- if (blk_rq_pos(rq) >= bfqd->last_position)
12254 -- return blk_rq_pos(rq) - bfqd->last_position;
12255 -+ if (pos1 >= pos2)
12256 -+ return pos1 - pos2;
12257 - else
12258 -- return bfqd->last_position - blk_rq_pos(rq);
12259 -+ return pos2 - pos1;
12260 - }
12261 -
12262 --/*
12263 -- * Return true if bfqq has no request pending and rq is close enough to
12264 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
12265 -- * bfqq->next_rq
12266 -- */
12267 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
12268 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
12269 -+ sector_t sector)
12270 - {
12271 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
12272 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
12273 -+ BFQQ_SEEK_THR;
12274 - }
12275 -
12276 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12277 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
12278 - {
12279 - struct rb_root *root = &bfqd->rq_pos_tree;
12280 - struct rb_node *parent, *node;
12281 - struct bfq_queue *__bfqq;
12282 -- sector_t sector = bfqd->last_position;
12283 -
12284 - if (RB_EMPTY_ROOT(root))
12285 - return NULL;
12286 -@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12287 - * position).
12288 - */
12289 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
12290 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12291 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12292 - return __bfqq;
12293 -
12294 - if (blk_rq_pos(__bfqq->next_rq) < sector)
12295 -@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12296 - return NULL;
12297 -
12298 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
12299 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12300 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12301 - return __bfqq;
12302 -
12303 - return NULL;
12304 -@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12305 - /*
12306 - * bfqd - obvious
12307 - * cur_bfqq - passed in so that we don't decide that the current queue
12308 -- * is closely cooperating with itself.
12309 -- *
12310 -- * We are assuming that cur_bfqq has dispatched at least one request,
12311 -- * and that bfqd->last_position reflects a position on the disk associated
12312 -- * with the I/O issued by cur_bfqq.
12313 -+ * is closely cooperating with itself
12314 -+ * sector - used as a reference point to search for a close queue
12315 - */
12316 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12317 -- struct bfq_queue *cur_bfqq)
12318 -+ struct bfq_queue *cur_bfqq,
12319 -+ sector_t sector)
12320 - {
12321 - struct bfq_queue *bfqq;
12322 -
12323 -@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12324 - * working closely on the same area of the disk. In that case,
12325 - * we can group them together and don't waste time idling.
12326 - */
12327 -- bfqq = bfqq_close(bfqd);
12328 -+ bfqq = bfqq_close(bfqd, sector);
12329 - if (bfqq == NULL || bfqq == cur_bfqq)
12330 - return NULL;
12331 -
12332 -@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12333 - return bfqq;
12334 - }
12335 -
12336 -+static struct bfq_queue *
12337 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12338 -+{
12339 -+ int process_refs, new_process_refs;
12340 -+ struct bfq_queue *__bfqq;
12341 -+
12342 -+ /*
12343 -+ * If there are no process references on the new_bfqq, then it is
12344 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12345 -+ * may have dropped their last reference (not just their last process
12346 -+ * reference).
12347 -+ */
12348 -+ if (!bfqq_process_refs(new_bfqq))
12349 -+ return NULL;
12350 -+
12351 -+ /* Avoid a circular list and skip interim queue merges. */
12352 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
12353 -+ if (__bfqq == bfqq)
12354 -+ return NULL;
12355 -+ new_bfqq = __bfqq;
12356 -+ }
12357 -+
12358 -+ process_refs = bfqq_process_refs(bfqq);
12359 -+ new_process_refs = bfqq_process_refs(new_bfqq);
12360 -+ /*
12361 -+ * If the process for the bfqq has gone away, there is no
12362 -+ * sense in merging the queues.
12363 -+ */
12364 -+ if (process_refs == 0 || new_process_refs == 0)
12365 -+ return NULL;
12366 -+
12367 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12368 -+ new_bfqq->pid);
12369 -+
12370 -+ /*
12371 -+ * Merging is just a redirection: the requests of the process owning
12372 -+ * one of the two queues are redirected to the other queue. The latter
12373 -+ * queue, in its turn, is set as shared if this is the first time that
12374 -+ * the requests of some process are redirected to it.
12375 -+ *
12376 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
12377 -+ * are in the context of the process owning bfqq, hence we have the
12378 -+ * io_cq of this process. So we can immediately configure this io_cq
12379 -+ * to redirect the requests of the process to new_bfqq.
12380 -+ *
12381 -+ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of
12382 -+ * new_bfqq is not available, because, if the active queue is shared,
12383 -+ * bfqd->active_bic may not point to the io_cq of the active queue.
12384 -+ * Redirecting the requests of the process owning bfqq to the currently
12385 -+ * active queue is in any case the best option, as we feed the active queue
12386 -+ * with new requests close to the last request served and, by doing so,
12387 -+ * hopefully increase the throughput.
12388 -+ */
12389 -+ bfqq->new_bfqq = new_bfqq;
12390 -+ atomic_add(process_refs, &new_bfqq->ref);
12391 -+ return new_bfqq;
12392 -+}
12393 -+
12394 -+/*
12395 -+ * Attempt to schedule a merge of bfqq with the currently active queue or
12396 -+ * with a close queue among the scheduled queues.
12397 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
12398 -+ * structure otherwise.
12399 -+ */
12400 -+static struct bfq_queue *
12401 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12402 -+ void *io_struct, bool request)
12403 -+{
12404 -+ struct bfq_queue *active_bfqq, *new_bfqq;
12405 -+
12406 -+ if (bfqq->new_bfqq)
12407 -+ return bfqq->new_bfqq;
12408 -+
12409 -+ if (!io_struct)
12410 -+ return NULL;
12411 -+
12412 -+ active_bfqq = bfqd->active_queue;
12413 -+
12414 -+ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic)
12415 -+ goto check_scheduled;
12416 -+
12417 -+ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq))
12418 -+ goto check_scheduled;
12419 -+
12420 -+ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq))
12421 -+ goto check_scheduled;
12422 -+
12423 -+ if (active_bfqq->entity.parent != bfqq->entity.parent)
12424 -+ goto check_scheduled;
12425 -+
12426 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
12427 -+ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq))
12428 -+ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq)))
12429 -+ return new_bfqq; /* Merge with the active queue */
12430 -+
12431 -+ /*
12432 -+ * Check whether there is a cooperator among currently scheduled
12433 -+ * queues. The only thing we need is that the bio/request is not
12434 -+ * NULL, as we need it to establish whether a cooperator exists.
12435 -+ */
12436 -+check_scheduled:
12437 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
12438 -+ bfq_io_struct_pos(io_struct, request));
12439 -+ if (new_bfqq)
12440 -+ return bfq_setup_merge(bfqq, new_bfqq);
12441 -+
12442 -+ return NULL;
12443 -+}
12444 -+
12445 -+static inline void
12446 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
12447 -+{
12448 -+ /*
12449 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
12450 -+ * have already been redirected to a shared queue; both idle window
12451 -+ * and weight raising state have already been saved. Do nothing.
12452 -+ */
12453 -+ if (bfqq->bic == NULL)
12454 -+ return;
12455 -+ if (bfqq->bic->raising_time_left)
12456 -+ /*
12457 -+ * This is the queue of a just-started process, and would
12458 -+ * deserve weight raising: we set raising_time_left to the full
12459 -+ * weight-raising duration to trigger weight-raising when and
12460 -+ * if the queue is split and the first request of the queue
12461 -+ * is enqueued.
12462 -+ */
12463 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
12464 -+ else if (bfqq->raising_coeff > 1) {
12465 -+ unsigned long wrais_duration =
12466 -+ jiffies - bfqq->last_rais_start_finish;
12467 -+ /*
12468 -+ * It may happen that a queue's weight raising period lasts
12469 -+ * longer than its raising_cur_max_time, as weight raising is
12470 -+ * handled only when a request is enqueued or dispatched (it
12471 -+ * does not use any timer). If the weight raising period is
12472 -+ * about to end, don't save it.
12473 -+ */
12474 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
12475 -+ bfqq->bic->raising_time_left = 0;
12476 -+ else
12477 -+ bfqq->bic->raising_time_left =
12478 -+ bfqq->raising_cur_max_time - wrais_duration;
12479 -+ /*
12480 -+ * The bfq_queue is becoming shared or the requests of the
12481 -+ * process owning the queue are being redirected to a shared
12482 -+ * queue. Stop the weight raising period of the queue, as in
12483 -+ * both cases it should not be owned by an interactive or soft
12484 -+ * real-time application.
12485 -+ */
12486 -+ bfq_bfqq_end_raising(bfqq);
12487 -+ } else
12488 -+ bfqq->bic->raising_time_left = 0;
12489 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
12490 -+}
12491 -+
12492 -+static inline void
12493 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
12494 -+{
12495 -+ /*
12496 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
12497 -+ * is about to begin using a shared bfq_queue.
12498 -+ */
12499 -+ if (bfqq->bic)
12500 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
12501 -+}
12502 -+
12503 -+static void
12504 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
12505 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12506 -+{
12507 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
12508 -+ (long unsigned)new_bfqq->pid);
12509 -+ /* Save weight raising and idle window of the merged queues */
12510 -+ bfq_bfqq_save_state(bfqq);
12511 -+ bfq_bfqq_save_state(new_bfqq);
12512 -+ /*
12513 -+ * Grab a reference to the bic, to prevent it from being destroyed
12514 -+ * before being possibly touched by a bfq_split_bfqq().
12515 -+ */
12516 -+ bfq_get_bic_reference(bfqq);
12517 -+ bfq_get_bic_reference(new_bfqq);
12518 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
12519 -+ bic_set_bfqq(bic, new_bfqq, 1);
12520 -+ bfq_mark_bfqq_coop(new_bfqq);
12521 -+ /*
12522 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
12523 -+ * new_bfqq->bic to NULL. bfqq either:
12524 -+ * - does not belong to any bic any more, and hence bfqq->bic must
12525 -+ * be set to NULL, or
12526 -+ * - is a queue whose owning bics have already been redirected to a
12527 -+ * different queue, hence the queue is destined to not belong to any
12528 -+ * bic soon and bfqq->bic is already NULL (therefore the next
12529 -+ * assignment causes no harm).
12530 -+ */
12531 -+ new_bfqq->bic = NULL;
12532 -+ bfqq->bic = NULL;
12533 -+ bfq_put_queue(bfqq);
12534 -+}
12535 -+
12536 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12537 -+ struct bio *bio)
12538 -+{
12539 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
12540 -+ struct bfq_io_cq *bic;
12541 -+ struct bfq_queue *bfqq, *new_bfqq;
12542 -+
12543 -+ /*
12544 -+ * Disallow merge of a sync bio into an async request.
12545 -+ */
12546 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12547 -+ return 0;
12548 -+
12549 -+ /*
12550 -+ * Lookup the bfqq that this bio will be queued with. Allow
12551 -+ * merge only if rq is queued there.
12552 -+ * Queue lock is held here.
12553 -+ */
12554 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
12555 -+ if (bic == NULL)
12556 -+ return 0;
12557 -+
12558 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12559 -+ /*
12560 -+ * We take advantage of this function to perform an early merge
12561 -+ * of the queues of possible cooperating processes.
12562 -+ */
12563 -+ if (bfqq != NULL &&
12564 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) {
12565 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
12566 -+ /*
12567 -+ * If we get here, the bio will be queued in the shared queue,
12568 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
12569 -+ * rq can be merged.
12570 -+ */
12571 -+ bfqq = new_bfqq;
12572 -+ }
12573 -+
12574 -+ return bfqq == RQ_BFQQ(rq);
12575 -+}
12576 -+
12577 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
12578 -+ struct bfq_queue *bfqq)
12579 -+{
12580 -+ if (bfqq != NULL) {
12581 -+ bfq_mark_bfqq_must_alloc(bfqq);
12582 -+ bfq_mark_bfqq_budget_new(bfqq);
12583 -+ bfq_clear_bfqq_fifo_expire(bfqq);
12584 -+
12585 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12586 -+
12587 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
12588 -+ bfqq->entity.budget);
12589 -+ }
12590 -+
12591 -+ bfqd->active_queue = bfqq;
12592 -+}
12593 -+
12594 -+/*
12595 -+ * Get and set a new active queue for service.
12596 -+ */
12597 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
12598 -+{
12599 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
12600 -+
12601 -+ __bfq_set_active_queue(bfqd, bfqq);
12602 -+ return bfqq;
12603 -+}
12604 -+
12605 - /*
12606 - * If enough samples have been computed, return the current max budget
12607 - * stored in bfqd, which is dynamically updated according to the
12608 -@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
12609 - return rq;
12610 - }
12611 -
12612 --/*
12613 -- * Must be called with the queue_lock held.
12614 -- */
12615 --static int bfqq_process_refs(struct bfq_queue *bfqq)
12616 --{
12617 -- int process_refs, io_refs;
12618 --
12619 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12620 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12621 -- BUG_ON(process_refs < 0);
12622 -- return process_refs;
12623 --}
12624 --
12625 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12626 --{
12627 -- int process_refs, new_process_refs;
12628 -- struct bfq_queue *__bfqq;
12629 --
12630 -- /*
12631 -- * If there are no process references on the new_bfqq, then it is
12632 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12633 -- * may have dropped their last reference (not just their last process
12634 -- * reference).
12635 -- */
12636 -- if (!bfqq_process_refs(new_bfqq))
12637 -- return;
12638 --
12639 -- /* Avoid a circular list and skip interim queue merges. */
12640 -- while ((__bfqq = new_bfqq->new_bfqq)) {
12641 -- if (__bfqq == bfqq)
12642 -- return;
12643 -- new_bfqq = __bfqq;
12644 -- }
12645 --
12646 -- process_refs = bfqq_process_refs(bfqq);
12647 -- new_process_refs = bfqq_process_refs(new_bfqq);
12648 -- /*
12649 -- * If the process for the bfqq has gone away, there is no
12650 -- * sense in merging the queues.
12651 -- */
12652 -- if (process_refs == 0 || new_process_refs == 0)
12653 -- return;
12654 --
12655 -- /*
12656 -- * Merge in the direction of the lesser amount of work.
12657 -- */
12658 -- if (new_process_refs >= process_refs) {
12659 -- bfqq->new_bfqq = new_bfqq;
12660 -- atomic_add(process_refs, &new_bfqq->ref);
12661 -- } else {
12662 -- new_bfqq->new_bfqq = bfqq;
12663 -- atomic_add(new_process_refs, &bfqq->ref);
12664 -- }
12665 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12666 -- new_bfqq->pid);
12667 --}
12668 --
12669 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
12670 - {
12671 - struct bfq_entity *entity = &bfqq->entity;
12672 -@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
12673 - * is likely to boost the disk throughput);
12674 - * - the queue is weight-raised (waiting for the request is necessary for
12675 - * providing the queue with fairness and latency guarantees).
12676 -+ *
12677 -+ * In any case, idling can be disabled for cooperation issues, if
12678 -+ * 1) there is a close cooperator for the queue, or
12679 -+ * 2) the queue is shared and some cooperator is likely to be idle (in this
12680 -+ * case, by not arming the idle timer, we try to slow down the queue, to
12681 -+ * prevent the zones of the disk accessed by the active cooperators to
12682 -+ * become too distant from the zone that will be accessed by the currently
12683 -+ * idle cooperators).
12684 - */
12685 - static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
12686 - int budg_timeout)
12687 -@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
12688 - (bfqd->rq_in_driver == 0 ||
12689 - budg_timeout ||
12690 - bfqq->raising_coeff > 1) &&
12691 -- !bfq_close_cooperator(bfqd, bfqq) &&
12692 -+ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) &&
12693 - (!bfq_bfqq_coop(bfqq) ||
12694 - !bfq_bfqq_some_coop_idle(bfqq)) &&
12695 - !bfq_queue_nonrot_noidle(bfqd, bfqq));
12696 -@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
12697 - */
12698 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12699 - {
12700 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
12701 -+ struct bfq_queue *bfqq;
12702 - struct request *next_rq;
12703 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
12704 - int budg_timeout;
12705 -@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12706 -
12707 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
12708 -
12709 -- /*
12710 -- * If another queue has a request waiting within our mean seek
12711 -- * distance, let it run. The expire code will check for close
12712 -- * cooperators and put the close queue at the front of the
12713 -- * service tree. If possible, merge the expiring queue with the
12714 -- * new bfqq.
12715 -- */
12716 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
12717 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
12718 -- bfq_setup_merge(bfqq, new_bfqq);
12719 --
12720 - budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
12721 - if (budg_timeout &&
12722 - !bfq_bfqq_must_idle(bfqq, budg_timeout))
12723 -@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12724 - bfq_clear_bfqq_wait_request(bfqq);
12725 - del_timer(&bfqd->idle_slice_timer);
12726 - }
12727 -- if (new_bfqq == NULL)
12728 -- goto keep_queue;
12729 -- else
12730 -- goto expire;
12731 -+ goto keep_queue;
12732 - }
12733 - }
12734 -
12735 -@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12736 - * queue still has requests in flight or is idling for a new request,
12737 - * then keep it.
12738 - */
12739 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
12740 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
12741 - (bfqq->dispatched != 0 &&
12742 - (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
12743 -- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
12744 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
12745 - bfqq = NULL;
12746 - goto keep_queue;
12747 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
12748 -- /*
12749 -- * Expiring the queue because there is a close cooperator,
12750 -- * cancel timer.
12751 -- */
12752 -- bfq_clear_bfqq_wait_request(bfqq);
12753 -- del_timer(&bfqd->idle_slice_timer);
12754 - }
12755 -
12756 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
12757 - expire:
12758 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
12759 - new_queue:
12760 -- bfqq = bfq_set_active_queue(bfqd, new_bfqq);
12761 -+ bfqq = bfq_set_active_queue(bfqd);
12762 - bfq_log(bfqd, "select_queue: new queue %d returned",
12763 - bfqq != NULL ? bfqq->pid : 0);
12764 - keep_queue:
12765 -@@ -1617,9 +1807,8 @@ keep_queue:
12766 -
12767 - static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
12768 - {
12769 -+ struct bfq_entity *entity = &bfqq->entity;
12770 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
12771 -- struct bfq_entity *entity = &bfqq->entity;
12772 --
12773 - bfq_log_bfqq(bfqd, bfqq,
12774 - "raising period dur %u/%u msec, "
12775 - "old raising coeff %u, w %d(%d)",
12776 -@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
12777 - jiffies_to_msecs(bfqq->
12778 - raising_cur_max_time));
12779 - bfq_bfqq_end_raising(bfqq);
12780 -- __bfq_entity_update_weight_prio(
12781 -- bfq_entity_service_tree(entity),
12782 -- entity);
12783 - }
12784 - }
12785 - }
12786 -+ /* Update weight both if it must be raised and if it must be lowered */
12787 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
12788 -+ __bfq_entity_update_weight_prio(
12789 -+ bfq_entity_service_tree(entity),
12790 -+ entity);
12791 - }
12792 -
12793 - /*
12794 -@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq)
12795 - struct bfq_io_cq *bic = icq_to_bic(icq);
12796 -
12797 - bic->ttime.last_end_request = jiffies;
12798 -+ /*
12799 -+ * A newly created bic indicates that the process has just
12800 -+ * started doing I/O, and is probably mapping into memory its
12801 -+ * executable and libraries: it definitely needs weight raising.
12802 -+ * There is however the possibility that the process performs,
12803 -+ * for a while, I/O close to some other process. EQM intercepts
12804 -+ * this behavior and may merge the queue corresponding to the
12805 -+ * process with some other queue, BEFORE the weight of the queue
12806 -+ * is raised. Merged queues are not weight-raised (they are assumed
12807 -+ * to belong to processes that benefit only from high throughput).
12808 -+ * If the merge is basically the consequence of an accident, then
12809 -+ * the queue will be split soon and will get back its old weight.
12810 -+ * It is then important to write down somewhere that this queue
12811 -+ * does need weight raising, even if it did not make it to get its
12812 -+ * weight raised before being merged. To this purpose, we overload
12813 -+ * the field raising_time_left and assign 1 to it, to mark the queue
12814 -+ * as needing weight raising.
12815 -+ */
12816 -+ bic->raising_time_left = 1;
12817 - }
12818 -
12819 - static void bfq_exit_icq(struct io_cq *icq)
12820 -@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq)
12821 - }
12822 -
12823 - if (bic->bfqq[BLK_RW_SYNC]) {
12824 -+ /*
12825 -+ * If the bic is using a shared queue, put the reference
12826 -+ * taken on the io_context when the bic started using a
12827 -+ * shared bfq_queue.
12828 -+ */
12829 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
12830 -+ put_io_context(icq->ioc);
12831 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
12832 - bic->bfqq[BLK_RW_SYNC] = NULL;
12833 - }
12834 -@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
12835 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
12836 - return;
12837 -
12838 -+ /* Idle window just restored, statistics are meaningless. */
12839 -+ if (bfq_bfqq_just_split(bfqq))
12840 -+ return;
12841 -+
12842 - enable_idle = bfq_bfqq_idle_window(bfqq);
12843 -
12844 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
12845 -@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12846 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
12847 - !BFQQ_SEEKY(bfqq))
12848 - bfq_update_idle_window(bfqd, bfqq, bic);
12849 -+ bfq_clear_bfqq_just_split(bfqq);
12850 -
12851 - bfq_log_bfqq(bfqd, bfqq,
12852 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
12853 -@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12854 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
12855 - {
12856 - struct bfq_data *bfqd = q->elevator->elevator_data;
12857 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
12858 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
12859 -
12860 - assert_spin_locked(bfqd->queue->queue_lock);
12861 -+
12862 -+ /*
12863 -+ * An unplug may trigger a requeue of a request from the device
12864 -+ * driver: make sure we are in process context while trying to
12865 -+ * merge two bfq_queues.
12866 -+ */
12867 -+ if (!in_interrupt() &&
12868 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) {
12869 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
12870 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
12871 -+ /*
12872 -+ * Release the request's reference to the old bfqq
12873 -+ * and make sure one is taken to the shared queue.
12874 -+ */
12875 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
12876 -+ bfqq->allocated[rq_data_dir(rq)]--;
12877 -+ atomic_inc(&new_bfqq->ref);
12878 -+ bfq_put_queue(bfqq);
12879 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
12880 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq);
12881 -+ rq->elv.priv[1] = new_bfqq;
12882 -+ bfqq = new_bfqq;
12883 -+ }
12884 -+
12885 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
12886 -
12887 - bfq_add_rq_rb(rq);
12888 -
12889 -+ /*
12890 -+ * Here a newly-created bfq_queue has already started a weight-raising
12891 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
12892 -+ * from assigning it a full weight-raising period. See the detailed
12893 -+ * comments about this field in bfq_init_icq().
12894 -+ */
12895 -+ if (bfqq->bic != NULL)
12896 -+ bfqq->bic->raising_time_left = 0;
12897 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
12898 - list_add_tail(&rq->queuelist, &bfqq->fifo);
12899 -
12900 -@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
12901 - if (bfq_bfqq_budget_new(bfqq))
12902 - bfq_set_budget_timeout(bfqd);
12903 -
12904 -- /* Idling is disabled also for cooperation issues:
12905 -- * 1) there is a close cooperator for the queue, or
12906 -- * 2) the queue is shared and some cooperator is likely
12907 -- * to be idle (in this case, by not arming the idle timer,
12908 -- * we try to slow down the queue, to prevent the zones
12909 -- * of the disk accessed by the active cooperators to become
12910 -- * too distant from the zone that will be accessed by the
12911 -- * currently idle cooperators)
12912 -- */
12913 - if (bfq_bfqq_must_idle(bfqq, budg_timeout))
12914 - bfq_arm_slice_timer(bfqd);
12915 - else if (budg_timeout)
12916 -@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq)
12917 - }
12918 - }
12919 -
12920 --static struct bfq_queue *
12921 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
12922 -- struct bfq_queue *bfqq)
12923 --{
12924 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
12925 -- (long unsigned)bfqq->new_bfqq->pid);
12926 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
12927 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
12928 -- bfq_put_queue(bfqq);
12929 -- return bic_to_bfqq(bic, 1);
12930 --}
12931 --
12932 - /*
12933 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
12934 - * was the last process referring to said bfqq.
12935 -@@ -2469,6 +2702,9 @@ static struct bfq_queue *
12936 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
12937 - {
12938 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
12939 -+
12940 -+ put_io_context(bic->icq.ioc);
12941 -+
12942 - if (bfqq_process_refs(bfqq) == 1) {
12943 - bfqq->pid = current->pid;
12944 - bfq_clear_bfqq_some_coop_idle(bfqq);
12945 -@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
12946 - struct bfq_queue *bfqq;
12947 - struct bfq_group *bfqg;
12948 - unsigned long flags;
12949 -+ bool split = false;
12950 -
12951 - might_sleep_if(gfp_mask & __GFP_WAIT);
12952 -
12953 -@@ -2516,24 +2753,14 @@ new_queue:
12954 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
12955 - bic_set_bfqq(bic, bfqq, is_sync);
12956 - } else {
12957 -- /*
12958 -- * If the queue was seeky for too long, break it apart.
12959 -- */
12960 -+ /* If the queue was seeky for too long, break it apart. */
12961 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
12962 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
12963 - bfqq = bfq_split_bfqq(bic, bfqq);
12964 -+ split = true;
12965 - if (!bfqq)
12966 - goto new_queue;
12967 - }
12968 --
12969 -- /*
12970 -- * Check to see if this queue is scheduled to merge with
12971 -- * another closely cooperating queue. The merging of queues
12972 -- * happens here as it must be done in process context.
12973 -- * The reference on new_bfqq was taken in merge_bfqqs.
12974 -- */
12975 -- if (bfqq->new_bfqq != NULL)
12976 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
12977 - }
12978 -
12979 - bfqq->allocated[rw]++;
12980 -@@ -2544,6 +2771,26 @@ new_queue:
12981 - rq->elv.priv[0] = bic;
12982 - rq->elv.priv[1] = bfqq;
12983 -
12984 -+ /*
12985 -+ * If a bfq_queue has only one process reference, it is owned
12986 -+ * by only one bfq_io_cq: we can set the bic field of the
12987 -+ * bfq_queue to the address of that structure. Also, if the
12988 -+ * queue has just been split, mark a flag so that the
12989 -+ * information is available to the other scheduler hooks.
12990 -+ */
12991 -+ if (bfqq_process_refs(bfqq) == 1) {
12992 -+ bfqq->bic = bic;
12993 -+ if (split) {
12994 -+ bfq_mark_bfqq_just_split(bfqq);
12995 -+ /*
12996 -+ * If the queue has just been split from a shared queue,
12997 -+ * restore the idle window and the possible weight
12998 -+ * raising period.
12999 -+ */
13000 -+ bfq_bfqq_resume_state(bfqq, bic);
13001 -+ }
13002 -+ }
13003 -+
13004 - spin_unlock_irqrestore(q->queue_lock, flags);
13005 -
13006 - return 0;
13007 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
13008 -index 03f8061..a0edaa2 100644
13009 ---- a/block/bfq-sched.c
13010 -+++ b/block/bfq-sched.c
13011 -@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
13012 - return bfqq;
13013 - }
13014 -
13015 --/*
13016 -- * Forced extraction of the given queue.
13017 -- */
13018 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
13019 -- struct bfq_queue *bfqq)
13020 --{
13021 -- struct bfq_entity *entity;
13022 -- struct bfq_sched_data *sd;
13023 --
13024 -- BUG_ON(bfqd->active_queue != NULL);
13025 --
13026 -- entity = &bfqq->entity;
13027 -- /*
13028 -- * Bubble up extraction/update from the leaf to the root.
13029 -- */
13030 -- for_each_entity(entity) {
13031 -- sd = entity->sched_data;
13032 -- bfq_update_budget(entity);
13033 -- bfq_update_vtime(bfq_entity_service_tree(entity));
13034 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
13035 -- sd->active_entity = entity;
13036 -- sd->next_active = NULL;
13037 -- entity->service = 0;
13038 -- }
13039 --
13040 -- return;
13041 --}
13042 --
13043 - static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
13044 - {
13045 - if (bfqd->active_bic != NULL) {
13046 -diff --git a/block/bfq.h b/block/bfq.h
13047 -index 48ecde9..bb52975 100644
13048 ---- a/block/bfq.h
13049 -+++ b/block/bfq.h
13050 -@@ -188,6 +188,8 @@ struct bfq_group;
13051 - * @pid: pid of the process owning the queue, used for logging purposes.
13052 - * @last_rais_start_time: last (idle -> weight-raised) transition attempt
13053 - * @raising_cur_max_time: current max raising time for this queue
13054 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
13055 -+ * queue is shared
13056 - *
13057 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
13058 - * or more (if it is an async one). @cgroup holds a reference to the
13059 -@@ -231,6 +233,7 @@ struct bfq_queue {
13060 - sector_t last_request_pos;
13061 -
13062 - pid_t pid;
13063 -+ struct bfq_io_cq *bic;
13064 -
13065 - /* weight-raising fields */
13066 - unsigned int raising_cur_max_time;
13067 -@@ -257,12 +260,23 @@ struct bfq_ttime {
13068 - * @icq: associated io_cq structure
13069 - * @bfqq: array of two process queues, the sync and the async
13070 - * @ttime: associated @bfq_ttime struct
13071 -+ * @raising_time_left: snapshot of the time left before weight raising ends
13072 -+ * for the sync queue associated to this process; this
13073 -+ * snapshot is taken to remember this value while the weight
13074 -+ * raising is suspended because the queue is merged with a
13075 -+ * shared queue, and is used to set @raising_cur_max_time
13076 -+ * when the queue is split from the shared queue and its
13077 -+ * weight is raised again
13078 -+ * @saved_idle_window: same purpose as the previous field for the idle window
13079 - */
13080 - struct bfq_io_cq {
13081 - struct io_cq icq; /* must be the first member */
13082 - struct bfq_queue *bfqq[2];
13083 - struct bfq_ttime ttime;
13084 - int ioprio;
13085 -+
13086 -+ unsigned int raising_time_left;
13087 -+ unsigned int saved_idle_window;
13088 - };
13089 -
13090 - /**
13091 -@@ -403,6 +417,7 @@ enum bfqq_state_flags {
13092 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
13093 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
13094 - BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
13095 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
13096 - };
13097 -
13098 - #define BFQ_BFQQ_FNS(name) \
13099 -@@ -430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new);
13100 - BFQ_BFQQ_FNS(coop);
13101 - BFQ_BFQQ_FNS(split_coop);
13102 - BFQ_BFQQ_FNS(some_coop_idle);
13103 -+BFQ_BFQQ_FNS(just_split);
13104 - #undef BFQ_BFQQ_FNS
13105 -
13106 - /* Logging facilities. */
13107 ---
13108 -1.8.1.4
13109 -
13110
13111 Added: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
13112 ===================================================================
13113 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch (rev 0)
13114 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-01-30 16:49:47 UTC (rev 2661)
13115 @@ -0,0 +1,1034 @@
13116 +From 3cd9e2ea29c3ba9e420556e8ecf161d166186b63 Mon Sep 17 00:00:00 2001
13117 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
13118 +Date: Thu, 23 Jan 2014 16:54:44 +0100
13119 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7 for
13120 + 3.13.0
13121 +
13122 +A set of processes may happen to perform interleaved reads, i.e., requests
13123 +whose union would give rise to a sequential read pattern. There are two
13124 +typical cases: in the first case, processes read fixed-size chunks of
13125 +data at a fixed distance from each other, while in the second case processes
13126 +may read variable-size chunks at variable distances. The latter case occurs
13127 +for example with KVM, which splits the I/O generated by the guest into
13128 +multiple chunks, and lets these chunks be served by a pool of cooperating
13129 +processes, iteratively assigning the next chunk of I/O to the first
13130 +available process. CFQ uses actual queue merging for the first type of
13131 +processes, whereas it uses preemption to get a sequential read pattern out
13132 +of the read requests performed by the second type of processes. In the end
13133 +it uses two different mechanisms to achieve the same goal: boosting the
13134 +throughput with interleaved I/O.
13135 +
13136 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
13137 +sequential read pattern with both types of processes. The main idea is
13138 +checking newly arrived requests against the next request of the active queue
13139 +both in case of actual request insert and in case of request merge. By doing
13140 +so, both the types of processes can be handled by just merging their queues.
13141 +EQM is then simpler and more compact than the pair of mechanisms used in
13142 +CFQ.
13143 +
13144 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
13145 +properly restoring the weight-raising state of a queue when it gets back to
13146 +a non-merged state.
13147 +
13148 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
13149 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
13150 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
13151 +---
13152 + block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
13153 + block/bfq-sched.c | 28 ---
13154 + block/bfq.h | 16 ++
13155 + 3 files changed, 474 insertions(+), 227 deletions(-)
13156 +
13157 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
13158 +index 7670400..295236e 100644
13159 +--- a/block/bfq-iosched.c
13160 ++++ b/block/bfq-iosched.c
13161 +@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
13162 + return dur;
13163 + }
13164 +
13165 ++static inline void
13166 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
13167 ++{
13168 ++ if (bic->saved_idle_window)
13169 ++ bfq_mark_bfqq_idle_window(bfqq);
13170 ++ else
13171 ++ bfq_clear_bfqq_idle_window(bfqq);
13172 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
13173 ++ /*
13174 ++ * Start a weight raising period with the duration given by
13175 ++ * the raising_time_left snapshot.
13176 ++ */
13177 ++ if (bfq_bfqq_busy(bfqq))
13178 ++ bfqq->bfqd->raised_busy_queues++;
13179 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
13180 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
13181 ++ bfqq->last_rais_start_finish = jiffies;
13182 ++ bfqq->entity.ioprio_changed = 1;
13183 ++ }
13184 ++ /*
13185 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
13186 ++ * getting confused about the queue's need of a weight-raising
13187 ++ * period.
13188 ++ */
13189 ++ bic->raising_time_left = 0;
13190 ++}
13191 ++
13192 ++/*
13193 ++ * Must be called with the queue_lock held.
13194 ++ */
13195 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
13196 ++{
13197 ++ int process_refs, io_refs;
13198 ++
13199 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13200 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13201 ++ BUG_ON(process_refs < 0);
13202 ++ return process_refs;
13203 ++}
13204 ++
13205 + static void bfq_add_rq_rb(struct request *rq)
13206 + {
13207 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
13208 +@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
13209 + if (!bfqd->low_latency)
13210 + goto add_bfqq_busy;
13211 +
13212 ++ if (bfq_bfqq_just_split(bfqq))
13213 ++ goto set_ioprio_changed;
13214 ++
13215 + /*
13216 +- * If the queue is not being boosted and has been idle
13217 +- * for enough time, start a weight-raising period
13218 ++ * If the queue:
13219 ++ * - is not being boosted,
13220 ++ * - has been idle for enough time,
13221 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
13222 ++ * shared "for its nature" or it is not shared and its
13223 ++ * requests have not been redirected to a shared queue)
13224 ++ * start a weight-raising period.
13225 + */
13226 +- if (old_raising_coeff == 1 &&
13227 +- (idle_for_long_time || soft_rt)) {
13228 ++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
13229 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
13230 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
13231 + if (idle_for_long_time)
13232 + bfqq->raising_cur_max_time =
13233 +@@ -572,6 +620,7 @@ static void bfq_add_rq_rb(struct request *rq)
13234 + bfqd->bfq_raising_rt_max_time;
13235 + }
13236 + }
13237 ++set_ioprio_changed:
13238 + if (old_raising_coeff != bfqq->raising_coeff)
13239 + entity->ioprio_changed = 1;
13240 + add_bfqq_busy:
13241 +@@ -754,90 +803,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
13242 + spin_unlock_irq(bfqd->queue->queue_lock);
13243 + }
13244 +
13245 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13246 +- struct bio *bio)
13247 +-{
13248 +- struct bfq_data *bfqd = q->elevator->elevator_data;
13249 +- struct bfq_io_cq *bic;
13250 +- struct bfq_queue *bfqq;
13251 +-
13252 +- /*
13253 +- * Disallow merge of a sync bio into an async request.
13254 +- */
13255 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13256 +- return 0;
13257 +-
13258 +- /*
13259 +- * Lookup the bfqq that this bio will be queued with. Allow
13260 +- * merge only if rq is queued there.
13261 +- * Queue lock is held here.
13262 +- */
13263 +- bic = bfq_bic_lookup(bfqd, current->io_context);
13264 +- if (bic == NULL)
13265 +- return 0;
13266 +-
13267 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13268 +- return bfqq == RQ_BFQQ(rq);
13269 +-}
13270 +-
13271 +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13272 +- struct bfq_queue *bfqq)
13273 +-{
13274 +- if (bfqq != NULL) {
13275 +- bfq_mark_bfqq_must_alloc(bfqq);
13276 +- bfq_mark_bfqq_budget_new(bfqq);
13277 +- bfq_clear_bfqq_fifo_expire(bfqq);
13278 +-
13279 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13280 +-
13281 +- bfq_log_bfqq(bfqd, bfqq,
13282 +- "set_in_service_queue, cur-budget = %lu",
13283 +- bfqq->entity.budget);
13284 +- }
13285 +-
13286 +- bfqd->in_service_queue = bfqq;
13287 +-}
13288 +-
13289 +-/*
13290 +- * Get and set a new queue for service.
13291 +- */
13292 +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
13293 +- struct bfq_queue *bfqq)
13294 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
13295 + {
13296 +- if (!bfqq)
13297 +- bfqq = bfq_get_next_queue(bfqd);
13298 ++ if (request)
13299 ++ return blk_rq_pos(io_struct);
13300 + else
13301 +- bfq_get_next_queue_forced(bfqd, bfqq);
13302 +-
13303 +- __bfq_set_in_service_queue(bfqd, bfqq);
13304 +- return bfqq;
13305 ++ return ((struct bio *)io_struct)->bi_sector;
13306 + }
13307 +
13308 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
13309 +- struct request *rq)
13310 ++static inline sector_t bfq_dist_from(sector_t pos1,
13311 ++ sector_t pos2)
13312 + {
13313 +- if (blk_rq_pos(rq) >= bfqd->last_position)
13314 +- return blk_rq_pos(rq) - bfqd->last_position;
13315 ++ if (pos1 >= pos2)
13316 ++ return pos1 - pos2;
13317 + else
13318 +- return bfqd->last_position - blk_rq_pos(rq);
13319 ++ return pos2 - pos1;
13320 + }
13321 +
13322 +-/*
13323 +- * Return true if bfqq has no request pending and rq is close enough to
13324 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
13325 +- * bfqq->next_rq
13326 +- */
13327 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
13328 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
13329 ++ sector_t sector)
13330 + {
13331 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
13332 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
13333 ++ BFQQ_SEEK_THR;
13334 + }
13335 +
13336 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13337 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
13338 + {
13339 + struct rb_root *root = &bfqd->rq_pos_tree;
13340 + struct rb_node *parent, *node;
13341 + struct bfq_queue *__bfqq;
13342 +- sector_t sector = bfqd->last_position;
13343 +
13344 + if (RB_EMPTY_ROOT(root))
13345 + return NULL;
13346 +@@ -856,7 +850,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13347 + * position).
13348 + */
13349 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
13350 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13351 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13352 + return __bfqq;
13353 +
13354 + if (blk_rq_pos(__bfqq->next_rq) < sector)
13355 +@@ -867,7 +861,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13356 + return NULL;
13357 +
13358 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
13359 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13360 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13361 + return __bfqq;
13362 +
13363 + return NULL;
13364 +@@ -876,14 +870,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13365 + /*
13366 + * bfqd - obvious
13367 + * cur_bfqq - passed in so that we don't decide that the current queue
13368 +- * is closely cooperating with itself.
13369 +- *
13370 +- * We are assuming that cur_bfqq has dispatched at least one request,
13371 +- * and that bfqd->last_position reflects a position on the disk associated
13372 +- * with the I/O issued by cur_bfqq.
13373 ++ * is closely cooperating with itself
13374 ++ * sector - used as a reference point to search for a close queue
13375 + */
13376 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13377 +- struct bfq_queue *cur_bfqq)
13378 ++ struct bfq_queue *cur_bfqq,
13379 ++ sector_t sector)
13380 + {
13381 + struct bfq_queue *bfqq;
13382 +
13383 +@@ -903,7 +895,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13384 + * working closely on the same area of the disk. In that case,
13385 + * we can group them together and don't waste time idling.
13386 + */
13387 +- bfqq = bfqq_close(bfqd);
13388 ++ bfqq = bfqq_close(bfqd, sector);
13389 + if (bfqq == NULL || bfqq == cur_bfqq)
13390 + return NULL;
13391 +
13392 +@@ -930,6 +922,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13393 + return bfqq;
13394 + }
13395 +
13396 ++static struct bfq_queue *
13397 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13398 ++{
13399 ++ int process_refs, new_process_refs;
13400 ++ struct bfq_queue *__bfqq;
13401 ++
13402 ++ /*
13403 ++ * If there are no process references on the new_bfqq, then it is
13404 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13405 ++ * may have dropped their last reference (not just their last process
13406 ++ * reference).
13407 ++ */
13408 ++ if (!bfqq_process_refs(new_bfqq))
13409 ++ return NULL;
13410 ++
13411 ++ /* Avoid a circular list and skip interim queue merges. */
13412 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
13413 ++ if (__bfqq == bfqq)
13414 ++ return NULL;
13415 ++ new_bfqq = __bfqq;
13416 ++ }
13417 ++
13418 ++ process_refs = bfqq_process_refs(bfqq);
13419 ++ new_process_refs = bfqq_process_refs(new_bfqq);
13420 ++ /*
13421 ++ * If the process for the bfqq has gone away, there is no
13422 ++ * sense in merging the queues.
13423 ++ */
13424 ++ if (process_refs == 0 || new_process_refs == 0)
13425 ++ return NULL;
13426 ++
13427 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13428 ++ new_bfqq->pid);
13429 ++
13430 ++ /*
13431 ++ * Merging is just a redirection: the requests of the process owning
13432 ++ * one of the two queues are redirected to the other queue. The latter
13433 ++ * queue, in its turn, is set as shared if this is the first time that
13434 ++ * the requests of some process are redirected to it.
13435 ++ *
13436 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
13437 ++ * are in the context of the process owning bfqq, hence we have the
13438 ++ * io_cq of this process. So we can immediately configure this io_cq
13439 ++ * to redirect the requests of the process to new_bfqq.
13440 ++ *
13441 ++ * NOTE, even if new_bfqq coincides with the in-service queue, the
13442 ++ * io_cq of new_bfqq is not available, because, if the in-service queue
13443 ++ * is shared, bfqd->in_service_bic may not point to the io_cq of the
13444 ++ * in-service queue.
13445 ++ * Redirecting the requests of the process owning bfqq to the currently
13446 ++ * in-service queue is in any case the best option, as we feed the
13447 ++ * in-service queue with new requests close to the last request served
13448 ++ * and, by doing so, hopefully increase the throughput.
13449 ++ */
13450 ++ bfqq->new_bfqq = new_bfqq;
13451 ++ atomic_add(process_refs, &new_bfqq->ref);
13452 ++ return new_bfqq;
13453 ++}
13454 ++
13455 ++/*
13456 ++ * Attempt to schedule a merge of bfqq with the currently in-service queue or
13457 ++ * with a close queue among the scheduled queues.
13458 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
13459 ++ * structure otherwise.
13460 ++ */
13461 ++static struct bfq_queue *
13462 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13463 ++ void *io_struct, bool request)
13464 ++{
13465 ++ struct bfq_queue *in_service_bfqq, *new_bfqq;
13466 ++
13467 ++ if (bfqq->new_bfqq)
13468 ++ return bfqq->new_bfqq;
13469 ++
13470 ++ if (!io_struct)
13471 ++ return NULL;
13472 ++
13473 ++ in_service_bfqq = bfqd->in_service_queue;
13474 ++
13475 ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
13476 ++ !bfqd->in_service_bic)
13477 ++ goto check_scheduled;
13478 ++
13479 ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
13480 ++ goto check_scheduled;
13481 ++
13482 ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
13483 ++ goto check_scheduled;
13484 ++
13485 ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
13486 ++ goto check_scheduled;
13487 ++
13488 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
13489 ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
13490 ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
13491 ++ if (new_bfqq != NULL)
13492 ++ return new_bfqq; /* Merge with the in-service queue */
13493 ++ }
13494 ++
13495 ++ /*
13496 ++ * Check whether there is a cooperator among currently scheduled
13497 ++ * queues. The only thing we need is that the bio/request is not
13498 ++ * NULL, as we need it to establish whether a cooperator exists.
13499 ++ */
13500 ++check_scheduled:
13501 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
13502 ++ bfq_io_struct_pos(io_struct, request));
13503 ++ if (new_bfqq)
13504 ++ return bfq_setup_merge(bfqq, new_bfqq);
13505 ++
13506 ++ return NULL;
13507 ++}
13508 ++
13509 ++static inline void
13510 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
13511 ++{
13512 ++ /*
13513 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
13514 ++ * have already been redirected to a shared queue; both idle window
13515 ++ * and weight raising state have already been saved. Do nothing.
13516 ++ */
13517 ++ if (bfqq->bic == NULL)
13518 ++ return;
13519 ++ if (bfqq->bic->raising_time_left)
13520 ++ /*
13521 ++ * This is the queue of a just-started process, and would
13522 ++ * deserve weight raising: we set raising_time_left to the full
13523 ++ * weight-raising duration to trigger weight-raising when and
13524 ++ * if the queue is split and the first request of the queue
13525 ++ * is enqueued.
13526 ++ */
13527 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
13528 ++ else if (bfqq->raising_coeff > 1) {
13529 ++ unsigned long wrais_duration =
13530 ++ jiffies - bfqq->last_rais_start_finish;
13531 ++ /*
13532 ++ * It may happen that a queue's weight raising period lasts
13533 ++ * longer than its raising_cur_max_time, as weight raising is
13534 ++ * handled only when a request is enqueued or dispatched (it
13535 ++ * does not use any timer). If the weight raising period is
13536 ++ * about to end, don't save it.
13537 ++ */
13538 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
13539 ++ bfqq->bic->raising_time_left = 0;
13540 ++ else
13541 ++ bfqq->bic->raising_time_left =
13542 ++ bfqq->raising_cur_max_time - wrais_duration;
13543 ++ /*
13544 ++ * The bfq_queue is becoming shared or the requests of the
13545 ++ * process owning the queue are being redirected to a shared
13546 ++ * queue. Stop the weight raising period of the queue, as in
13547 ++ * both cases it should not be owned by an interactive or soft
13548 ++ * real-time application.
13549 ++ */
13550 ++ bfq_bfqq_end_raising(bfqq);
13551 ++ } else
13552 ++ bfqq->bic->raising_time_left = 0;
13553 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
13554 ++}
13555 ++
13556 ++static inline void
13557 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
13558 ++{
13559 ++ /*
13560 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
13561 ++ * is about to begin using a shared bfq_queue.
13562 ++ */
13563 ++ if (bfqq->bic)
13564 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
13565 ++}
13566 ++
13567 ++static void
13568 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13569 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13570 ++{
13571 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13572 ++ (long unsigned)new_bfqq->pid);
13573 ++ /* Save weight raising and idle window of the merged queues */
13574 ++ bfq_bfqq_save_state(bfqq);
13575 ++ bfq_bfqq_save_state(new_bfqq);
13576 ++ /*
13577 ++ * Grab a reference to the bic, to prevent it from being destroyed
13578 ++ * before being possibly touched by a bfq_split_bfqq().
13579 ++ */
13580 ++ bfq_get_bic_reference(bfqq);
13581 ++ bfq_get_bic_reference(new_bfqq);
13582 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
13583 ++ bic_set_bfqq(bic, new_bfqq, 1);
13584 ++ bfq_mark_bfqq_coop(new_bfqq);
13585 ++ /*
13586 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
13587 ++ * new_bfqq->bic to NULL. bfqq either:
13588 ++ * - does not belong to any bic any more, and hence bfqq->bic must
13589 ++ * be set to NULL, or
13590 ++ * - is a queue whose owning bics have already been redirected to a
13591 ++ * different queue, hence the queue is destined to not belong to any
13592 ++ * bic soon and bfqq->bic is already NULL (therefore the next
13593 ++ * assignment causes no harm).
13594 ++ */
13595 ++ new_bfqq->bic = NULL;
13596 ++ bfqq->bic = NULL;
13597 ++ bfq_put_queue(bfqq);
13598 ++}
13599 ++
13600 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13601 ++ struct bio *bio)
13602 ++{
13603 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
13604 ++ struct bfq_io_cq *bic;
13605 ++ struct bfq_queue *bfqq, *new_bfqq;
13606 ++
13607 ++ /*
13608 ++ * Disallow merge of a sync bio into an async request.
13609 ++ */
13610 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13611 ++ return 0;
13612 ++
13613 ++ /*
13614 ++ * Lookup the bfqq that this bio will be queued with. Allow
13615 ++ * merge only if rq is queued there.
13616 ++ * Queue lock is held here.
13617 ++ */
13618 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
13619 ++ if (bic == NULL)
13620 ++ return 0;
13621 ++
13622 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13623 ++ /*
13624 ++ * We take advantage of this function to perform an early merge
13625 ++ * of the queues of possible cooperating processes.
13626 ++ */
13627 ++ if (bfqq != NULL) {
13628 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
13629 ++ if (new_bfqq != NULL) {
13630 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
13631 ++ /*
13632 ++ * If we get here, the bio will be queued in the shared queue,
13633 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
13634 ++ * rq can be merged.
13635 ++ */
13636 ++ bfqq = new_bfqq;
13637 ++ }
13638 ++ }
13639 ++
13640 ++ return bfqq == RQ_BFQQ(rq);
13641 ++}
13642 ++
13643 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13644 ++ struct bfq_queue *bfqq)
13645 ++{
13646 ++ if (bfqq != NULL) {
13647 ++ bfq_mark_bfqq_must_alloc(bfqq);
13648 ++ bfq_mark_bfqq_budget_new(bfqq);
13649 ++ bfq_clear_bfqq_fifo_expire(bfqq);
13650 ++
13651 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13652 ++
13653 ++ bfq_log_bfqq(bfqd, bfqq,
13654 ++ "set_in_service_queue, cur-budget = %lu",
13655 ++ bfqq->entity.budget);
13656 ++ }
13657 ++
13658 ++ bfqd->in_service_queue = bfqq;
13659 ++}
13660 ++
13661 ++/*
13662 ++ * Get and set a new queue for service.
13663 ++ */
13664 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
13665 ++{
13666 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
13667 ++
13668 ++ __bfq_set_in_service_queue(bfqd, bfqq);
13669 ++ return bfqq;
13670 ++}
13671 ++
13672 + /*
13673 + * If enough samples have been computed, return the current max budget
13674 + * stored in bfqd, which is dynamically updated according to the
13675 +@@ -1077,63 +1345,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
13676 + return rq;
13677 + }
13678 +
13679 +-/*
13680 +- * Must be called with the queue_lock held.
13681 +- */
13682 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
13683 +-{
13684 +- int process_refs, io_refs;
13685 +-
13686 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13687 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13688 +- BUG_ON(process_refs < 0);
13689 +- return process_refs;
13690 +-}
13691 +-
13692 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13693 +-{
13694 +- int process_refs, new_process_refs;
13695 +- struct bfq_queue *__bfqq;
13696 +-
13697 +- /*
13698 +- * If there are no process references on the new_bfqq, then it is
13699 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13700 +- * may have dropped their last reference (not just their last process
13701 +- * reference).
13702 +- */
13703 +- if (!bfqq_process_refs(new_bfqq))
13704 +- return;
13705 +-
13706 +- /* Avoid a circular list and skip interim queue merges. */
13707 +- while ((__bfqq = new_bfqq->new_bfqq)) {
13708 +- if (__bfqq == bfqq)
13709 +- return;
13710 +- new_bfqq = __bfqq;
13711 +- }
13712 +-
13713 +- process_refs = bfqq_process_refs(bfqq);
13714 +- new_process_refs = bfqq_process_refs(new_bfqq);
13715 +- /*
13716 +- * If the process for the bfqq has gone away, there is no
13717 +- * sense in merging the queues.
13718 +- */
13719 +- if (process_refs == 0 || new_process_refs == 0)
13720 +- return;
13721 +-
13722 +- /*
13723 +- * Merge in the direction of the lesser amount of work.
13724 +- */
13725 +- if (new_process_refs >= process_refs) {
13726 +- bfqq->new_bfqq = new_bfqq;
13727 +- atomic_add(process_refs, &new_bfqq->ref);
13728 +- } else {
13729 +- new_bfqq->new_bfqq = bfqq;
13730 +- atomic_add(new_process_refs, &bfqq->ref);
13731 +- }
13732 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13733 +- new_bfqq->pid);
13734 +-}
13735 +-
13736 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
13737 + {
13738 + struct bfq_entity *entity = &bfqq->entity;
13739 +@@ -1703,7 +1914,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
13740 + */
13741 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13742 + {
13743 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
13744 ++ struct bfq_queue *bfqq;
13745 + struct request *next_rq;
13746 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
13747 +
13748 +@@ -1713,17 +1924,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13749 +
13750 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
13751 +
13752 +- /*
13753 +- * If another queue has a request waiting within our mean seek
13754 +- * distance, let it run. The expire code will check for close
13755 +- * cooperators and put the close queue at the front of the
13756 +- * service tree. If possible, merge the expiring queue with the
13757 +- * new bfqq.
13758 +- */
13759 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
13760 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
13761 +- bfq_setup_merge(bfqq, new_bfqq);
13762 +-
13763 + if (bfq_may_expire_for_budg_timeout(bfqq) &&
13764 + !timer_pending(&bfqd->idle_slice_timer) &&
13765 + !bfq_bfqq_must_idle(bfqq))
13766 +@@ -1760,36 +1960,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13767 + bfq_clear_bfqq_wait_request(bfqq);
13768 + del_timer(&bfqd->idle_slice_timer);
13769 + }
13770 +- if (new_bfqq == NULL)
13771 +- goto keep_queue;
13772 +- else
13773 +- goto expire;
13774 ++ goto keep_queue;
13775 + }
13776 + }
13777 +
13778 + /*
13779 +- * No requests pending. If the in-service queue has no cooperator and
13780 +- * still has requests in flight (possibly waiting for a completion)
13781 +- * or is idling for a new request, then keep it.
13782 ++ * No requests pending. If the in-service queue still has requests in
13783 ++ * flight (possibly waiting for a completion) or is idling for a new
13784 ++ * request, then keep it.
13785 + */
13786 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
13787 +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
13788 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
13789 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
13790 + bfqq = NULL;
13791 + goto keep_queue;
13792 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
13793 +- /*
13794 +- * Expiring the queue because there is a close cooperator,
13795 +- * cancel timer.
13796 +- */
13797 +- bfq_clear_bfqq_wait_request(bfqq);
13798 +- del_timer(&bfqd->idle_slice_timer);
13799 + }
13800 +
13801 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
13802 + expire:
13803 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
13804 + new_queue:
13805 +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
13806 ++ bfqq = bfq_set_in_service_queue(bfqd);
13807 + bfq_log(bfqd, "select_queue: new queue %d returned",
13808 + bfqq != NULL ? bfqq->pid : 0);
13809 + keep_queue:
13810 +@@ -1799,9 +1989,8 @@ keep_queue:
13811 + static void bfq_update_raising_data(struct bfq_data *bfqd,
13812 + struct bfq_queue *bfqq)
13813 + {
13814 ++ struct bfq_entity *entity = &bfqq->entity;
13815 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
13816 +- struct bfq_entity *entity = &bfqq->entity;
13817 +-
13818 + bfq_log_bfqq(bfqd, bfqq,
13819 + "raising period dur %u/%u msec, "
13820 + "old raising coeff %u, w %d(%d)",
13821 +@@ -1818,7 +2007,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13822 + "WARN: pending prio change");
13823 + /*
13824 + * If too much time has elapsed from the beginning
13825 +- * of this weight-raising, stop it.
13826 ++ * of this weight-raising period, stop it.
13827 + */
13828 + if (jiffies - bfqq->last_rais_start_finish >
13829 + bfqq->raising_cur_max_time) {
13830 +@@ -1830,11 +2019,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13831 + jiffies_to_msecs(bfqq->
13832 + raising_cur_max_time));
13833 + bfq_bfqq_end_raising(bfqq);
13834 +- __bfq_entity_update_weight_prio(
13835 +- bfq_entity_service_tree(entity),
13836 +- entity);
13837 + }
13838 + }
13839 ++ /* Update weight both if it must be raised and if it must be lowered */
13840 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
13841 ++ __bfq_entity_update_weight_prio(
13842 ++ bfq_entity_service_tree(entity),
13843 ++ entity);
13844 + }
13845 +
13846 + /*
13847 +@@ -2075,6 +2266,25 @@ static void bfq_init_icq(struct io_cq *icq)
13848 + struct bfq_io_cq *bic = icq_to_bic(icq);
13849 +
13850 + bic->ttime.last_end_request = jiffies;
13851 ++ /*
13852 ++ * A newly created bic indicates that the process has just
13853 ++ * started doing I/O, and is probably mapping into memory its
13854 ++ * executable and libraries: it definitely needs weight raising.
13855 ++ * There is however the possibility that the process performs,
13856 ++ * for a while, I/O close to some other process. EQM intercepts
13857 ++ * this behavior and may merge the queue corresponding to the
13858 ++ * process with some other queue, BEFORE the weight of the queue
13859 ++ * is raised. Merged queues are not weight-raised (they are assumed
13860 ++ * to belong to processes that benefit only from high throughput).
13861 ++ * If the merge is basically the consequence of an accident, then
13862 ++ * the queue will be split soon and will get back its old weight.
13863 ++ * It is then important to write down somewhere that this queue
13864 ++ * does need weight raising, even if it did not make it to get its
13865 ++ * weight raised before being merged. To this purpose, we overload
13866 ++ * the field raising_time_left and assign 1 to it, to mark the queue
13867 ++ * as needing weight raising.
13868 ++ */
13869 ++ bic->raising_time_left = 1;
13870 + }
13871 +
13872 + static void bfq_exit_icq(struct io_cq *icq)
13873 +@@ -2088,6 +2298,13 @@ static void bfq_exit_icq(struct io_cq *icq)
13874 + }
13875 +
13876 + if (bic->bfqq[BLK_RW_SYNC]) {
13877 ++ /*
13878 ++ * If the bic is using a shared queue, put the reference
13879 ++ * taken on the io_context when the bic started using a
13880 ++ * shared bfq_queue.
13881 ++ */
13882 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
13883 ++ put_io_context(icq->ioc);
13884 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
13885 + bic->bfqq[BLK_RW_SYNC] = NULL;
13886 + }
13887 +@@ -2375,6 +2592,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
13888 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
13889 + return;
13890 +
13891 ++ /* Idle window just restored, statistics are meaningless. */
13892 ++ if (bfq_bfqq_just_split(bfqq))
13893 ++ return;
13894 ++
13895 + enable_idle = bfq_bfqq_idle_window(bfqq);
13896 +
13897 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
13898 +@@ -2415,6 +2636,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13899 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
13900 + !BFQQ_SEEKY(bfqq))
13901 + bfq_update_idle_window(bfqd, bfqq, bic);
13902 ++ bfq_clear_bfqq_just_split(bfqq);
13903 +
13904 + bfq_log_bfqq(bfqd, bfqq,
13905 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
13906 +@@ -2475,13 +2697,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13907 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
13908 + {
13909 + struct bfq_data *bfqd = q->elevator->elevator_data;
13910 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
13911 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
13912 +
13913 + assert_spin_locked(bfqd->queue->queue_lock);
13914 ++
13915 ++ /*
13916 ++ * An unplug may trigger a requeue of a request from the device
13917 ++ * driver: make sure we are in process context while trying to
13918 ++ * merge two bfq_queues.
13919 ++ */
13920 ++ if (!in_interrupt()) {
13921 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
13922 ++ if (new_bfqq != NULL) {
13923 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
13924 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
13925 ++ /*
13926 ++ * Release the request's reference to the old bfqq
13927 ++ * and make sure one is taken to the shared queue.
13928 ++ */
13929 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
13930 ++ bfqq->allocated[rq_data_dir(rq)]--;
13931 ++ atomic_inc(&new_bfqq->ref);
13932 ++ bfq_put_queue(bfqq);
13933 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
13934 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
13935 ++ bfqq, new_bfqq);
13936 ++ rq->elv.priv[1] = new_bfqq;
13937 ++ bfqq = new_bfqq;
13938 ++ }
13939 ++ }
13940 ++
13941 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
13942 +
13943 + bfq_add_rq_rb(rq);
13944 +
13945 ++ /*
13946 ++ * Here a newly-created bfq_queue has already started a weight-raising
13947 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
13948 ++ * from assigning it a full weight-raising period. See the detailed
13949 ++ * comments about this field in bfq_init_icq().
13950 ++ */
13951 ++ if (bfqq->bic != NULL)
13952 ++ bfqq->bic->raising_time_left = 0;
13953 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
13954 + list_add_tail(&rq->queuelist, &bfqq->fifo);
13955 +
13956 +@@ -2629,18 +2886,6 @@ static void bfq_put_request(struct request *rq)
13957 + }
13958 + }
13959 +
13960 +-static struct bfq_queue *
13961 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13962 +- struct bfq_queue *bfqq)
13963 +-{
13964 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13965 +- (long unsigned)bfqq->new_bfqq->pid);
13966 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
13967 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
13968 +- bfq_put_queue(bfqq);
13969 +- return bic_to_bfqq(bic, 1);
13970 +-}
13971 +-
13972 + /*
13973 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
13974 + * was the last process referring to said bfqq.
13975 +@@ -2649,6 +2894,9 @@ static struct bfq_queue *
13976 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
13977 + {
13978 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
13979 ++
13980 ++ put_io_context(bic->icq.ioc);
13981 ++
13982 + if (bfqq_process_refs(bfqq) == 1) {
13983 + bfqq->pid = current->pid;
13984 + bfq_clear_bfqq_coop(bfqq);
13985 +@@ -2677,6 +2925,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
13986 + struct bfq_queue *bfqq;
13987 + struct bfq_group *bfqg;
13988 + unsigned long flags;
13989 ++ bool split = false;
13990 +
13991 + might_sleep_if(gfp_mask & __GFP_WAIT);
13992 +
13993 +@@ -2695,24 +2944,14 @@ new_queue:
13994 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
13995 + bic_set_bfqq(bic, bfqq, is_sync);
13996 + } else {
13997 +- /*
13998 +- * If the queue was seeky for too long, break it apart.
13999 +- */
14000 ++ /* If the queue was seeky for too long, break it apart. */
14001 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
14002 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
14003 + bfqq = bfq_split_bfqq(bic, bfqq);
14004 ++ split = true;
14005 + if (!bfqq)
14006 + goto new_queue;
14007 + }
14008 +-
14009 +- /*
14010 +- * Check to see if this queue is scheduled to merge with
14011 +- * another closely cooperating queue. The merging of queues
14012 +- * happens here as it must be done in process context.
14013 +- * The reference on new_bfqq was taken in merge_bfqqs.
14014 +- */
14015 +- if (bfqq->new_bfqq != NULL)
14016 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
14017 + }
14018 +
14019 + bfqq->allocated[rw]++;
14020 +@@ -2723,6 +2962,26 @@ new_queue:
14021 + rq->elv.priv[0] = bic;
14022 + rq->elv.priv[1] = bfqq;
14023 +
14024 ++ /*
14025 ++ * If a bfq_queue has only one process reference, it is owned
14026 ++ * by only one bfq_io_cq: we can set the bic field of the
14027 ++ * bfq_queue to the address of that structure. Also, if the
14028 ++ * queue has just been split, mark a flag so that the
14029 ++ * information is available to the other scheduler hooks.
14030 ++ */
14031 ++ if (bfqq_process_refs(bfqq) == 1) {
14032 ++ bfqq->bic = bic;
14033 ++ if (split) {
14034 ++ bfq_mark_bfqq_just_split(bfqq);
14035 ++ /*
14036 ++ * If the queue has just been split from a shared queue,
14037 ++ * restore the idle window and the possible weight
14038 ++ * raising period.
14039 ++ */
14040 ++ bfq_bfqq_resume_state(bfqq, bic);
14041 ++ }
14042 ++ }
14043 ++
14044 + spin_unlock_irqrestore(q->queue_lock, flags);
14045 +
14046 + return 0;
14047 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
14048 +index 30df81c..47e66a8 100644
14049 +--- a/block/bfq-sched.c
14050 ++++ b/block/bfq-sched.c
14051 +@@ -979,34 +979,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
14052 + return bfqq;
14053 + }
14054 +
14055 +-/*
14056 +- * Forced extraction of the given queue.
14057 +- */
14058 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
14059 +- struct bfq_queue *bfqq)
14060 +-{
14061 +- struct bfq_entity *entity;
14062 +- struct bfq_sched_data *sd;
14063 +-
14064 +- BUG_ON(bfqd->in_service_queue != NULL);
14065 +-
14066 +- entity = &bfqq->entity;
14067 +- /*
14068 +- * Bubble up extraction/update from the leaf to the root.
14069 +- */
14070 +- for_each_entity(entity) {
14071 +- sd = entity->sched_data;
14072 +- bfq_update_budget(entity);
14073 +- bfq_update_vtime(bfq_entity_service_tree(entity));
14074 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
14075 +- sd->active_entity = entity;
14076 +- sd->next_active = NULL;
14077 +- entity->service = 0;
14078 +- }
14079 +-
14080 +- return;
14081 +-}
14082 +-
14083 + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
14084 + {
14085 + if (bfqd->in_service_bic != NULL) {
14086 +diff --git a/block/bfq.h b/block/bfq.h
14087 +index 68b28e3..438f560 100644
14088 +--- a/block/bfq.h
14089 ++++ b/block/bfq.h
14090 +@@ -192,6 +192,8 @@ struct bfq_group;
14091 + * idle to backlogged
14092 + * @service_from_backlogged: cumulative service received from the @bfq_queue
14093 + * since the last transition from idle to backlogged
14094 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
14095 ++ * queue is shared
14096 + *
14097 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
14098 + * or more (if it is an async one). @cgroup holds a reference to the
14099 +@@ -235,6 +237,7 @@ struct bfq_queue {
14100 + sector_t last_request_pos;
14101 +
14102 + pid_t pid;
14103 ++ struct bfq_io_cq *bic;
14104 +
14105 + /* weight-raising fields */
14106 + unsigned int raising_cur_max_time;
14107 +@@ -264,12 +267,23 @@ struct bfq_ttime {
14108 + * @icq: associated io_cq structure
14109 + * @bfqq: array of two process queues, the sync and the async
14110 + * @ttime: associated @bfq_ttime struct
14111 ++ * @raising_time_left: snapshot of the time left before weight raising ends
14112 ++ * for the sync queue associated to this process; this
14113 ++ * snapshot is taken to remember this value while the weight
14114 ++ * raising is suspended because the queue is merged with a
14115 ++ * shared queue, and is used to set @raising_cur_max_time
14116 ++ * when the queue is split from the shared queue and its
14117 ++ * weight is raised again
14118 ++ * @saved_idle_window: same purpose as the previous field for the idle window
14119 + */
14120 + struct bfq_io_cq {
14121 + struct io_cq icq; /* must be the first member */
14122 + struct bfq_queue *bfqq[2];
14123 + struct bfq_ttime ttime;
14124 + int ioprio;
14125 ++
14126 ++ unsigned int raising_time_left;
14127 ++ unsigned int saved_idle_window;
14128 + };
14129 +
14130 + /**
14131 +@@ -411,6 +425,7 @@ enum bfqq_state_flags {
14132 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
14133 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
14134 + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
14135 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
14136 + BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
14137 + };
14138 +
14139 +@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
14140 + BFQ_BFQQ_FNS(budget_new);
14141 + BFQ_BFQQ_FNS(coop);
14142 + BFQ_BFQQ_FNS(split_coop);
14143 ++BFQ_BFQQ_FNS(just_split);
14144 + BFQ_BFQQ_FNS(softrt_update);
14145 + #undef BFQ_BFQQ_FNS
14146 +
14147 +--
14148 +1.8.5.2
14149 +
14150
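Note on the last two hunks of the added patch: the new just_split state is wired up with only an enum entry plus a single BFQ_BFQQ_FNS(just_split) line, yet the set_request path above calls bfq_mark_bfqq_just_split(). The macro that makes this work is defined elsewhere in block/bfq.h and is not quoted in the hunks; the sketch below is a self-contained illustration that assumes the usual one-bit-per-state-flag accessor pattern, with a minimal stand-in struct and an illustrative bit position.

/*
 * Illustrative sketch, not the genpatches source: the real BFQ_BFQQ_FNS()
 * definition lives in block/bfq.h and is not quoted above.  The struct and
 * the bit position here are minimal stand-ins for illustration only.
 */
struct bfq_queue {			/* stand-in: the real struct is far larger */
	unsigned int flags;
};

enum bfqq_state_flags {
	BFQ_BFQQ_FLAG_just_split,	/* bit position illustrative only */
};

#define BFQ_BFQQ_FNS(name)						\
static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	bfqq->flags |= 1U << BFQ_BFQQ_FLAG_##name;			\
}									\
static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)	\
{									\
	bfqq->flags &= ~(1U << BFQ_BFQQ_FLAG_##name);			\
}									\
static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return (bfqq->flags & (1U << BFQ_BFQQ_FLAG_##name)) != 0;	\
}

/* One invocation generates mark/clear/test helpers for the new flag. */
BFQ_BFQQ_FNS(just_split)

Under this assumed pattern, BFQ_BFQQ_FNS(just_split) expands to bfq_mark_bfqq_just_split(), bfq_clear_bfqq_just_split() and bfq_bfqq_just_split(), which is why the two added lines in bfq.h are all the hook code above needs.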
14151 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
14152 ===================================================================
14153 --- genpatches-2.6/trunk/3.14/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch 2014-01-29 14:41:45 UTC (rev 2660)
14154 +++ genpatches-2.6/trunk/3.14/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch 2014-01-30 16:49:47 UTC (rev 2661)
14155 @@ -1,362 +0,0 @@
14156 -From 2e1646d06515b7dd1344db547dfcf9a4640dee8e Mon Sep 17 00:00:00 2001
14157 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
14158 -Date: Wed, 11 Sep 2013 22:26:47 +0200
14159 -Subject: [PATCH] block: Switch from BFQ-v6r2 for 3.11.0 to BFQ-v6r2 for
14160 - 3.12.0-rc1
14161 -
14162 ----
14163 - block/bfq-cgroup.c | 115 +++++++++++++++++++++++++++++++----------------------
14164 - block/bfq.h | 2 +
14165 - 2 files changed, 70 insertions(+), 47 deletions(-)
14166 -
14167 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
14168 -index bb9b851..afae4ca 100644
14169 ---- a/block/bfq-cgroup.c
14170 -+++ b/block/bfq-cgroup.c
14171 -@@ -16,9 +16,9 @@
14172 -
14173 - static DEFINE_MUTEX(bfqio_mutex);
14174 -
14175 --static bool bfqio_is_removed(struct cgroup *cgroup)
14176 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
14177 - {
14178 -- return test_bit(CGRP_DEAD, &cgroup->flags);
14179 -+ return bgrp ? !bgrp->online : false;
14180 - }
14181 -
14182 - static struct bfqio_cgroup bfqio_root_cgroup = {
14183 -@@ -38,10 +38,9 @@ static inline void bfq_init_entity(struct bfq_entity *entity,
14184 - entity->sched_data = &bfqg->sched_data;
14185 - }
14186 -
14187 --static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
14188 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
14189 - {
14190 -- return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
14191 -- struct bfqio_cgroup, css);
14192 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
14193 - }
14194 -
14195 - /*
14196 -@@ -103,20 +102,20 @@ static inline void bfq_group_set_parent(struct bfq_group *bfqg,
14197 - /**
14198 - * bfq_group_chain_alloc - allocate a chain of groups.
14199 - * @bfqd: queue descriptor.
14200 -- * @cgroup: the leaf cgroup this chain starts from.
14201 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
14202 - *
14203 - * Allocate a chain of groups starting from the one belonging to
14204 - * @cgroup up to the root cgroup. Stop if a cgroup on the chain
14205 - * to the root has already an allocated group on @bfqd.
14206 - */
14207 - static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
14208 -- struct cgroup *cgroup)
14209 -+ struct cgroup_subsys_state *css)
14210 - {
14211 - struct bfqio_cgroup *bgrp;
14212 - struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
14213 -
14214 -- for (; cgroup != NULL; cgroup = cgroup->parent) {
14215 -- bgrp = cgroup_to_bfqio(cgroup);
14216 -+ for (; css != NULL; css = css->parent) {
14217 -+ bgrp = css_to_bfqio(css);
14218 -
14219 - bfqg = bfqio_lookup_group(bgrp, bfqd);
14220 - if (bfqg != NULL) {
14221 -@@ -165,7 +164,7 @@ cleanup:
14222 - /**
14223 - * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
14224 - * @bfqd: the queue descriptor.
14225 -- * @cgroup: the leaf cgroup to start from.
14226 -+ * @css: the leaf cgroup_subsys_state to start from.
14227 - * @leaf: the leaf group (to be associated to @cgroup).
14228 - *
14229 - * Try to link a chain of groups to a cgroup hierarchy, connecting the
14230 -@@ -177,7 +176,8 @@ cleanup:
14231 - * per device) while the bfqio_cgroup lock protects the list of groups
14232 - * belonging to the same cgroup.
14233 - */
14234 --static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14235 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
14236 -+ struct cgroup_subsys_state *css,
14237 - struct bfq_group *leaf)
14238 - {
14239 - struct bfqio_cgroup *bgrp;
14240 -@@ -186,8 +186,8 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14241 -
14242 - assert_spin_locked(bfqd->queue->queue_lock);
14243 -
14244 -- for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
14245 -- bgrp = cgroup_to_bfqio(cgroup);
14246 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
14247 -+ bgrp = css_to_bfqio(css);
14248 - next = leaf->bfqd;
14249 -
14250 - bfqg = bfqio_lookup_group(bgrp, bfqd);
14251 -@@ -205,9 +205,9 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14252 - leaf = next;
14253 - }
14254 -
14255 -- BUG_ON(cgroup == NULL && leaf != NULL);
14256 -- if (cgroup != NULL && prev != NULL) {
14257 -- bgrp = cgroup_to_bfqio(cgroup);
14258 -+ BUG_ON(css == NULL && leaf != NULL);
14259 -+ if (css != NULL && prev != NULL) {
14260 -+ bgrp = css_to_bfqio(css);
14261 - bfqg = bfqio_lookup_group(bgrp, bfqd);
14262 - bfq_group_set_parent(prev, bfqg);
14263 - }
14264 -@@ -233,18 +233,18 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14265 - * have been successful.
14266 - */
14267 - static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
14268 -- struct cgroup *cgroup)
14269 -+ struct cgroup_subsys_state *css)
14270 - {
14271 -- struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
14272 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14273 - struct bfq_group *bfqg;
14274 -
14275 - bfqg = bfqio_lookup_group(bgrp, bfqd);
14276 - if (bfqg != NULL)
14277 - return bfqg;
14278 -
14279 -- bfqg = bfq_group_chain_alloc(bfqd, cgroup);
14280 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
14281 - if (bfqg != NULL)
14282 -- bfq_group_chain_link(bfqd, cgroup, bfqg);
14283 -+ bfq_group_chain_link(bfqd, css, bfqg);
14284 - else
14285 - bfqg = bfqd->root_group;
14286 -
14287 -@@ -315,8 +315,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14288 - * time here, at the price of slightly more complex code.
14289 - */
14290 - static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14291 -- struct bfq_io_cq *bic,
14292 -- struct cgroup *cgroup)
14293 -+ struct bfq_io_cq *bic,
14294 -+ struct cgroup_subsys_state *css)
14295 - {
14296 - struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
14297 - struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
14298 -@@ -324,9 +324,9 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14299 - struct bfq_group *bfqg;
14300 - struct bfqio_cgroup *bgrp;
14301 -
14302 -- bgrp = cgroup_to_bfqio(cgroup);
14303 -+ bgrp = css_to_bfqio(css);
14304 -
14305 -- bfqg = bfq_find_alloc_group(bfqd, cgroup);
14306 -+ bfqg = bfq_find_alloc_group(bfqd, css);
14307 - if (async_bfqq != NULL) {
14308 - entity = &async_bfqq->entity;
14309 -
14310 -@@ -357,14 +357,14 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14311 - * moved into its new parent group.
14312 - */
14313 - static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
14314 -- struct cgroup *cgroup)
14315 -+ struct cgroup_subsys_state *css)
14316 - {
14317 - struct bfq_data *bfqd;
14318 - unsigned long uninitialized_var(flags);
14319 -
14320 - bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
14321 - if (bfqd != NULL) {
14322 -- __bfq_bic_change_cgroup(bfqd, bic, cgroup);
14323 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
14324 - bfq_put_bfqd_unlock(bfqd, &flags);
14325 - }
14326 - }
14327 -@@ -394,13 +394,13 @@ static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
14328 - {
14329 - struct bfq_data *bfqd = bic_to_bfqd(bic);
14330 - struct bfq_group *bfqg;
14331 -- struct cgroup *cgroup;
14332 -+ struct cgroup_subsys_state *css;
14333 -
14334 - BUG_ON(bfqd == NULL);
14335 -
14336 - rcu_read_lock();
14337 -- cgroup = task_cgroup(current, bfqio_subsys_id);
14338 -- bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
14339 -+ css = task_css(current, bfqio_subsys_id);
14340 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
14341 - rcu_read_unlock();
14342 -
14343 - return bfqg;
14344 -@@ -622,17 +622,16 @@ static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
14345 - }
14346 -
14347 - #define SHOW_FUNCTION(__VAR) \
14348 --static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
14349 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
14350 - struct cftype *cftype) \
14351 - { \
14352 -- struct bfqio_cgroup *bgrp; \
14353 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
14354 - u64 ret = -ENODEV; \
14355 - \
14356 - mutex_lock(&bfqio_mutex); \
14357 -- if (bfqio_is_removed(cgroup)) \
14358 -+ if (bfqio_is_removed(bgrp)) \
14359 - goto out_unlock; \
14360 - \
14361 -- bgrp = cgroup_to_bfqio(cgroup); \
14362 - spin_lock_irq(&bgrp->lock); \
14363 - ret = bgrp->__VAR; \
14364 - spin_unlock_irq(&bgrp->lock); \
14365 -@@ -648,11 +647,11 @@ SHOW_FUNCTION(ioprio_class);
14366 - #undef SHOW_FUNCTION
14367 -
14368 - #define STORE_FUNCTION(__VAR, __MIN, __MAX) \
14369 --static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
14370 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
14371 - struct cftype *cftype, \
14372 - u64 val) \
14373 - { \
14374 -- struct bfqio_cgroup *bgrp; \
14375 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
14376 - struct bfq_group *bfqg; \
14377 - int ret = -EINVAL; \
14378 - \
14379 -@@ -661,12 +660,10 @@ static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
14380 - \
14381 - ret = -ENODEV; \
14382 - mutex_lock(&bfqio_mutex); \
14383 -- if (bfqio_is_removed(cgroup)) \
14384 -+ if (bfqio_is_removed(bgrp)) \
14385 - goto out_unlock; \
14386 - ret = 0; \
14387 - \
14388 -- bgrp = cgroup_to_bfqio(cgroup); \
14389 -- \
14390 - spin_lock_irq(&bgrp->lock); \
14391 - bgrp->__VAR = (unsigned short)val; \
14392 - hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
14393 -@@ -713,11 +710,11 @@ static struct cftype bfqio_files[] = {
14394 - { }, /* terminate */
14395 - };
14396 -
14397 --static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
14398 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state *parent_css)
14399 - {
14400 - struct bfqio_cgroup *bgrp;
14401 -
14402 -- if (cgroup->parent != NULL) {
14403 -+ if (parent_css != NULL) {
14404 - bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
14405 - if (bgrp == NULL)
14406 - return ERR_PTR(-ENOMEM);
14407 -@@ -740,13 +737,14 @@ static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
14408 - * behavior is that a group containing a task that forked using CLONE_IO
14409 - * will not be destroyed until the tasks sharing the ioc die.
14410 - */
14411 --static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14412 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
14413 -+ struct cgroup_taskset *tset)
14414 - {
14415 - struct task_struct *task;
14416 - struct io_context *ioc;
14417 - int ret = 0;
14418 -
14419 -- cgroup_taskset_for_each(task, cgroup, tset) {
14420 -+ cgroup_taskset_for_each(task, css, tset) {
14421 - /* task_lock() is needed to avoid races with exit_io_context() */
14422 - task_lock(task);
14423 - ioc = task->io_context;
14424 -@@ -766,7 +764,8 @@ static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14425 - return ret;
14426 - }
14427 -
14428 --static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14429 -+static void bfqio_attach(struct cgroup_subsys_state *css,
14430 -+ struct cgroup_taskset *tset)
14431 - {
14432 - struct task_struct *task;
14433 - struct io_context *ioc;
14434 -@@ -776,7 +775,7 @@ static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14435 - * IMPORTANT NOTE: The move of more than one process at a time to a
14436 - * new group has not yet been tested.
14437 - */
14438 -- cgroup_taskset_for_each(task, cgroup, tset) {
14439 -+ cgroup_taskset_for_each(task, css, tset) {
14440 - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
14441 - if (ioc) {
14442 - /*
14443 -@@ -787,16 +786,16 @@ static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14444 - if (!strncmp(icq->q->elevator->type->elevator_name,
14445 - "bfq", ELV_NAME_MAX))
14446 - bfq_bic_change_cgroup(icq_to_bic(icq),
14447 -- cgroup);
14448 -+ css);
14449 - rcu_read_unlock();
14450 - put_io_context(ioc);
14451 - }
14452 - }
14453 - }
14454 -
14455 --static void bfqio_destroy(struct cgroup *cgroup)
14456 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
14457 - {
14458 -- struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
14459 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14460 - struct hlist_node *tmp;
14461 - struct bfq_group *bfqg;
14462 -
14463 -@@ -815,9 +814,31 @@ static void bfqio_destroy(struct cgroup *cgroup)
14464 - kfree(bgrp);
14465 - }
14466 -
14467 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
14468 -+{
14469 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14470 -+
14471 -+ mutex_lock(&bfqio_mutex);
14472 -+ bgrp->online = true;
14473 -+ mutex_unlock(&bfqio_mutex);
14474 -+
14475 -+ return 0;
14476 -+}
14477 -+
14478 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
14479 -+{
14480 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14481 -+
14482 -+ mutex_lock(&bfqio_mutex);
14483 -+ bgrp->online = false;
14484 -+ mutex_unlock(&bfqio_mutex);
14485 -+}
14486 -+
14487 - struct cgroup_subsys bfqio_subsys = {
14488 - .name = "bfqio",
14489 - .css_alloc = bfqio_create,
14490 -+ .css_online = bfqio_css_online,
14491 -+ .css_offline = bfqio_css_offline,
14492 - .can_attach = bfqio_can_attach,
14493 - .attach = bfqio_attach,
14494 - .css_free = bfqio_destroy,
14495 -diff --git a/block/bfq.h b/block/bfq.h
14496 -index bb52975..885e62c 100644
14497 ---- a/block/bfq.h
14498 -+++ b/block/bfq.h
14499 -@@ -510,6 +510,7 @@ struct bfq_group {
14500 - /**
14501 - * struct bfqio_cgroup - bfq cgroup data structure.
14502 - * @css: subsystem state for bfq in the containing cgroup.
14503 -+ * @online: flag marked when the subsystem is inserted.
14504 - * @weight: cgroup weight.
14505 - * @ioprio: cgroup ioprio.
14506 - * @ioprio_class: cgroup ioprio_class.
14507 -@@ -521,6 +522,7 @@ struct bfq_group {
14508 - */
14509 - struct bfqio_cgroup {
14510 - struct cgroup_subsys_state css;
14511 -+ bool online;
14512 -
14513 - unsigned short weight, ioprio, ioprio_class;
14514 -
14515 ---
14516 -1.8.1.4
14517 -
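The cgroup attribute handlers touched by this removed compatibility patch are generated by the SHOW_FUNCTION()/STORE_FUNCTION() macros shown in the hunks above. To make the cgroup-to-css conversion concrete, here is roughly what SHOW_FUNCTION(weight) expands to after the change; the out_unlock tail is not visible in the quoted hunk, so its unlock-and-return shape is assumed.

/*
 * Approximate expansion of SHOW_FUNCTION(weight) inside block/bfq-cgroup.c
 * after the cgroup -> cgroup_subsys_state conversion.  The out_unlock tail
 * is not quoted in the hunk above and is assumed here.
 */
static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
				    struct cftype *cftype)
{
	/* css_to_bfqio() now maps the subsystem state directly ... */
	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
	u64 ret = -ENODEV;

	mutex_lock(&bfqio_mutex);
	/*
	 * ... and bfqio_is_removed() tests the bgrp->online flag kept up to
	 * date by the new css_online/css_offline callbacks, replacing the
	 * old CGRP_DEAD check on struct cgroup.
	 */
	if (bfqio_is_removed(bgrp))
		goto out_unlock;

	spin_lock_irq(&bgrp->lock);
	ret = bgrp->weight;
	spin_unlock_irq(&bgrp->lock);

out_unlock:
	mutex_unlock(&bfqio_mutex);
	return ret;
}

The design point of the removed patch is visible here: since 3.12 the handlers receive a cgroup_subsys_state rather than a struct cgroup, so liveness has to be tracked by the subsystem itself via the online flag instead of being read off the cgroup's own flags.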