Gentoo Archives: gentoo-commits

From: "Tom Wijsman (tomwij)" <tomwij@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2666 - in genpatches-2.6/trunk: 3.13 3.14
Date: Fri, 07 Feb 2014 15:42:40
Message-Id: 20140207154235.ECCAA2004C@flycatcher.gentoo.org
1 Author: tomwij
2 Date: 2014-02-07 15:42:35 +0000 (Fri, 07 Feb 2014)
3 New Revision: 2666
4
5 Added:
6 genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
7 genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
8 genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
9 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
10 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
11 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
12 Removed:
13 genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
14 genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
15 genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
16 Modified:
17 genpatches-2.6/trunk/3.13/0000_README
18 genpatches-2.6/trunk/3.14/0000_README
19 Log:
20 Updated experimental BFQ patches to new revision v7r1.
21
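Once a kernel built from these patches has CONFIG_IOSCHED_BFQ enabled, bfq becomes selectable through the usual per-device elevator interface in sysfs. As a minimal, illustrative check (not part of this commit; the device name sda is only an example), a small C program can print the scheduler list, in which the active elevator appears in square brackets:

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

        if (!f)
                return 1;
        /* Single line such as "noop deadline cfq [bfq]". */
        if (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}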
22 Modified: genpatches-2.6/trunk/3.13/0000_README
23 ===================================================================
24 --- genpatches-2.6/trunk/3.13/0000_README 2014-02-07 14:46:59 UTC (rev 2665)
25 +++ genpatches-2.6/trunk/3.13/0000_README 2014-02-07 15:42:35 UTC (rev 2666)
26 @@ -91,17 +91,17 @@
27 From: Tom Wijsman <TomWij@g.o>
28 Desc: Add Gentoo Linux support config settings and defaults.
29
30 -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7-3.13.patch
31 +Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r1-3.13.patch
32 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
33 -Desc: BFQ v7 patch 1 for 3.13: Build, cgroups and kconfig bits
34 +Desc: BFQ v7r1 patch 1 for 3.13: Build, cgroups and kconfig bits
35
36 -Patch: 5000_BFQ-2-block-introduce-the-v7-I-O-sched-for-3.13.patch1
37 +Patch: 5000_BFQ-2-block-introduce-the-v7r1-I-O-sched-for-3.13.patch1
38 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
39 -Desc: BFQ v7 patch 2 for 3.13: BFQ Scheduler
40 +Desc: BFQ v7r1 patch 2 for 3.13: BFQ Scheduler
41
42 -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7-for-3.13.0.patch
43 +Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r1-for-3.13.0.patch
44 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
45 -Desc: BFQ v7 patch 3 for 3.13: Early Queue Merge (EQM)
46 +Desc: BFQ v7r1 patch 3 for 3.13: Early Queue Merge (EQM)
47
48 Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
49 From: https://github.com/graysky2/kernel_gcc_patch/
50
51 Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
52 ===================================================================
53 --- genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-02-07 14:46:59 UTC (rev 2665)
54 +++ genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666)
55 @@ -1,104 +0,0 @@
56 -From 7f029ed2a02bea57b791c032d6242129c3372a84 Mon Sep 17 00:00:00 2001
57 -From: Paolo Valente <paolo.valente@×××××××.it>
58 -Date: Tue, 3 Sep 2013 16:50:42 +0200
59 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7-3.13
60 -
61 -Update Kconfig.iosched and do the related Makefile changes to include
62 -kernel configuration options for BFQ. Also add the bfqio controller
63 -to the cgroups subsystem.
64 -
65 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
66 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
67 ----
68 - block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
69 - block/Makefile | 1 +
70 - include/linux/cgroup_subsys.h | 4 ++++
71 - 3 files changed, 37 insertions(+)
72 -
73 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
74 -index 421bef9..8f552ba 100644
75 ---- a/block/Kconfig.iosched
76 -+++ b/block/Kconfig.iosched
77 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
78 - ---help---
79 - Enable group IO scheduling in CFQ.
80 -
81 -+config IOSCHED_BFQ
82 -+ tristate "BFQ I/O scheduler"
83 -+ default n
84 -+ ---help---
85 -+ The BFQ I/O scheduler tries to distribute bandwidth among
86 -+ all processes according to their weights.
87 -+ It aims at distributing the bandwidth as desired, independently of
88 -+ the disk parameters and with any workload. It also tries to
89 -+ guarantee low latency to interactive and soft real-time
90 -+ applications. If compiled built-in (saying Y here), BFQ can
91 -+ be configured to support hierarchical scheduling.
92 -+
93 -+config CGROUP_BFQIO
94 -+ bool "BFQ hierarchical scheduling support"
95 -+ depends on CGROUPS && IOSCHED_BFQ=y
96 -+ default n
97 -+ ---help---
98 -+ Enable hierarchical scheduling in BFQ, using the cgroups
99 -+ filesystem interface. The name of the subsystem will be
100 -+ bfqio.
101 -+
102 - choice
103 - prompt "Default I/O scheduler"
104 - default DEFAULT_CFQ
105 -@@ -52,6 +73,16 @@ choice
106 - config DEFAULT_CFQ
107 - bool "CFQ" if IOSCHED_CFQ=y
108 -
109 -+ config DEFAULT_BFQ
110 -+ bool "BFQ" if IOSCHED_BFQ=y
111 -+ help
112 -+ Selects BFQ as the default I/O scheduler which will be
113 -+ used by default for all block devices.
114 -+ The BFQ I/O scheduler aims at distributing the bandwidth
115 -+ as desired, independently of the disk parameters and with
116 -+ any workload. It also tries to guarantee low latency to
117 -+ interactive and soft real-time applications.
118 -+
119 - config DEFAULT_NOOP
120 - bool "No-op"
121 -
122 -@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
123 - string
124 - default "deadline" if DEFAULT_DEADLINE
125 - default "cfq" if DEFAULT_CFQ
126 -+ default "bfq" if DEFAULT_BFQ
127 - default "noop" if DEFAULT_NOOP
128 -
129 - endmenu
130 -diff --git a/block/Makefile b/block/Makefile
131 -index 20645e8..cbd83fb 100644
132 ---- a/block/Makefile
133 -+++ b/block/Makefile
134 -@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
135 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
136 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
137 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
138 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
139 -
140 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
141 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
142 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
143 -index b613ffd..43c5dc9 100644
144 ---- a/include/linux/cgroup_subsys.h
145 -+++ b/include/linux/cgroup_subsys.h
146 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
147 - SUBSYS(blkio)
148 - #endif
149 -
150 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
151 -+SUBSYS(bfqio)
152 -+#endif
153 -+
154 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
155 - SUBSYS(perf)
156 - #endif
157 ---
158 -1.8.5.2
159 -
160
161 Added: genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
162 ===================================================================
163 --- genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch (rev 0)
164 +++ genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666)
165 @@ -0,0 +1,104 @@
166 +From ae1b820a5286601aa9d5426459f8f3de658342b4 Mon Sep 17 00:00:00 2001
167 +From: Paolo Valente <paolo.valente@×××××××.it>
168 +Date: Tue, 3 Sep 2013 16:50:42 +0200
169 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r1-3.13
170 +
171 +Update Kconfig.iosched and do the related Makefile changes to include
172 +kernel configuration options for BFQ. Also add the bfqio controller
173 +to the cgroups subsystem.
174 +
175 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
176 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
177 +---
178 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
179 + block/Makefile | 1 +
180 + include/linux/cgroup_subsys.h | 4 ++++
181 + 3 files changed, 37 insertions(+)
182 +
183 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
184 +index 421bef9..8f552ba 100644
185 +--- a/block/Kconfig.iosched
186 ++++ b/block/Kconfig.iosched
187 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
188 + ---help---
189 + Enable group IO scheduling in CFQ.
190 +
191 ++config IOSCHED_BFQ
192 ++ tristate "BFQ I/O scheduler"
193 ++ default n
194 ++ ---help---
195 ++ The BFQ I/O scheduler tries to distribute bandwidth among
196 ++ all processes according to their weights.
197 ++ It aims at distributing the bandwidth as desired, independently of
198 ++ the disk parameters and with any workload. It also tries to
199 ++ guarantee low latency to interactive and soft real-time
200 ++ applications. If compiled built-in (saying Y here), BFQ can
201 ++ be configured to support hierarchical scheduling.
202 ++
203 ++config CGROUP_BFQIO
204 ++ bool "BFQ hierarchical scheduling support"
205 ++ depends on CGROUPS && IOSCHED_BFQ=y
206 ++ default n
207 ++ ---help---
208 ++ Enable hierarchical scheduling in BFQ, using the cgroups
209 ++ filesystem interface. The name of the subsystem will be
210 ++ bfqio.
211 ++
212 + choice
213 + prompt "Default I/O scheduler"
214 + default DEFAULT_CFQ
215 +@@ -52,6 +73,16 @@ choice
216 + config DEFAULT_CFQ
217 + bool "CFQ" if IOSCHED_CFQ=y
218 +
219 ++ config DEFAULT_BFQ
220 ++ bool "BFQ" if IOSCHED_BFQ=y
221 ++ help
222 ++ Selects BFQ as the default I/O scheduler which will be
223 ++ used by default for all block devices.
224 ++ The BFQ I/O scheduler aims at distributing the bandwidth
225 ++ as desired, independently of the disk parameters and with
226 ++ any workload. It also tries to guarantee low latency to
227 ++ interactive and soft real-time applications.
228 ++
229 + config DEFAULT_NOOP
230 + bool "No-op"
231 +
232 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
233 + string
234 + default "deadline" if DEFAULT_DEADLINE
235 + default "cfq" if DEFAULT_CFQ
236 ++ default "bfq" if DEFAULT_BFQ
237 + default "noop" if DEFAULT_NOOP
238 +
239 + endmenu
240 +diff --git a/block/Makefile b/block/Makefile
241 +index 20645e8..cbd83fb 100644
242 +--- a/block/Makefile
243 ++++ b/block/Makefile
244 +@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
245 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
246 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
247 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
248 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
249 +
250 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
251 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
252 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
253 +index b613ffd..43c5dc9 100644
254 +--- a/include/linux/cgroup_subsys.h
255 ++++ b/include/linux/cgroup_subsys.h
256 +@@ -39,6 +39,10 @@ SUBSYS(net_cls)
257 + SUBSYS(blkio)
258 + #endif
259 +
260 ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
261 ++SUBSYS(bfqio)
262 ++#endif
263 ++
264 + #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
265 + SUBSYS(perf)
266 + #endif
267 +--
268 +1.8.5.2
269 +
270
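The CONFIG_CGROUP_BFQIO option added above exposes a cgroup controller named bfqio; its per-group files (weight, ioprio and ioprio_class, defined in bfq-cgroup.c in the BFQ-2 patch below) control how BFQ shares the disk between groups. Purely as a sketch, assuming the controller is mounted at /sys/fs/cgroup/bfqio, that a group "mygroup" already exists there, and that the files carry the usual subsystem-name prefix, the group weight could be set from user space like this (the value 100 is arbitrary; the accepted range is bounded by BFQ_MIN_WEIGHT and BFQ_MAX_WEIGHT in the scheduler patch):

#include <stdio.h>

int main(void)
{
        /* Hypothetical path: mount point and group name are assumptions. */
        FILE *f = fopen("/sys/fs/cgroup/bfqio/mygroup/bfqio.weight", "w");

        if (!f)
                return 1;
        fprintf(f, "100\n");
        return fclose(f) ? 1 : 0;
}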
271 Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
272 ===================================================================
273 --- genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-02-07 14:46:59 UTC (rev 2665)
274 +++ genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666)
275 @@ -1,6008 +0,0 @@
276 -From 3747f129106ce58fbad1b8f05cc836a6addd8588 Mon Sep 17 00:00:00 2001
277 -From: Paolo Valente <paolo.valente@×××××××.it>
278 -Date: Thu, 9 May 2013 19:10:02 +0200
279 -Subject: [PATCH 2/3] block: introduce the BFQ-v7 I/O sched for 3.13
280 -
281 -Add the BFQ-v7 I/O scheduler to 3.13.
282 -The general structure is borrowed from CFQ, as much of the code for
283 -handling I/O contexts. Over time, several useful features have been
284 -ported from CFQ as well (details in the changelog in README.BFQ). A
285 -(bfq_)queue is associated to each task doing I/O on a device, and each
286 -time a scheduling decision has to be made a queue is selected and served
287 -until it expires.
288 -
289 - - Slices are given in the service domain: tasks are assigned
290 - budgets, measured in number of sectors. Once got the disk, a task
291 - must however consume its assigned budget within a configurable
292 - maximum time (by default, the maximum possible value of the
293 - budgets is automatically computed to comply with this timeout).
294 - This allows the desired latency vs "throughput boosting" tradeoff
295 - to be set.
296 -
297 - - Budgets are scheduled according to a variant of WF2Q+, implemented
298 - using an augmented rb-tree to take eligibility into account while
299 - preserving an O(log N) overall complexity.
300 -
301 - - A low-latency tunable is provided; if enabled, both interactive
302 - and soft real-time applications are guaranteed a very low latency.
303 -
304 - - Latency guarantees are preserved also in the presence of NCQ.
305 -
306 - - Also with flash-based devices, a high throughput is achieved
307 - while still preserving latency guarantees.
308 -
309 - - BFQ features Early Queue Merge (EQM), a sort of fusion of the
310 - cooperating-queue-merging and the preemption mechanisms present
311 - in CFQ. EQM is in fact a unified mechanism that tries to get a
312 - sequential read pattern, and hence a high throughput, with any
313 - set of processes performing interleaved I/O over a contiguous
314 - sequence of sectors.
315 -
316 - - BFQ supports full hierarchical scheduling, exporting a cgroups
317 - interface. Since each node has a full scheduler, each group can
318 - be assigned its own weight.
319 -
320 - - If the cgroups interface is not used, only I/O priorities can be
321 - assigned to processes, with ioprio values mapped to weights
322 - with the relation weight = IOPRIO_BE_NR - ioprio.
323 -
324 - - ioprio classes are served in strict priority order, i.e., lower
325 - priority queues are not served as long as there are higher
326 - priority queues. Among queues in the same class the bandwidth is
327 - distributed in proportion to the weight of each queue. A very
328 - thin extra bandwidth is however guaranteed to the Idle class, to
329 - prevent it from starving.
330 -
331 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
332 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
333 ----
334 - block/bfq-cgroup.c | 910 ++++++++++++++
335 - block/bfq-ioc.c | 36 +
336 - block/bfq-iosched.c | 3268 +++++++++++++++++++++++++++++++++++++++++++++++++++
337 - block/bfq-sched.c | 1077 +++++++++++++++++
338 - block/bfq.h | 614 ++++++++++
339 - 5 files changed, 5905 insertions(+)
340 - create mode 100644 block/bfq-cgroup.c
341 - create mode 100644 block/bfq-ioc.c
342 - create mode 100644 block/bfq-iosched.c
343 - create mode 100644 block/bfq-sched.c
344 - create mode 100644 block/bfq.h
345 -
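When the cgroups interface is not used, the description above maps ioprio values to weights as weight = IOPRIO_BE_NR - ioprio. As a stand-alone illustration (not part of the patch; IOPRIO_BE_NR is 8 for the best-effort class), the resulting mapping is:

#include <stdio.h>

#define IOPRIO_BE_NR 8        /* number of best-effort ioprio levels */

int main(void)
{
        int ioprio;

        /* Lower ioprio means higher priority, hence a larger BFQ weight. */
        for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                printf("ioprio %d -> weight %d\n",
                       ioprio, IOPRIO_BE_NR - ioprio);
        return 0;
}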
346 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
347 -new file mode 100644
348 -index 0000000..b889acf
349 ---- /dev/null
350 -+++ b/block/bfq-cgroup.c
351 -@@ -0,0 +1,910 @@
352 -+/*
353 -+ * BFQ: CGROUPS support.
354 -+ *
355 -+ * Based on ideas and code from CFQ:
356 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
357 -+ *
358 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
359 -+ * Paolo Valente <paolo.valente@×××××××.it>
360 -+ *
361 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
362 -+ *
363 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
364 -+ */
365 -+
366 -+#ifdef CONFIG_CGROUP_BFQIO
367 -+
368 -+static DEFINE_MUTEX(bfqio_mutex);
369 -+
370 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
371 -+{
372 -+ return bgrp ? !bgrp->online : false;
373 -+}
374 -+
375 -+static struct bfqio_cgroup bfqio_root_cgroup = {
376 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
377 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
378 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
379 -+};
380 -+
381 -+static inline void bfq_init_entity(struct bfq_entity *entity,
382 -+ struct bfq_group *bfqg)
383 -+{
384 -+ entity->weight = entity->new_weight;
385 -+ entity->orig_weight = entity->new_weight;
386 -+ entity->ioprio = entity->new_ioprio;
387 -+ entity->ioprio_class = entity->new_ioprio_class;
388 -+ entity->parent = bfqg->my_entity;
389 -+ entity->sched_data = &bfqg->sched_data;
390 -+}
391 -+
392 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
393 -+{
394 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
395 -+}
396 -+
397 -+/*
398 -+ * Search the bfq_group for bfqd into the hash table (by now only a list)
399 -+ * of bgrp. Must be called under rcu_read_lock().
400 -+ */
401 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
402 -+ struct bfq_data *bfqd)
403 -+{
404 -+ struct bfq_group *bfqg;
405 -+ void *key;
406 -+
407 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
408 -+ key = rcu_dereference(bfqg->bfqd);
409 -+ if (key == bfqd)
410 -+ return bfqg;
411 -+ }
412 -+
413 -+ return NULL;
414 -+}
415 -+
416 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
417 -+ struct bfq_group *bfqg)
418 -+{
419 -+ struct bfq_entity *entity = &bfqg->entity;
420 -+
421 -+ /*
422 -+ * If the weight of the entity has never been set via the sysfs
423 -+ * interface, then bgrp->weight == 0. In this case we initialize
424 -+ * the weight from the current ioprio value. Otherwise, the group
425 -+ * weight, if set, has priority over the ioprio value.
426 -+ */
427 -+ if (bgrp->weight == 0) {
428 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
429 -+ entity->new_ioprio = bgrp->ioprio;
430 -+ } else {
431 -+ entity->new_weight = bgrp->weight;
432 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
433 -+ }
434 -+ entity->orig_weight = entity->weight = entity->new_weight;
435 -+ entity->ioprio = entity->new_ioprio;
436 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
437 -+ entity->my_sched_data = &bfqg->sched_data;
438 -+}
439 -+
440 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
441 -+ struct bfq_group *parent)
442 -+{
443 -+ struct bfq_entity *entity;
444 -+
445 -+ BUG_ON(parent == NULL);
446 -+ BUG_ON(bfqg == NULL);
447 -+
448 -+ entity = &bfqg->entity;
449 -+ entity->parent = parent->my_entity;
450 -+ entity->sched_data = &parent->sched_data;
451 -+}
452 -+
453 -+/**
454 -+ * bfq_group_chain_alloc - allocate a chain of groups.
455 -+ * @bfqd: queue descriptor.
456 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
457 -+ *
458 -+ * Allocate a chain of groups starting from the one belonging to
459 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
460 -+ * to the root has already an allocated group on @bfqd.
461 -+ */
462 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
463 -+ struct cgroup_subsys_state *css)
464 -+{
465 -+ struct bfqio_cgroup *bgrp;
466 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
467 -+
468 -+ for (; css != NULL; css = css->parent) {
469 -+ bgrp = css_to_bfqio(css);
470 -+
471 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
472 -+ if (bfqg != NULL) {
473 -+ /*
474 -+ * All the cgroups in the path from there to the
475 -+ * root must have a bfq_group for bfqd, so we don't
476 -+ * need any more allocations.
477 -+ */
478 -+ break;
479 -+ }
480 -+
481 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
482 -+ if (bfqg == NULL)
483 -+ goto cleanup;
484 -+
485 -+ bfq_group_init_entity(bgrp, bfqg);
486 -+ bfqg->my_entity = &bfqg->entity;
487 -+
488 -+ if (leaf == NULL) {
489 -+ leaf = bfqg;
490 -+ prev = leaf;
491 -+ } else {
492 -+ bfq_group_set_parent(prev, bfqg);
493 -+ /*
494 -+ * Build a list of allocated nodes using the bfqd
495 -+ * field, that is still unused and will be initialized
496 -+ * only after the node is connected.
497 -+ */
498 -+ prev->bfqd = bfqg;
499 -+ prev = bfqg;
500 -+ }
501 -+ }
502 -+
503 -+ return leaf;
504 -+
505 -+cleanup:
506 -+ while (leaf != NULL) {
507 -+ prev = leaf;
508 -+ leaf = leaf->bfqd;
509 -+ kfree(prev);
510 -+ }
511 -+
512 -+ return NULL;
513 -+}
514 -+
515 -+/**
516 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
517 -+ * @bfqd: the queue descriptor.
518 -+ * @css: the leaf cgroup_subsys_state to start from.
519 -+ * @leaf: the leaf group (to be associated to @cgroup).
520 -+ *
521 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
522 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
523 -+ * hierarchy that already has a group associated to @bfqd all the nodes
524 -+ * in the path to the root cgroup have one too.
525 -+ *
526 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
527 -+ * per device) while the bfqio_cgroup lock protects the list of groups
528 -+ * belonging to the same cgroup.
529 -+ */
530 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
531 -+ struct cgroup_subsys_state *css,
532 -+ struct bfq_group *leaf)
533 -+{
534 -+ struct bfqio_cgroup *bgrp;
535 -+ struct bfq_group *bfqg, *next, *prev = NULL;
536 -+ unsigned long flags;
537 -+
538 -+ assert_spin_locked(bfqd->queue->queue_lock);
539 -+
540 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
541 -+ bgrp = css_to_bfqio(css);
542 -+ next = leaf->bfqd;
543 -+
544 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
545 -+ BUG_ON(bfqg != NULL);
546 -+
547 -+ spin_lock_irqsave(&bgrp->lock, flags);
548 -+
549 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
550 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
551 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
552 -+
553 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
554 -+
555 -+ prev = leaf;
556 -+ leaf = next;
557 -+ }
558 -+
559 -+ BUG_ON(css == NULL && leaf != NULL);
560 -+ if (css != NULL && prev != NULL) {
561 -+ bgrp = css_to_bfqio(css);
562 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
563 -+ bfq_group_set_parent(prev, bfqg);
564 -+ }
565 -+}
566 -+
567 -+/**
568 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
569 -+ * @bfqd: queue descriptor.
570 -+ * @cgroup: cgroup being searched for.
571 -+ *
572 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
573 -+ * necessary. When a group is returned all the cgroups in the path
574 -+ * to the root have a group associated to @bfqd.
575 -+ *
576 -+ * If the allocation fails, return the root group: this breaks guarantees
577 -+ * but is a safe fallback. If this loss becomes a problem it can be
578 -+ * mitigated using the equivalent weight (given by the product of the
579 -+ * weights of the groups in the path from @group to the root) in the
580 -+ * root scheduler.
581 -+ *
582 -+ * We allocate all the missing nodes in the path from the leaf cgroup
583 -+ * to the root and we connect the nodes only after all the allocations
584 -+ * have been successful.
585 -+ */
586 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
587 -+ struct cgroup_subsys_state *css)
588 -+{
589 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
590 -+ struct bfq_group *bfqg;
591 -+
592 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
593 -+ if (bfqg != NULL)
594 -+ return bfqg;
595 -+
596 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
597 -+ if (bfqg != NULL)
598 -+ bfq_group_chain_link(bfqd, css, bfqg);
599 -+ else
600 -+ bfqg = bfqd->root_group;
601 -+
602 -+ return bfqg;
603 -+}
604 -+
605 -+/**
606 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
607 -+ * @bfqd: queue descriptor.
608 -+ * @bfqq: the queue to move.
609 -+ * @entity: @bfqq's entity.
610 -+ * @bfqg: the group to move to.
611 -+ *
612 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
613 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
614 -+ *
615 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
616 -+ * not disappear (by now this just means that we are called under
617 -+ * rcu_read_lock()).
618 -+ */
619 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
620 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
621 -+{
622 -+ int busy, resume;
623 -+
624 -+ busy = bfq_bfqq_busy(bfqq);
625 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
626 -+
627 -+ BUG_ON(resume && !entity->on_st);
628 -+ BUG_ON(busy && !resume && entity->on_st &&
629 -+ bfqq != bfqd->in_service_queue);
630 -+
631 -+ if (busy) {
632 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
633 -+
634 -+ if (!resume)
635 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
636 -+ else
637 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
638 -+ } else if (entity->on_st)
639 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
640 -+
641 -+ /*
642 -+ * Here we use a reference to bfqg. We don't need a refcounter
643 -+ * as the cgroup reference will not be dropped, so that its
644 -+ * destroy() callback will not be invoked.
645 -+ */
646 -+ entity->parent = bfqg->my_entity;
647 -+ entity->sched_data = &bfqg->sched_data;
648 -+
649 -+ if (busy && resume)
650 -+ bfq_activate_bfqq(bfqd, bfqq);
651 -+
652 -+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
653 -+ bfq_schedule_dispatch(bfqd);
654 -+}
655 -+
656 -+/**
657 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
658 -+ * @bfqd: the queue descriptor.
659 -+ * @bic: the bic to move.
660 -+ * @cgroup: the cgroup to move to.
661 -+ *
662 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
663 -+ * has to make sure that the reference to cgroup is valid across the call.
664 -+ *
665 -+ * NOTE: an alternative approach might have been to store the current
666 -+ * cgroup in bfqq and getting a reference to it, reducing the lookup
667 -+ * time here, at the price of slightly more complex code.
668 -+ */
669 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
670 -+ struct bfq_io_cq *bic,
671 -+ struct cgroup_subsys_state *css)
672 -+{
673 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
674 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
675 -+ struct bfq_entity *entity;
676 -+ struct bfq_group *bfqg;
677 -+ struct bfqio_cgroup *bgrp;
678 -+
679 -+ bgrp = css_to_bfqio(css);
680 -+
681 -+ bfqg = bfq_find_alloc_group(bfqd, css);
682 -+ if (async_bfqq != NULL) {
683 -+ entity = &async_bfqq->entity;
684 -+
685 -+ if (entity->sched_data != &bfqg->sched_data) {
686 -+ bic_set_bfqq(bic, NULL, 0);
687 -+ bfq_log_bfqq(bfqd, async_bfqq,
688 -+ "bic_change_group: %p %d",
689 -+ async_bfqq, atomic_read(&async_bfqq->ref));
690 -+ bfq_put_queue(async_bfqq);
691 -+ }
692 -+ }
693 -+
694 -+ if (sync_bfqq != NULL) {
695 -+ entity = &sync_bfqq->entity;
696 -+ if (entity->sched_data != &bfqg->sched_data)
697 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
698 -+ }
699 -+
700 -+ return bfqg;
701 -+}
702 -+
703 -+/**
704 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
705 -+ * @bic: the bic being migrated.
706 -+ * @cgroup: the destination cgroup.
707 -+ *
708 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
709 -+ * moved into its new parent group.
710 -+ */
711 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
712 -+ struct cgroup_subsys_state *css)
713 -+{
714 -+ struct bfq_data *bfqd;
715 -+ unsigned long uninitialized_var(flags);
716 -+
717 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
718 -+ &flags);
719 -+ if (bfqd != NULL) {
720 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
721 -+ bfq_put_bfqd_unlock(bfqd, &flags);
722 -+ }
723 -+}
724 -+
725 -+/**
726 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
727 -+ * @bic: the @bic to update.
728 -+ *
729 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
730 -+ * We need this in addition to moving bics during the cgroup attach
731 -+ * phase because the task owning @bic could be at its first disk
732 -+ * access or we may end up in the root cgroup as the result of a
733 -+ * memory allocation failure and here we try to move to the right
734 -+ * group.
735 -+ *
736 -+ * Must be called under the queue lock. It is safe to use the returned
737 -+ * value even after the rcu_read_unlock() as the migration/destruction
738 -+ * paths act under the queue lock too. IOW it is impossible to race with
739 -+ * group migration/destruction and end up with an invalid group as:
740 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
741 -+ * has started execution, as current holds a reference to it,
742 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
743 -+ * migrated to a different cgroup] its attach() callback will have
744 -+ * taken care of removing all the references to the old cgroup data.
745 -+ */
746 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
747 -+{
748 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
749 -+ struct bfq_group *bfqg;
750 -+ struct cgroup_subsys_state *css;
751 -+
752 -+ BUG_ON(bfqd == NULL);
753 -+
754 -+ rcu_read_lock();
755 -+ css = task_css(current, bfqio_subsys_id);
756 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
757 -+ rcu_read_unlock();
758 -+
759 -+ return bfqg;
760 -+}
761 -+
762 -+/**
763 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
764 -+ * @st: the service tree being flushed.
765 -+ */
766 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
767 -+{
768 -+ struct bfq_entity *entity = st->first_idle;
769 -+
770 -+ for (; entity != NULL; entity = st->first_idle)
771 -+ __bfq_deactivate_entity(entity, 0);
772 -+}
773 -+
774 -+/**
775 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
776 -+ * @bfqd: the device data structure with the root group.
777 -+ * @entity: the entity to move.
778 -+ */
779 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
780 -+ struct bfq_entity *entity)
781 -+{
782 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
783 -+
784 -+ BUG_ON(bfqq == NULL);
785 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
786 -+ return;
787 -+}
788 -+
789 -+/**
790 -+ * bfq_reparent_active_entities - move to the root group all active entities.
791 -+ * @bfqd: the device data structure with the root group.
792 -+ * @bfqg: the group to move from.
793 -+ * @st: the service tree with the entities.
794 -+ *
795 -+ * Needs queue_lock to be taken and reference to be valid over the call.
796 -+ */
797 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
798 -+ struct bfq_group *bfqg,
799 -+ struct bfq_service_tree *st)
800 -+{
801 -+ struct rb_root *active = &st->active;
802 -+ struct bfq_entity *entity = NULL;
803 -+
804 -+ if (!RB_EMPTY_ROOT(&st->active))
805 -+ entity = bfq_entity_of(rb_first(active));
806 -+
807 -+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
808 -+ bfq_reparent_leaf_entity(bfqd, entity);
809 -+
810 -+ if (bfqg->sched_data.active_entity != NULL)
811 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
812 -+
813 -+ return;
814 -+}
815 -+
816 -+/**
817 -+ * bfq_destroy_group - destroy @bfqg.
818 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
819 -+ * @bfqg: the group being destroyed.
820 -+ *
821 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
822 -+ */
823 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
824 -+{
825 -+ struct bfq_data *bfqd;
826 -+ struct bfq_service_tree *st;
827 -+ struct bfq_entity *entity = bfqg->my_entity;
828 -+ unsigned long uninitialized_var(flags);
829 -+ int i;
830 -+
831 -+ hlist_del(&bfqg->group_node);
832 -+
833 -+ /*
834 -+ * Empty all service_trees belonging to this group before deactivating
835 -+ * the group itself.
836 -+ */
837 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
838 -+ st = bfqg->sched_data.service_tree + i;
839 -+
840 -+ /*
841 -+ * The idle tree may still contain bfq_queues belonging
842 -+ * to exited task because they never migrated to a different
843 -+ * cgroup from the one being destroyed now. Noone else
844 -+ * can access them so it's safe to act without any lock.
845 -+ */
846 -+ bfq_flush_idle_tree(st);
847 -+
848 -+ /*
849 -+ * It may happen that some queues are still active
850 -+ * (busy) upon group destruction (if the corresponding
851 -+ * processes have been forced to terminate). We move
852 -+ * all the leaf entities corresponding to these queues
853 -+ * to the root_group.
854 -+ * Also, it may happen that the group has an entity
855 -+ * under service, which is disconnected from the active
856 -+ * tree: it must be moved, too.
857 -+ * There is no need to put the sync queues, as the
858 -+ * scheduler has taken no reference.
859 -+ */
860 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
861 -+ if (bfqd != NULL) {
862 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
863 -+ bfq_put_bfqd_unlock(bfqd, &flags);
864 -+ }
865 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
866 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
867 -+ }
868 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
869 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
870 -+
871 -+ /*
872 -+ * We may race with device destruction, take extra care when
873 -+ * dereferencing bfqg->bfqd.
874 -+ */
875 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
876 -+ if (bfqd != NULL) {
877 -+ hlist_del(&bfqg->bfqd_node);
878 -+ __bfq_deactivate_entity(entity, 0);
879 -+ bfq_put_async_queues(bfqd, bfqg);
880 -+ bfq_put_bfqd_unlock(bfqd, &flags);
881 -+ }
882 -+ BUG_ON(entity->tree != NULL);
883 -+
884 -+ /*
885 -+ * No need to defer the kfree() to the end of the RCU grace
886 -+ * period: we are called from the destroy() callback of our
887 -+ * cgroup, so we can be sure that no one is a) still using
888 -+ * this cgroup or b) doing lookups in it.
889 -+ */
890 -+ kfree(bfqg);
891 -+}
892 -+
893 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
894 -+{
895 -+ struct hlist_node *tmp;
896 -+ struct bfq_group *bfqg;
897 -+
898 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
899 -+ bfq_end_raising_async_queues(bfqd, bfqg);
900 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
901 -+}
902 -+
903 -+/**
904 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
905 -+ * @bfqd: the device descriptor being exited.
906 -+ *
907 -+ * When the device exits we just make sure that no lookup can return
908 -+ * the now unused group structures. They will be deallocated on cgroup
909 -+ * destruction.
910 -+ */
911 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
912 -+{
913 -+ struct hlist_node *tmp;
914 -+ struct bfq_group *bfqg;
915 -+
916 -+ bfq_log(bfqd, "disconnect_groups beginning");
917 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
918 -+ hlist_del(&bfqg->bfqd_node);
919 -+
920 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
921 -+
922 -+ /*
923 -+ * Don't remove from the group hash, just set an
924 -+ * invalid key. No lookups can race with the
925 -+ * assignment as bfqd is being destroyed; this
926 -+ * implies also that new elements cannot be added
927 -+ * to the list.
928 -+ */
929 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
930 -+
931 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
932 -+ bfqg);
933 -+ bfq_put_async_queues(bfqd, bfqg);
934 -+ }
935 -+}
936 -+
937 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
938 -+{
939 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
940 -+ struct bfq_group *bfqg = bfqd->root_group;
941 -+
942 -+ bfq_put_async_queues(bfqd, bfqg);
943 -+
944 -+ spin_lock_irq(&bgrp->lock);
945 -+ hlist_del_rcu(&bfqg->group_node);
946 -+ spin_unlock_irq(&bgrp->lock);
947 -+
948 -+ /*
949 -+ * No need to synchronize_rcu() here: since the device is gone
950 -+ * there cannot be any read-side access to its root_group.
951 -+ */
952 -+ kfree(bfqg);
953 -+}
954 -+
955 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
956 -+{
957 -+ struct bfq_group *bfqg;
958 -+ struct bfqio_cgroup *bgrp;
959 -+ int i;
960 -+
961 -+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
962 -+ if (bfqg == NULL)
963 -+ return NULL;
964 -+
965 -+ bfqg->entity.parent = NULL;
966 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
967 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
968 -+
969 -+ bgrp = &bfqio_root_cgroup;
970 -+ spin_lock_irq(&bgrp->lock);
971 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
972 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
973 -+ spin_unlock_irq(&bgrp->lock);
974 -+
975 -+ return bfqg;
976 -+}
977 -+
978 -+#define SHOW_FUNCTION(__VAR) \
979 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
980 -+ struct cftype *cftype) \
981 -+{ \
982 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
983 -+ u64 ret = -ENODEV; \
984 -+ \
985 -+ mutex_lock(&bfqio_mutex); \
986 -+ if (bfqio_is_removed(bgrp)) \
987 -+ goto out_unlock; \
988 -+ \
989 -+ spin_lock_irq(&bgrp->lock); \
990 -+ ret = bgrp->__VAR; \
991 -+ spin_unlock_irq(&bgrp->lock); \
992 -+ \
993 -+out_unlock: \
994 -+ mutex_unlock(&bfqio_mutex); \
995 -+ return ret; \
996 -+}
997 -+
998 -+SHOW_FUNCTION(weight);
999 -+SHOW_FUNCTION(ioprio);
1000 -+SHOW_FUNCTION(ioprio_class);
1001 -+#undef SHOW_FUNCTION
1002 -+
1003 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
1004 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
1005 -+ struct cftype *cftype, \
1006 -+ u64 val) \
1007 -+{ \
1008 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
1009 -+ struct bfq_group *bfqg; \
1010 -+ int ret = -EINVAL; \
1011 -+ \
1012 -+ if (val < (__MIN) || val > (__MAX)) \
1013 -+ return ret; \
1014 -+ \
1015 -+ ret = -ENODEV; \
1016 -+ mutex_lock(&bfqio_mutex); \
1017 -+ if (bfqio_is_removed(bgrp)) \
1018 -+ goto out_unlock; \
1019 -+ ret = 0; \
1020 -+ \
1021 -+ spin_lock_irq(&bgrp->lock); \
1022 -+ bgrp->__VAR = (unsigned short)val; \
1023 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
1024 -+ /* \
1025 -+ * Setting the ioprio_changed flag of the entity \
1026 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
1027 -+ * the value of the weight to its ioprio mapping. \
1028 -+ * Set the flag only if necessary. \
1029 -+ */ \
1030 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
1031 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
1032 -+ smp_wmb(); \
1033 -+ bfqg->entity.ioprio_changed = 1; \
1034 -+ } \
1035 -+ } \
1036 -+ spin_unlock_irq(&bgrp->lock); \
1037 -+ \
1038 -+out_unlock: \
1039 -+ mutex_unlock(&bfqio_mutex); \
1040 -+ return ret; \
1041 -+}
1042 -+
1043 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
1044 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
1045 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
1046 -+#undef STORE_FUNCTION
1047 -+
1048 -+static struct cftype bfqio_files[] = {
1049 -+ {
1050 -+ .name = "weight",
1051 -+ .read_u64 = bfqio_cgroup_weight_read,
1052 -+ .write_u64 = bfqio_cgroup_weight_write,
1053 -+ },
1054 -+ {
1055 -+ .name = "ioprio",
1056 -+ .read_u64 = bfqio_cgroup_ioprio_read,
1057 -+ .write_u64 = bfqio_cgroup_ioprio_write,
1058 -+ },
1059 -+ {
1060 -+ .name = "ioprio_class",
1061 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
1062 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
1063 -+ },
1064 -+ { }, /* terminate */
1065 -+};
1066 -+
1067 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
1068 -+ *parent_css)
1069 -+{
1070 -+ struct bfqio_cgroup *bgrp;
1071 -+
1072 -+ if (parent_css != NULL) {
1073 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
1074 -+ if (bgrp == NULL)
1075 -+ return ERR_PTR(-ENOMEM);
1076 -+ } else
1077 -+ bgrp = &bfqio_root_cgroup;
1078 -+
1079 -+ spin_lock_init(&bgrp->lock);
1080 -+ INIT_HLIST_HEAD(&bgrp->group_data);
1081 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
1082 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
1083 -+
1084 -+ return &bgrp->css;
1085 -+}
1086 -+
1087 -+/*
1088 -+ * We cannot support shared io contexts, as we have no means to support
1089 -+ * two tasks with the same ioc in two different groups without major rework
1090 -+ * of the main bic/bfqq data structures. By now we allow a task to change
1091 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
1092 -+ * behavior is that a group containing a task that forked using CLONE_IO
1093 -+ * will not be destroyed until the tasks sharing the ioc die.
1094 -+ */
1095 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
1096 -+ struct cgroup_taskset *tset)
1097 -+{
1098 -+ struct task_struct *task;
1099 -+ struct io_context *ioc;
1100 -+ int ret = 0;
1101 -+
1102 -+ cgroup_taskset_for_each(task, css, tset) {
1103 -+ /*
1104 -+ * task_lock() is needed to avoid races with
1105 -+ * exit_io_context()
1106 -+ */
1107 -+ task_lock(task);
1108 -+ ioc = task->io_context;
1109 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1110 -+ /*
1111 -+ * ioc == NULL means that the task is either too young
1112 -+ * or exiting: if it has still no ioc the ioc can't be
1113 -+ * shared, if the task is exiting the attach will fail
1114 -+ * anyway, no matter what we return here.
1115 -+ */
1116 -+ ret = -EINVAL;
1117 -+ task_unlock(task);
1118 -+ if (ret)
1119 -+ break;
1120 -+ }
1121 -+
1122 -+ return ret;
1123 -+}
1124 -+
1125 -+static void bfqio_attach(struct cgroup_subsys_state *css,
1126 -+ struct cgroup_taskset *tset)
1127 -+{
1128 -+ struct task_struct *task;
1129 -+ struct io_context *ioc;
1130 -+ struct io_cq *icq;
1131 -+
1132 -+ /*
1133 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
1134 -+ * new group has not yet been tested.
1135 -+ */
1136 -+ cgroup_taskset_for_each(task, css, tset) {
1137 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1138 -+ if (ioc) {
1139 -+ /*
1140 -+ * Handle cgroup change here.
1141 -+ */
1142 -+ rcu_read_lock();
1143 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1144 -+ if (!strncmp(
1145 -+ icq->q->elevator->type->elevator_name,
1146 -+ "bfq", ELV_NAME_MAX))
1147 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
1148 -+ css);
1149 -+ rcu_read_unlock();
1150 -+ put_io_context(ioc);
1151 -+ }
1152 -+ }
1153 -+}
1154 -+
1155 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
1156 -+{
1157 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1158 -+ struct hlist_node *tmp;
1159 -+ struct bfq_group *bfqg;
1160 -+
1161 -+ /*
1162 -+ * Since we are destroying the cgroup, there are no more tasks
1163 -+ * referencing it, and all the RCU grace periods that may have
1164 -+ * referenced it are ended (as the destruction of the parent
1165 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1166 -+ * anything else and we don't need any synchronization.
1167 -+ */
1168 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1169 -+ bfq_destroy_group(bgrp, bfqg);
1170 -+
1171 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
1172 -+
1173 -+ kfree(bgrp);
1174 -+}
1175 -+
1176 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
1177 -+{
1178 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1179 -+
1180 -+ mutex_lock(&bfqio_mutex);
1181 -+ bgrp->online = true;
1182 -+ mutex_unlock(&bfqio_mutex);
1183 -+
1184 -+ return 0;
1185 -+}
1186 -+
1187 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
1188 -+{
1189 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1190 -+
1191 -+ mutex_lock(&bfqio_mutex);
1192 -+ bgrp->online = false;
1193 -+ mutex_unlock(&bfqio_mutex);
1194 -+}
1195 -+
1196 -+struct cgroup_subsys bfqio_subsys = {
1197 -+ .name = "bfqio",
1198 -+ .css_alloc = bfqio_create,
1199 -+ .css_online = bfqio_css_online,
1200 -+ .css_offline = bfqio_css_offline,
1201 -+ .can_attach = bfqio_can_attach,
1202 -+ .attach = bfqio_attach,
1203 -+ .css_free = bfqio_destroy,
1204 -+ .subsys_id = bfqio_subsys_id,
1205 -+ .base_cftypes = bfqio_files,
1206 -+};
1207 -+#else
1208 -+static inline void bfq_init_entity(struct bfq_entity *entity,
1209 -+ struct bfq_group *bfqg)
1210 -+{
1211 -+ entity->weight = entity->new_weight;
1212 -+ entity->orig_weight = entity->new_weight;
1213 -+ entity->ioprio = entity->new_ioprio;
1214 -+ entity->ioprio_class = entity->new_ioprio_class;
1215 -+ entity->sched_data = &bfqg->sched_data;
1216 -+}
1217 -+
1218 -+static inline struct bfq_group *
1219 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1220 -+{
1221 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
1222 -+ return bfqd->root_group;
1223 -+}
1224 -+
1225 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1226 -+ struct bfq_queue *bfqq,
1227 -+ struct bfq_entity *entity,
1228 -+ struct bfq_group *bfqg)
1229 -+{
1230 -+}
1231 -+
1232 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
1233 -+{
1234 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1235 -+}
1236 -+
1237 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1238 -+{
1239 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
1240 -+}
1241 -+
1242 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
1243 -+{
1244 -+ kfree(bfqd->root_group);
1245 -+}
1246 -+
1247 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1248 -+{
1249 -+ struct bfq_group *bfqg;
1250 -+ int i;
1251 -+
1252 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1253 -+ if (bfqg == NULL)
1254 -+ return NULL;
1255 -+
1256 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1257 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1258 -+
1259 -+ return bfqg;
1260 -+}
1261 -+#endif
1262 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1263 -new file mode 100644
1264 -index 0000000..7f6b000
1265 ---- /dev/null
1266 -+++ b/block/bfq-ioc.c
1267 -@@ -0,0 +1,36 @@
1268 -+/*
1269 -+ * BFQ: I/O context handling.
1270 -+ *
1271 -+ * Based on ideas and code from CFQ:
1272 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1273 -+ *
1274 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1275 -+ * Paolo Valente <paolo.valente@×××××××.it>
1276 -+ *
1277 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1278 -+ */
1279 -+
1280 -+/**
1281 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1282 -+ * @icq: the iocontext queue.
1283 -+ */
1284 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1285 -+{
1286 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
1287 -+ return container_of(icq, struct bfq_io_cq, icq);
1288 -+}
1289 -+
1290 -+/**
1291 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1292 -+ * @bfqd: the lookup key.
1293 -+ * @ioc: the io_context of the process doing I/O.
1294 -+ *
1295 -+ * Queue lock must be held.
1296 -+ */
1297 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1298 -+ struct io_context *ioc)
1299 -+{
1300 -+ if (ioc)
1301 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1302 -+ return NULL;
1303 -+}
1304 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1305 -new file mode 100644
1306 -index 0000000..7670400
1307 ---- /dev/null
1308 -+++ b/block/bfq-iosched.c
1309 -@@ -0,0 +1,3268 @@
1310 -+/*
1311 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
1312 -+ *
1313 -+ * Based on ideas and code from CFQ:
1314 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1315 -+ *
1316 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1317 -+ * Paolo Valente <paolo.valente@×××××××.it>
1318 -+ *
1319 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1320 -+ *
1321 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1322 -+ *
1323 -+ * BFQ is a proportional share disk scheduling algorithm based on the
1324 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
1325 -+ * number of sectors, to tasks instead of time slices. The disk is not granted
1326 -+ * to the in-service task for a given time slice, but until it has exhausted
1327 -+ * its assigned budget. This change from the time to the service domain allows
1328 -+ * BFQ to distribute the disk bandwidth among tasks as desired, without any
1329 -+ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
1330 -+ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
1331 -+ * their budgets (more precisely BFQ schedules queues associated to tasks).
1332 -+ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
1333 -+ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
1334 -+ * latencies to interactive and soft real-time applications.
1335 -+ *
1336 -+ * BFQ is described in [1], where also a reference to the initial, more
1337 -+ * theoretical paper on BFQ can be found. The interested reader can find in
1338 -+ * the latter paper full details on the main algorithm as well as formulas of
1339 -+ * the guarantees, plus formal proofs of all the properties. With respect to
1340 -+ * the version of BFQ presented in these papers, this implementation adds a
1341 -+ * few more heuristics, such as the one that guarantees a low latency to soft
1342 -+ * real-time applications, and a hierarchical extension based on H-WF2Q+.
1343 -+ *
1344 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1345 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1346 -+ * complexity derives from the one introduced with EEVDF in [3].
1347 -+ *
1348 -+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1349 -+ * with the BFQ Disk I/O Scheduler'',
1350 -+ * Proceedings of the 5th Annual International Systems and Storage
1351 -+ * Conference (SYSTOR '12), June 2012.
1352 -+ *
1353 -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1354 -+ *
1355 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1356 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1357 -+ * Oct 1997.
1358 -+ *
1359 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1360 -+ *
1361 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1362 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
1363 -+ * Resource Allocation,'' technical report.
1364 -+ *
1365 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1366 -+ */
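/*
 * Illustration (not part of the original patch): because service is
 * accounted in sectors rather than time, two continuously backlogged
 * queues with weights 6 and 2 receive disk service in a 3:1 ratio over
 * any sufficiently long interval, independently of where on the disk
 * they read and therefore of ZBR or seek-time effects.
 */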
1367 -+#include <linux/module.h>
1368 -+#include <linux/slab.h>
1369 -+#include <linux/blkdev.h>
1370 -+#include <linux/cgroup.h>
1371 -+#include <linux/elevator.h>
1372 -+#include <linux/jiffies.h>
1373 -+#include <linux/rbtree.h>
1374 -+#include <linux/ioprio.h>
1375 -+#include "bfq.h"
1376 -+#include "blk.h"
1377 -+
1378 -+/* Max number of dispatches in one round of service. */
1379 -+static const int bfq_quantum = 4;
1380 -+
1381 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1382 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1383 -+
1384 -+/* Maximum backwards seek, in KiB. */
1385 -+static const int bfq_back_max = 16 * 1024;
1386 -+
1387 -+/* Penalty of a backwards seek, in number of sectors. */
1388 -+static const int bfq_back_penalty = 2;
1389 -+
1390 -+/* Idling period duration, in jiffies. */
1391 -+static int bfq_slice_idle = HZ / 125;
1392 -+
1393 -+/* Default maximum budget values, in sectors and number of requests. */
1394 -+static const int bfq_default_max_budget = 16 * 1024;
1395 -+static const int bfq_max_budget_async_rq = 4;
1396 -+
1397 -+/*
1398 -+ * Async to sync throughput distribution is controlled as follows:
1399 -+ * when an async request is served, the entity is charged the number
1400 -+ * of sectors of the request, multiplied by the factor below
1401 -+ */
1402 -+static const int bfq_async_charge_factor = 10;
1403 -+
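/*
 * Illustration (not part of the original patch): with the factor above,
 * serving a 64-sector async request charges 64 * 10 = 640 sectors to the
 * owning entity's budget, which skews throughput towards sync requests.
 */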
1404 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
1405 -+static const int bfq_timeout_sync = HZ / 8;
1406 -+static int bfq_timeout_async = HZ / 25;
1407 -+
1408 -+struct kmem_cache *bfq_pool;
1409 -+
1410 -+/* Below this threshold (in ms), we consider thinktime immediate. */
1411 -+#define BFQ_MIN_TT 2
1412 -+
1413 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
1414 -+#define BFQ_HW_QUEUE_THRESHOLD 4
1415 -+#define BFQ_HW_QUEUE_SAMPLES 32
1416 -+
1417 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1418 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1419 -+
1420 -+/* Min samples used for peak rate estimation (for autotuning). */
1421 -+#define BFQ_PEAK_RATE_SAMPLES 32
1422 -+
1423 -+/* Shift used for peak rate fixed precision calculations. */
1424 -+#define BFQ_RATE_SHIFT 16
1425 -+
1426 -+/*
1427 -+ * The duration of the weight raising for interactive applications is
1428 -+ * computed automatically (as default behaviour), using the following
1429 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
1430 -+ * disk, and R and T are two reference parameters. In particular, R is
1431 -+ * the peak rate of a reference disk, and T is about the maximum time
1432 -+ * for starting popular large applications on that disk, under BFQ and
1433 -+ * while reading two files in parallel. Finally, BFQ uses two
1434 -+ * different pairs (R, T) depending on whether the disk is rotational
1435 -+ * or non-rotational.
1436 -+ */
1437 -+#define T_rot (msecs_to_jiffies(5500))
1438 -+#define T_nonrot (msecs_to_jiffies(2000))
1439 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1440 -+#define R_rot 17415
1441 -+#define R_nonrot 34791
1442 -+
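/*
 * Worked example of the formula above (illustration, not part of the
 * original patch): if the estimated peak rate r of the drive equals the
 * reference rate R of its class, the weight-raising duration is exactly
 * T, i.e. about 5500 ms on a rotational disk and 2000 ms on a
 * non-rotational one; a drive twice as fast as the reference gets half
 * of that duration.
 */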
1443 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1444 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1445 -+
1446 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1447 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1448 -+
1449 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1450 -+
1451 -+#include "bfq-ioc.c"
1452 -+#include "bfq-sched.c"
1453 -+#include "bfq-cgroup.c"
1454 -+
1455 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1456 -+ IOPRIO_CLASS_IDLE)
1457 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1458 -+ IOPRIO_CLASS_RT)
1459 -+
1460 -+#define bfq_sample_valid(samples) ((samples) > 80)
1461 -+
1462 -+/*
1463 -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1464 -+ * set (in which case it could also be a direct WRITE).
1465 -+ */
1466 -+static inline int bfq_bio_sync(struct bio *bio)
1467 -+{
1468 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1469 -+ return 1;
1470 -+
1471 -+ return 0;
1472 -+}
1473 -+
1474 -+/*
1475 -+ * Scheduler run of queue, if there are requests pending and no one in the
1476 -+ * driver that will restart queueing.
1477 -+ */
1478 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1479 -+{
1480 -+ if (bfqd->queued != 0) {
1481 -+ bfq_log(bfqd, "schedule dispatch");
1482 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1483 -+ }
1484 -+}
1485 -+
1486 -+/*
1487 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1488 -+ * We choose the request that is closest to the head right now. Distance
1489 -+ * behind the head is penalized and only allowed to a certain extent.
1490 -+ */
1491 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
1492 -+ struct request *rq1,
1493 -+ struct request *rq2,
1494 -+ sector_t last)
1495 -+{
1496 -+ sector_t s1, s2, d1 = 0, d2 = 0;
1497 -+ unsigned long back_max;
1498 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1499 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1500 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1501 -+
1502 -+ if (rq1 == NULL || rq1 == rq2)
1503 -+ return rq2;
1504 -+ if (rq2 == NULL)
1505 -+ return rq1;
1506 -+
1507 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1508 -+ return rq1;
1509 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1510 -+ return rq2;
1511 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1512 -+ return rq1;
1513 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1514 -+ return rq2;
1515 -+
1516 -+ s1 = blk_rq_pos(rq1);
1517 -+ s2 = blk_rq_pos(rq2);
1518 -+
1519 -+ /*
1520 -+ * By definition, 1KiB is 2 sectors.
1521 -+ */
1522 -+ back_max = bfqd->bfq_back_max * 2;
1523 -+
1524 -+ /*
1525 -+ * Strict one way elevator _except_ in the case where we allow
1526 -+ * short backward seeks which are biased as twice the cost of a
1527 -+ * similar forward seek.
1528 -+ */
1529 -+ if (s1 >= last)
1530 -+ d1 = s1 - last;
1531 -+ else if (s1 + back_max >= last)
1532 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
1533 -+ else
1534 -+ wrap |= BFQ_RQ1_WRAP;
1535 -+
1536 -+ if (s2 >= last)
1537 -+ d2 = s2 - last;
1538 -+ else if (s2 + back_max >= last)
1539 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
1540 -+ else
1541 -+ wrap |= BFQ_RQ2_WRAP;
1542 -+
1543 -+ /* Found required data */
1544 -+
1545 -+ /*
1546 -+ * By doing switch() on the bit mask "wrap" we avoid having to
1547 -+ * check two variables for all permutations: --> faster!
1548 -+ */
1549 -+ switch (wrap) {
1550 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1551 -+ if (d1 < d2)
1552 -+ return rq1;
1553 -+ else if (d2 < d1)
1554 -+ return rq2;
1555 -+ else {
1556 -+ if (s1 >= s2)
1557 -+ return rq1;
1558 -+ else
1559 -+ return rq2;
1560 -+ }
1561 -+
1562 -+ case BFQ_RQ2_WRAP:
1563 -+ return rq1;
1564 -+ case BFQ_RQ1_WRAP:
1565 -+ return rq2;
1566 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1567 -+ default:
1568 -+ /*
1569 -+ * Since both rqs are wrapped,
1570 -+ * start with the one that's further behind head
1571 -+ * (--> only *one* back seek required),
1572 -+ * since back seek takes more time than forward.
1573 -+ */
1574 -+ if (s1 <= s2)
1575 -+ return rq1;
1576 -+ else
1577 -+ return rq2;
1578 -+ }
1579 -+}
1580 -+
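As a rough illustration of the distance rule above: forward distances count as-is, short backward distances are multiplied by the back-seek penalty, and anything further behind the head is treated as wrapped. The standalone sketch below is not part of the patch; the sector positions, backward window and penalty are made-up values, and ties and wrapped requests are handled by the switch in the real code.

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
    sector_t last = 1000;             /* current head position            */
    sector_t s1 = 1016, s2 = 996;     /* hypothetical request positions   */
    unsigned penalty = 2;             /* assumed back-seek penalty        */
    sector_t d1, d2;

    d1 = s1 - last;                   /* forward seek: plain distance     */
    d2 = (last - s2) * penalty;       /* short backward seek: penalized   */

    printf("d1=%llu d2=%llu -> serve %s first\n",
           d1, d2, d1 < d2 ? "rq1" : "rq2");   /* d1=16, d2=8 -> rq2 */
    return 0;
}
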
1581 -+static struct bfq_queue *
1582 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1583 -+ sector_t sector, struct rb_node **ret_parent,
1584 -+ struct rb_node ***rb_link)
1585 -+{
1586 -+ struct rb_node **p, *parent;
1587 -+ struct bfq_queue *bfqq = NULL;
1588 -+
1589 -+ parent = NULL;
1590 -+ p = &root->rb_node;
1591 -+ while (*p) {
1592 -+ struct rb_node **n;
1593 -+
1594 -+ parent = *p;
1595 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1596 -+
1597 -+ /*
1598 -+ * Sort strictly based on sector. Smallest to the left,
1599 -+ * largest to the right.
1600 -+ */
1601 -+ if (sector > blk_rq_pos(bfqq->next_rq))
1602 -+ n = &(*p)->rb_right;
1603 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
1604 -+ n = &(*p)->rb_left;
1605 -+ else
1606 -+ break;
1607 -+ p = n;
1608 -+ bfqq = NULL;
1609 -+ }
1610 -+
1611 -+ *ret_parent = parent;
1612 -+ if (rb_link)
1613 -+ *rb_link = p;
1614 -+
1615 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1616 -+ (long long unsigned)sector,
1617 -+ bfqq != NULL ? bfqq->pid : 0);
1618 -+
1619 -+ return bfqq;
1620 -+}
1621 -+
1622 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1623 -+{
1624 -+ struct rb_node **p, *parent;
1625 -+ struct bfq_queue *__bfqq;
1626 -+
1627 -+ if (bfqq->pos_root != NULL) {
1628 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1629 -+ bfqq->pos_root = NULL;
1630 -+ }
1631 -+
1632 -+ if (bfq_class_idle(bfqq))
1633 -+ return;
1634 -+ if (!bfqq->next_rq)
1635 -+ return;
1636 -+
1637 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
1638 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1639 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
1640 -+ if (__bfqq == NULL) {
1641 -+ rb_link_node(&bfqq->pos_node, parent, p);
1642 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1643 -+ } else
1644 -+ bfqq->pos_root = NULL;
1645 -+}
1646 -+
1647 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1648 -+ struct bfq_queue *bfqq,
1649 -+ struct request *last)
1650 -+{
1651 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
1652 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
1653 -+ struct request *next = NULL, *prev = NULL;
1654 -+
1655 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1656 -+
1657 -+ if (rbprev != NULL)
1658 -+ prev = rb_entry_rq(rbprev);
1659 -+
1660 -+ if (rbnext != NULL)
1661 -+ next = rb_entry_rq(rbnext);
1662 -+ else {
1663 -+ rbnext = rb_first(&bfqq->sort_list);
1664 -+ if (rbnext && rbnext != &last->rb_node)
1665 -+ next = rb_entry_rq(rbnext);
1666 -+ }
1667 -+
1668 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1669 -+}
1670 -+
1671 -+static void bfq_del_rq_rb(struct request *rq)
1672 -+{
1673 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1674 -+ struct bfq_data *bfqd = bfqq->bfqd;
1675 -+ const int sync = rq_is_sync(rq);
1676 -+
1677 -+ BUG_ON(bfqq->queued[sync] == 0);
1678 -+ bfqq->queued[sync]--;
1679 -+ bfqd->queued--;
1680 -+
1681 -+ elv_rb_del(&bfqq->sort_list, rq);
1682 -+
1683 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1684 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1685 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1686 -+ /*
1687 -+ * Remove queue from request-position tree as it is empty.
1688 -+ */
1689 -+ if (bfqq->pos_root != NULL) {
1690 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1691 -+ bfqq->pos_root = NULL;
1692 -+ }
1693 -+ }
1694 -+}
1695 -+
1696 -+/* see the definition of bfq_async_charge_factor for details */
1697 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
1698 -+ struct bfq_queue *bfqq)
1699 -+{
1700 -+ return blk_rq_sectors(rq) *
1701 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1702 -+ bfq_async_charge_factor));
1703 -+}
1704 -+
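The charging rule above gives sync and weight-raised queues their raw sector count, while async, non-raised queues are charged a multiple of it. A standalone sketch of that arithmetic follows; the charge factor value is an assumption for illustration (the real constant is defined elsewhere in the scheduler).

#include <stdio.h>

int main(void)
{
    unsigned long sectors = 8;        /* hypothetical request size        */
    int async_charge_factor = 10;     /* assumed value, for illustration  */
    int sync = 0, raising_coeff = 1;  /* an async, non-raised queue       */

    unsigned long charge = sectors *
        (1 + ((!sync) * (raising_coeff == 1) * async_charge_factor));

    printf("charge = %lu sectors\n", charge);   /* 8 * (1 + 10) = 88 */
    return 0;
}
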
1705 -+/**
1706 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
1707 -+ * @bfqd: the device data the queue belongs to.
1708 -+ * @bfqq: the queue to update.
1709 -+ *
1710 -+ * If the first request of a queue changes we make sure that the queue
1711 -+ * has enough budget to serve at least its first request (if the
1712 -+ * request has grown). We do this because, if the queue does not have enough
1713 -+ * budget for its first request, it has to go through two dispatch
1714 -+ * rounds to actually get it dispatched.
1715 -+ */
1716 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
1717 -+ struct bfq_queue *bfqq)
1718 -+{
1719 -+ struct bfq_entity *entity = &bfqq->entity;
1720 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1721 -+ struct request *next_rq = bfqq->next_rq;
1722 -+ unsigned long new_budget;
1723 -+
1724 -+ if (next_rq == NULL)
1725 -+ return;
1726 -+
1727 -+ if (bfqq == bfqd->in_service_queue)
1728 -+ /*
1729 -+ * In order not to break guarantees, budgets cannot be
1730 -+ * changed after an entity has been selected.
1731 -+ */
1732 -+ return;
1733 -+
1734 -+ BUG_ON(entity->tree != &st->active);
1735 -+ BUG_ON(entity == entity->sched_data->active_entity);
1736 -+
1737 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
1738 -+ bfq_serv_to_charge(next_rq, bfqq));
1739 -+ entity->budget = new_budget;
1740 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1741 -+ bfq_activate_bfqq(bfqd, bfqq);
1742 -+}
1743 -+
1744 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1745 -+{
1746 -+ u64 dur;
1747 -+
1748 -+ if (bfqd->bfq_raising_max_time > 0)
1749 -+ return bfqd->bfq_raising_max_time;
1750 -+
1751 -+ dur = bfqd->RT_prod;
1752 -+ do_div(dur, bfqd->peak_rate);
1753 -+
1754 -+ return dur;
1755 -+}
1756 -+
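In other words, unless an explicit maximum is configured, the weight-raising duration scales inversely with the measured peak rate: dur = (R * T) / peak_rate, with R and T taken from the reference-disk constants defined above. A simplified worked example, with units reduced to milliseconds, the fixed-point scaling dropped and an assumed device rate:

#include <stdio.h>

int main(void)
{
    unsigned long long R_ref = 17415;         /* reference rate (R_rot)   */
    unsigned long long T_ref = 5500;          /* reference time, msec     */
    unsigned long long peak_rate = 2 * R_ref; /* a disk twice as fast     */

    unsigned long long dur = R_ref * T_ref / peak_rate;

    printf("weight-raising duration ~ %llu msec\n", dur);   /* 2750 */
    return 0;
}
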
1757 -+static void bfq_add_rq_rb(struct request *rq)
1758 -+{
1759 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1760 -+ struct bfq_entity *entity = &bfqq->entity;
1761 -+ struct bfq_data *bfqd = bfqq->bfqd;
1762 -+ struct request *next_rq, *prev;
1763 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
1764 -+ int idle_for_long_time = 0;
1765 -+
1766 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1767 -+ bfqq->queued[rq_is_sync(rq)]++;
1768 -+ bfqd->queued++;
1769 -+
1770 -+ elv_rb_add(&bfqq->sort_list, rq);
1771 -+
1772 -+ /*
1773 -+ * Check if this request is a better next-serve candidate.
1774 -+ */
1775 -+ prev = bfqq->next_rq;
1776 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1777 -+ BUG_ON(next_rq == NULL);
1778 -+ bfqq->next_rq = next_rq;
1779 -+
1780 -+ /*
1781 -+ * Adjust priority tree position, if next_rq changes.
1782 -+ */
1783 -+ if (prev != bfqq->next_rq)
1784 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
1785 -+
1786 -+ if (!bfq_bfqq_busy(bfqq)) {
1787 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1788 -+ time_is_before_jiffies(bfqq->soft_rt_next_start);
1789 -+ idle_for_long_time = time_is_before_jiffies(
1790 -+ bfqq->budget_timeout +
1791 -+ bfqd->bfq_raising_min_idle_time);
1792 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
1793 -+ bfq_serv_to_charge(next_rq, bfqq));
1794 -+
1795 -+ if (!bfqd->low_latency)
1796 -+ goto add_bfqq_busy;
1797 -+
1798 -+ /*
1799 -+ * If the queue is not being boosted and has been idle
1800 -+ * for enough time, start a weight-raising period
1801 -+ */
1802 -+ if (old_raising_coeff == 1 &&
1803 -+ (idle_for_long_time || soft_rt)) {
1804 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1805 -+ if (idle_for_long_time)
1806 -+ bfqq->raising_cur_max_time =
1807 -+ bfq_wrais_duration(bfqd);
1808 -+ else
1809 -+ bfqq->raising_cur_max_time =
1810 -+ bfqd->bfq_raising_rt_max_time;
1811 -+ bfq_log_bfqq(bfqd, bfqq,
1812 -+ "wrais starting at %llu msec,"
1813 -+ "rais_max_time %u",
1814 -+ bfqq->last_rais_start_finish,
1815 -+ jiffies_to_msecs(bfqq->
1816 -+ raising_cur_max_time));
1817 -+ } else if (old_raising_coeff > 1) {
1818 -+ if (idle_for_long_time)
1819 -+ bfqq->raising_cur_max_time =
1820 -+ bfq_wrais_duration(bfqd);
1821 -+ else if (bfqq->raising_cur_max_time ==
1822 -+ bfqd->bfq_raising_rt_max_time &&
1823 -+ !soft_rt) {
1824 -+ bfqq->raising_coeff = 1;
1825 -+ bfq_log_bfqq(bfqd, bfqq,
1826 -+ "wrais ending at %llu msec,"
1827 -+ "rais_max_time %u",
1828 -+ bfqq->last_rais_start_finish,
1829 -+ jiffies_to_msecs(bfqq->
1830 -+ raising_cur_max_time));
1831 -+ } else if ((bfqq->last_rais_start_finish +
1832 -+ bfqq->raising_cur_max_time <
1833 -+ jiffies + bfqd->bfq_raising_rt_max_time) &&
1834 -+ soft_rt) {
1835 -+ /*
1836 -+ *
1837 -+ * The remaining weight-raising time is lower
1838 -+ * than bfqd->bfq_raising_rt_max_time, which
1839 -+ * means that the application is enjoying
1840 -+ * weight raising either because deemed soft rt
1841 -+ * in the near past, or because deemed
1842 -+ * interactive long ago. In both cases,
1843 -+ * resetting now the current remaining weight-
1844 -+ * raising time for the application to the
1845 -+ * weight-raising duration for soft rt
1846 -+ * applications would not cause any latency
1847 -+ * increase for the application (as the new
1848 -+ * duration would be higher than the remaining
1849 -+ * time).
1850 -+ *
1851 -+ * In addition, the application is now meeting
1852 -+ * the requirements for being deemed soft rt.
1853 -+ * In the end we can correctly and safely
1854 -+ * (re)charge the weight-raising duration for
1855 -+ * the application with the weight-raising
1856 -+ * duration for soft rt applications.
1857 -+ *
1858 -+ * In particular, doing this recharge now, i.e.,
1859 -+ * before the weight-raising period for the
1860 -+ * application finishes, reduces the probability
1861 -+ * of the following negative scenario:
1862 -+ * 1) the weight of a soft rt application is
1863 -+ * raised at startup (as for any newly
1864 -+ * created application),
1865 -+ * 2) since the application is not interactive,
1866 -+ * at a certain time weight-raising is
1867 -+ * stopped for the application,
1868 -+ * 3) at that time the application happens to
1869 -+ * still have pending requests, and hence
1870 -+ * is destined to not have a chance to be
1871 -+ * deemed soft rt before these requests are
1872 -+ * completed (see the comments to the
1873 -+ * function bfq_bfqq_softrt_next_start()
1874 -+ * for details on soft rt detection),
1875 -+ * 4) these pending requests experience a high
1876 -+ * latency because the application is not
1877 -+ * weight-raised while they are pending.
1878 -+ */
1879 -+ bfqq->last_rais_start_finish = jiffies;
1880 -+ bfqq->raising_cur_max_time =
1881 -+ bfqd->bfq_raising_rt_max_time;
1882 -+ }
1883 -+ }
1884 -+ if (old_raising_coeff != bfqq->raising_coeff)
1885 -+ entity->ioprio_changed = 1;
1886 -+add_bfqq_busy:
1887 -+ bfqq->last_idle_bklogged = jiffies;
1888 -+ bfqq->service_from_backlogged = 0;
1889 -+ bfq_clear_bfqq_softrt_update(bfqq);
1890 -+ bfq_add_bfqq_busy(bfqd, bfqq);
1891 -+ } else {
1892 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
1893 -+ !rq_is_sync(rq) &&
1894 -+ bfqq->last_rais_start_finish +
1895 -+ time_is_before_jiffies(
1896 -+ bfqd->bfq_raising_min_inter_arr_async)) {
1897 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1898 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1899 -+
1900 -+ bfqd->raised_busy_queues++;
1901 -+ entity->ioprio_changed = 1;
1902 -+ bfq_log_bfqq(bfqd, bfqq,
1903 -+ "non-idle wrais starting at %llu msec,"
1904 -+ "rais_max_time %u",
1905 -+ bfqq->last_rais_start_finish,
1906 -+ jiffies_to_msecs(bfqq->
1907 -+ raising_cur_max_time));
1908 -+ }
1909 -+ bfq_updated_next_req(bfqd, bfqq);
1910 -+ }
1911 -+
1912 -+ if (bfqd->low_latency &&
1913 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1914 -+ idle_for_long_time))
1915 -+ bfqq->last_rais_start_finish = jiffies;
1916 -+}
1917 -+
1918 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1919 -+{
1920 -+ elv_rb_del(&bfqq->sort_list, rq);
1921 -+ bfqq->queued[rq_is_sync(rq)]--;
1922 -+ bfqq->bfqd->queued--;
1923 -+ bfq_add_rq_rb(rq);
1924 -+}
1925 -+
1926 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1927 -+ struct bio *bio)
1928 -+{
1929 -+ struct task_struct *tsk = current;
1930 -+ struct bfq_io_cq *bic;
1931 -+ struct bfq_queue *bfqq;
1932 -+
1933 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1934 -+ if (bic == NULL)
1935 -+ return NULL;
1936 -+
1937 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1938 -+ if (bfqq != NULL)
1939 -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
1940 -+
1941 -+ return NULL;
1942 -+}
1943 -+
1944 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
1945 -+{
1946 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1947 -+
1948 -+ bfqd->rq_in_driver++;
1949 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1950 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1951 -+ (long long unsigned)bfqd->last_position);
1952 -+}
1953 -+
1954 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1955 -+{
1956 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1957 -+
1958 -+ WARN_ON(bfqd->rq_in_driver == 0);
1959 -+ bfqd->rq_in_driver--;
1960 -+}
1961 -+
1962 -+static void bfq_remove_request(struct request *rq)
1963 -+{
1964 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1965 -+ struct bfq_data *bfqd = bfqq->bfqd;
1966 -+
1967 -+ if (bfqq->next_rq == rq) {
1968 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1969 -+ bfq_updated_next_req(bfqd, bfqq);
1970 -+ }
1971 -+
1972 -+ list_del_init(&rq->queuelist);
1973 -+ bfq_del_rq_rb(rq);
1974 -+
1975 -+ if (rq->cmd_flags & REQ_META) {
1976 -+ WARN_ON(bfqq->meta_pending == 0);
1977 -+ bfqq->meta_pending--;
1978 -+ }
1979 -+}
1980 -+
1981 -+static int bfq_merge(struct request_queue *q, struct request **req,
1982 -+ struct bio *bio)
1983 -+{
1984 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1985 -+ struct request *__rq;
1986 -+
1987 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
1988 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1989 -+ *req = __rq;
1990 -+ return ELEVATOR_FRONT_MERGE;
1991 -+ }
1992 -+
1993 -+ return ELEVATOR_NO_MERGE;
1994 -+}
1995 -+
1996 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
1997 -+ int type)
1998 -+{
1999 -+ if (type == ELEVATOR_FRONT_MERGE) {
2000 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
2001 -+
2002 -+ bfq_reposition_rq_rb(bfqq, req);
2003 -+ }
2004 -+}
2005 -+
2006 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
2007 -+ struct request *next)
2008 -+{
2009 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2010 -+
2011 -+ /*
2012 -+ * Reposition in fifo if next is older than rq.
2013 -+ */
2014 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2015 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
2016 -+ list_move(&rq->queuelist, &next->queuelist);
2017 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
2018 -+ }
2019 -+
2020 -+ if (bfqq->next_rq == next)
2021 -+ bfqq->next_rq = rq;
2022 -+
2023 -+ bfq_remove_request(next);
2024 -+}
2025 -+
2026 -+/* Must be called with bfqq != NULL */
2027 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
2028 -+{
2029 -+ BUG_ON(bfqq == NULL);
2030 -+ if (bfq_bfqq_busy(bfqq))
2031 -+ bfqq->bfqd->raised_busy_queues--;
2032 -+ bfqq->raising_coeff = 1;
2033 -+ bfqq->raising_cur_max_time = 0;
2034 -+ /* Trigger a weight change on the next activation of the queue */
2035 -+ bfqq->entity.ioprio_changed = 1;
2036 -+}
2037 -+
2038 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
2039 -+ struct bfq_group *bfqg)
2040 -+{
2041 -+ int i, j;
2042 -+
2043 -+ for (i = 0; i < 2; i++)
2044 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
2045 -+ if (bfqg->async_bfqq[i][j] != NULL)
2046 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
2047 -+ if (bfqg->async_idle_bfqq != NULL)
2048 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
2049 -+}
2050 -+
2051 -+static void bfq_end_raising(struct bfq_data *bfqd)
2052 -+{
2053 -+ struct bfq_queue *bfqq;
2054 -+
2055 -+ spin_lock_irq(bfqd->queue->queue_lock);
2056 -+
2057 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
2058 -+ bfq_bfqq_end_raising(bfqq);
2059 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2060 -+ bfq_bfqq_end_raising(bfqq);
2061 -+ bfq_end_raising_async(bfqd);
2062 -+
2063 -+ spin_unlock_irq(bfqd->queue->queue_lock);
2064 -+}
2065 -+
2066 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
2067 -+ struct bio *bio)
2068 -+{
2069 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2070 -+ struct bfq_io_cq *bic;
2071 -+ struct bfq_queue *bfqq;
2072 -+
2073 -+ /*
2074 -+ * Disallow merge of a sync bio into an async request.
2075 -+ */
2076 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
2077 -+ return 0;
2078 -+
2079 -+ /*
2080 -+ * Lookup the bfqq that this bio will be queued with. Allow
2081 -+ * merge only if rq is queued there.
2082 -+ * Queue lock is held here.
2083 -+ */
2084 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
2085 -+ if (bic == NULL)
2086 -+ return 0;
2087 -+
2088 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2089 -+ return bfqq == RQ_BFQQ(rq);
2090 -+}
2091 -+
2092 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2093 -+ struct bfq_queue *bfqq)
2094 -+{
2095 -+ if (bfqq != NULL) {
2096 -+ bfq_mark_bfqq_must_alloc(bfqq);
2097 -+ bfq_mark_bfqq_budget_new(bfqq);
2098 -+ bfq_clear_bfqq_fifo_expire(bfqq);
2099 -+
2100 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
2101 -+
2102 -+ bfq_log_bfqq(bfqd, bfqq,
2103 -+ "set_in_service_queue, cur-budget = %lu",
2104 -+ bfqq->entity.budget);
2105 -+ }
2106 -+
2107 -+ bfqd->in_service_queue = bfqq;
2108 -+}
2109 -+
2110 -+/*
2111 -+ * Get and set a new queue for service.
2112 -+ */
2113 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
2114 -+ struct bfq_queue *bfqq)
2115 -+{
2116 -+ if (!bfqq)
2117 -+ bfqq = bfq_get_next_queue(bfqd);
2118 -+ else
2119 -+ bfq_get_next_queue_forced(bfqd, bfqq);
2120 -+
2121 -+ __bfq_set_in_service_queue(bfqd, bfqq);
2122 -+ return bfqq;
2123 -+}
2124 -+
2125 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2126 -+ struct request *rq)
2127 -+{
2128 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
2129 -+ return blk_rq_pos(rq) - bfqd->last_position;
2130 -+ else
2131 -+ return bfqd->last_position - blk_rq_pos(rq);
2132 -+}
2133 -+
2134 -+/*
2135 -+ * Return true if rq is close enough to bfqd->last_position, i.e.,
2136 -+ * within the seek-distance threshold (BFQQ_SEEK_THR) used to detect
2137 -+ * close cooperators.
2138 -+ */
2139 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2140 -+{
2141 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2142 -+}
2143 -+
2144 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2145 -+{
2146 -+ struct rb_root *root = &bfqd->rq_pos_tree;
2147 -+ struct rb_node *parent, *node;
2148 -+ struct bfq_queue *__bfqq;
2149 -+ sector_t sector = bfqd->last_position;
2150 -+
2151 -+ if (RB_EMPTY_ROOT(root))
2152 -+ return NULL;
2153 -+
2154 -+ /*
2155 -+ * First, if we find a request starting at the end of the last
2156 -+ * request, choose it.
2157 -+ */
2158 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2159 -+ if (__bfqq != NULL)
2160 -+ return __bfqq;
2161 -+
2162 -+ /*
2163 -+ * If the exact sector wasn't found, the parent of the NULL leaf
2164 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
2165 -+ * position).
2166 -+ */
2167 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2168 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2169 -+ return __bfqq;
2170 -+
2171 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
2172 -+ node = rb_next(&__bfqq->pos_node);
2173 -+ else
2174 -+ node = rb_prev(&__bfqq->pos_node);
2175 -+ if (node == NULL)
2176 -+ return NULL;
2177 -+
2178 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2179 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2180 -+ return __bfqq;
2181 -+
2182 -+ return NULL;
2183 -+}
2184 -+
2185 -+/*
2186 -+ * bfqd - obvious
2187 -+ * cur_bfqq - passed in so that we don't decide that the current queue
2188 -+ * is closely cooperating with itself.
2189 -+ *
2190 -+ * We are assuming that cur_bfqq has dispatched at least one request,
2191 -+ * and that bfqd->last_position reflects a position on the disk associated
2192 -+ * with the I/O issued by cur_bfqq.
2193 -+ */
2194 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2195 -+ struct bfq_queue *cur_bfqq)
2196 -+{
2197 -+ struct bfq_queue *bfqq;
2198 -+
2199 -+ if (bfq_class_idle(cur_bfqq))
2200 -+ return NULL;
2201 -+ if (!bfq_bfqq_sync(cur_bfqq))
2202 -+ return NULL;
2203 -+ if (BFQQ_SEEKY(cur_bfqq))
2204 -+ return NULL;
2205 -+
2206 -+ /* If device has only one backlogged bfq_queue, don't search. */
2207 -+ if (bfqd->busy_queues == 1)
2208 -+ return NULL;
2209 -+
2210 -+ /*
2211 -+ * We should notice if some of the queues are cooperating, e.g.
2212 -+ * working closely on the same area of the disk. In that case,
2213 -+ * we can group them together and don't waste time idling.
2214 -+ */
2215 -+ bfqq = bfqq_close(bfqd);
2216 -+ if (bfqq == NULL || bfqq == cur_bfqq)
2217 -+ return NULL;
2218 -+
2219 -+ /*
2220 -+ * Do not merge queues from different bfq_groups.
2221 -+ */
2222 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2223 -+ return NULL;
2224 -+
2225 -+ /*
2226 -+ * It only makes sense to merge sync queues.
2227 -+ */
2228 -+ if (!bfq_bfqq_sync(bfqq))
2229 -+ return NULL;
2230 -+ if (BFQQ_SEEKY(bfqq))
2231 -+ return NULL;
2232 -+
2233 -+ /*
2234 -+ * Do not merge queues of different priority classes.
2235 -+ */
2236 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2237 -+ return NULL;
2238 -+
2239 -+ return bfqq;
2240 -+}
2241 -+
2242 -+/*
2243 -+ * If enough samples have been computed, return the current max budget
2244 -+ * stored in bfqd, which is dynamically updated according to the
2245 -+ * estimated disk peak rate; otherwise return the default max budget
2246 -+ */
2247 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2248 -+{
2249 -+ if (bfqd->budgets_assigned < 194)
2250 -+ return bfq_default_max_budget;
2251 -+ else
2252 -+ return bfqd->bfq_max_budget;
2253 -+}
2254 -+
2255 -+/*
2256 -+ * Return min budget, which is a fraction of the current or default
2257 -+ * max budget (trying with 1/32)
2258 -+ */
2259 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2260 -+{
2261 -+ if (bfqd->budgets_assigned < 194)
2262 -+ return bfq_default_max_budget / 32;
2263 -+ else
2264 -+ return bfqd->bfq_max_budget / 32;
2265 -+}
2266 -+
2267 -+/*
2268 -+ * Decides whether idling should be done for given device and
2269 -+ * given in-service queue.
2270 -+ */
2271 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2272 -+ struct bfq_queue *in_service_bfqq)
2273 -+{
2274 -+ if (in_service_bfqq == NULL)
2275 -+ return false;
2276 -+ /*
2277 -+ * If device is SSD it has no seek penalty, disable idling; but
2278 -+ * do so only if:
2279 -+ * - device does not support queuing, otherwise we still have
2280 -+ * a problem with sync vs async workloads;
2281 -+ * - the queue is not weight-raised, to preserve guarantees.
2282 -+ */
2283 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2284 -+ in_service_bfqq->raising_coeff == 1);
2285 -+}
2286 -+
2287 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2288 -+{
2289 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2290 -+ struct bfq_io_cq *bic;
2291 -+ unsigned long sl;
2292 -+
2293 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2294 -+
2295 -+ /* Tasks have exited, don't wait. */
2296 -+ bic = bfqd->in_service_bic;
2297 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2298 -+ return;
2299 -+
2300 -+ bfq_mark_bfqq_wait_request(bfqq);
2301 -+
2302 -+ /*
2303 -+ * We don't want to idle for seeks, but we do want to allow
2304 -+ * fair distribution of slice time for a process doing back-to-back
2305 -+ * seeks. So allow a little bit of time for it to submit a new rq.
2306 -+ *
2307 -+ * To prevent processes with (partly) seeky workloads from
2308 -+ * being too ill-treated, grant them a small fraction of the
2309 -+ * assigned budget before reducing the waiting time to
2310 -+ * BFQ_MIN_TT. This happened to help reduce latency.
2311 -+ */
2312 -+ sl = bfqd->bfq_slice_idle;
2313 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2314 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2315 -+ bfqq->raising_coeff == 1)
2316 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2317 -+ else if (bfqq->raising_coeff > 1)
2318 -+ sl = sl * 3;
2319 -+ bfqd->last_idling_start = ktime_get();
2320 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2321 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
2322 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2323 -+}
2324 -+
2325 -+/*
2326 -+ * Set the maximum time for the in-service queue to consume its
2327 -+ * budget. This prevents seeky processes from lowering the disk
2328 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
2329 -+ */
2330 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2331 -+{
2332 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2333 -+ unsigned int timeout_coeff;
2334 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2335 -+ timeout_coeff = 1;
2336 -+ else
2337 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2338 -+
2339 -+ bfqd->last_budget_start = ktime_get();
2340 -+
2341 -+ bfq_clear_bfqq_budget_new(bfqq);
2342 -+ bfqq->budget_timeout = jiffies +
2343 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2344 -+
2345 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2346 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2347 -+ timeout_coeff));
2348 -+}
2349 -+
2350 -+/*
2351 -+ * Move request from internal lists to the request queue dispatch list.
2352 -+ */
2353 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2354 -+{
2355 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2356 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2357 -+
2358 -+ bfq_remove_request(rq);
2359 -+ bfqq->dispatched++;
2360 -+ elv_dispatch_sort(q, rq);
2361 -+
2362 -+ if (bfq_bfqq_sync(bfqq))
2363 -+ bfqd->sync_flight++;
2364 -+}
2365 -+
2366 -+/*
2367 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
2368 -+ */
2369 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2370 -+{
2371 -+ struct request *rq = NULL;
2372 -+
2373 -+ if (bfq_bfqq_fifo_expire(bfqq))
2374 -+ return NULL;
2375 -+
2376 -+ bfq_mark_bfqq_fifo_expire(bfqq);
2377 -+
2378 -+ if (list_empty(&bfqq->fifo))
2379 -+ return NULL;
2380 -+
2381 -+ rq = rq_entry_fifo(bfqq->fifo.next);
2382 -+
2383 -+ if (time_before(jiffies, rq_fifo_time(rq)))
2384 -+ return NULL;
2385 -+
2386 -+ return rq;
2387 -+}
2388 -+
2389 -+/*
2390 -+ * Must be called with the queue_lock held.
2391 -+ */
2392 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
2393 -+{
2394 -+ int process_refs, io_refs;
2395 -+
2396 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2397 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2398 -+ BUG_ON(process_refs < 0);
2399 -+ return process_refs;
2400 -+}
2401 -+
2402 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2403 -+{
2404 -+ int process_refs, new_process_refs;
2405 -+ struct bfq_queue *__bfqq;
2406 -+
2407 -+ /*
2408 -+ * If there are no process references on the new_bfqq, then it is
2409 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2410 -+ * may have dropped their last reference (not just their last process
2411 -+ * reference).
2412 -+ */
2413 -+ if (!bfqq_process_refs(new_bfqq))
2414 -+ return;
2415 -+
2416 -+ /* Avoid a circular list and skip interim queue merges. */
2417 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
2418 -+ if (__bfqq == bfqq)
2419 -+ return;
2420 -+ new_bfqq = __bfqq;
2421 -+ }
2422 -+
2423 -+ process_refs = bfqq_process_refs(bfqq);
2424 -+ new_process_refs = bfqq_process_refs(new_bfqq);
2425 -+ /*
2426 -+ * If the process for the bfqq has gone away, there is no
2427 -+ * sense in merging the queues.
2428 -+ */
2429 -+ if (process_refs == 0 || new_process_refs == 0)
2430 -+ return;
2431 -+
2432 -+ /*
2433 -+ * Merge in the direction of the lesser amount of work.
2434 -+ */
2435 -+ if (new_process_refs >= process_refs) {
2436 -+ bfqq->new_bfqq = new_bfqq;
2437 -+ atomic_add(process_refs, &new_bfqq->ref);
2438 -+ } else {
2439 -+ new_bfqq->new_bfqq = bfqq;
2440 -+ atomic_add(new_process_refs, &bfqq->ref);
2441 -+ }
2442 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2443 -+ new_bfqq->pid);
2444 -+}
2445 -+
2446 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2447 -+{
2448 -+ struct bfq_entity *entity = &bfqq->entity;
2449 -+ return entity->budget - entity->service;
2450 -+}
2451 -+
2452 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2453 -+{
2454 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2455 -+
2456 -+ __bfq_bfqd_reset_in_service(bfqd);
2457 -+
2458 -+ /*
2459 -+ * If this bfqq is shared between multiple processes, check
2460 -+ * to make sure that those processes are still issuing I/Os
2461 -+ * within the mean seek distance. If not, it may be time to
2462 -+ * break the queues apart again.
2463 -+ */
2464 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2465 -+ bfq_mark_bfqq_split_coop(bfqq);
2466 -+
2467 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2468 -+ /*
2469 -+ * overloading budget_timeout field to store when
2470 -+ * the queue remains with no backlog, used by
2471 -+ * the weight-raising mechanism
2472 -+ */
2473 -+ bfqq->budget_timeout = jiffies;
2474 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2475 -+ } else {
2476 -+ bfq_activate_bfqq(bfqd, bfqq);
2477 -+ /*
2478 -+ * Resort priority tree of potential close cooperators.
2479 -+ */
2480 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
2481 -+ }
2482 -+}
2483 -+
2484 -+/**
2485 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2486 -+ * @bfqd: device data.
2487 -+ * @bfqq: queue to update.
2488 -+ * @reason: reason for expiration.
2489 -+ *
2490 -+ * Handle the feedback on @bfqq budget. See the body for detailed
2491 -+ * comments.
2492 -+ */
2493 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2494 -+ struct bfq_queue *bfqq,
2495 -+ enum bfqq_expiration reason)
2496 -+{
2497 -+ struct request *next_rq;
2498 -+ unsigned long budget, min_budget;
2499 -+
2500 -+ budget = bfqq->max_budget;
2501 -+ min_budget = bfq_min_budget(bfqd);
2502 -+
2503 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2504 -+
2505 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2506 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2507 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2508 -+ budget, bfq_min_budget(bfqd));
2509 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2510 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2511 -+
2512 -+ if (bfq_bfqq_sync(bfqq)) {
2513 -+ switch (reason) {
2514 -+ /*
2515 -+ * Caveat: in all the following cases we trade latency
2516 -+ * for throughput.
2517 -+ */
2518 -+ case BFQ_BFQQ_TOO_IDLE:
2519 -+ /*
2520 -+ * This is the only case where we may reduce
2521 -+ * the budget: if there are no requests of the
2522 -+ * process still waiting for completion, then
2523 -+ * we assume (tentatively) that the timer has
2524 -+ * expired because the batch of requests of
2525 -+ * the process could have been served with a
2526 -+ * smaller budget. Hence, betting that the
2527 -+ * process will behave in the same way when it
2528 -+ * becomes backlogged again, we reduce its
2529 -+ * next budget. As long as we guess right,
2530 -+ * this budget cut reduces the latency
2531 -+ * experienced by the process.
2532 -+ *
2533 -+ * However, if there are still outstanding
2534 -+ * requests, then the process may have not yet
2535 -+ * issued its next request just because it is
2536 -+ * still waiting for the completion of some of
2537 -+ * the still outstanding ones. So in this
2538 -+ * subcase we do not reduce its budget, on the
2539 -+ * contrary we increase it to possibly boost
2540 -+ * the throughput, as discussed in the
2541 -+ * comments to the BUDGET_TIMEOUT case.
2542 -+ */
2543 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
2544 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2545 -+ else {
2546 -+ if (budget > 5 * min_budget)
2547 -+ budget -= 4 * min_budget;
2548 -+ else
2549 -+ budget = min_budget;
2550 -+ }
2551 -+ break;
2552 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
2553 -+ /*
2554 -+ * We double the budget here because: 1) it
2555 -+ * gives the chance to boost the throughput if
2556 -+ * this is not a seeky process (which may have
2557 -+ * bumped into this timeout because of, e.g.,
2558 -+ * ZBR), 2) together with charge_full_budget
2559 -+ * it helps give seeky processes higher
2560 -+ * timestamps, and hence be served less
2561 -+ * frequently.
2562 -+ */
2563 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2564 -+ break;
2565 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2566 -+ /*
2567 -+ * The process still has backlog, and did not
2568 -+ * let either the budget timeout or the disk
2569 -+ * idling timeout expire. Hence it is not
2570 -+ * seeky, has a short thinktime and may be
2571 -+ * happy with a higher budget too. So
2572 -+ * definitely increase the budget of this good
2573 -+ * candidate to boost the disk throughput.
2574 -+ */
2575 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
2576 -+ break;
2577 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
2578 -+ /*
2579 -+ * Leave the budget unchanged.
2580 -+ */
2581 -+ default:
2582 -+ return;
2583 -+ }
2584 -+ } else /* async queue */
2585 -+ /* async queues always get the maximum possible budget
2586 -+ * (their ability to dispatch is limited by
2587 -+ * @bfqd->bfq_max_budget_async_rq).
2588 -+ */
2589 -+ budget = bfqd->bfq_max_budget;
2590 -+
2591 -+ bfqq->max_budget = budget;
2592 -+
2593 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2594 -+ bfqq->max_budget > bfqd->bfq_max_budget)
2595 -+ bfqq->max_budget = bfqd->bfq_max_budget;
2596 -+
2597 -+ /*
2598 -+ * Make sure that we have enough budget for the next request.
2599 -+ * Since the finish time of the bfqq must be kept in sync with
2600 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
2601 -+ * update.
2602 -+ */
2603 -+ next_rq = bfqq->next_rq;
2604 -+ if (next_rq != NULL)
2605 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2606 -+ bfq_serv_to_charge(next_rq, bfqq));
2607 -+ else
2608 -+ bfqq->entity.budget = bfqq->max_budget;
2609 -+
2610 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2611 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2612 -+ bfqq->entity.budget);
2613 -+}
2614 -+
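The feedback above can be summarised as: cut the budget only when the queue expired for being too idle with nothing in flight; double it on budget timeout, or on the too-idle case with requests still outstanding; quadruple it on budget exhaustion; leave it unchanged when there are simply no more requests; always cap at the device-wide maximum (async queues always get the maximum). A compact, standalone restatement for sync queues; the function name, the enum and the sample numbers below are made up for illustration:

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED };

static unsigned long next_budget(unsigned long budget, unsigned long max_budget,
                                 unsigned long min_budget, int dispatched,
                                 enum reason r)
{
    switch (r) {
    case TOO_IDLE:
        if (dispatched > 0)      /* requests still outstanding: grow it  */
            return budget * 2 > max_budget ? max_budget : budget * 2;
        return budget > 5 * min_budget ? budget - 4 * min_budget
                                       : min_budget;
    case BUDGET_TIMEOUT:         /* double, capped at the maximum        */
        return budget * 2 > max_budget ? max_budget : budget * 2;
    case BUDGET_EXHAUSTED:       /* quadruple, capped at the maximum     */
        return budget * 4 > max_budget ? max_budget : budget * 4;
    }
    return budget;               /* no more requests: leave unchanged    */
}

int main(void)
{
    /* e.g., a well-behaved queue that exhausted a 2048-sector budget */
    printf("%lu\n", next_budget(2048, 16384, 512, 0, BUDGET_EXHAUSTED));
    return 0;
}
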
2615 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2616 -+{
2617 -+ unsigned long max_budget;
2618 -+
2619 -+ /*
2620 -+ * The max_budget calculated when autotuning is equal to the
2621 -+ * number of sectors transferred in timeout_sync at the
2622 -+ * estimated peak rate.
2623 -+ */
2624 -+ max_budget = (unsigned long)(peak_rate * 1000 *
2625 -+ timeout >> BFQ_RATE_SHIFT);
2626 -+
2627 -+ return max_budget;
2628 -+}
2629 -+
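So, with autotuning, the maximum budget is simply the number of sectors the device is expected to transfer in one sync budget timeout at the estimated peak rate. A worked example with the fixed-point scaling dropped; the rate and timeout values are illustrative assumptions:

#include <stdio.h>

int main(void)
{
    double peak_rate = 0.25;   /* sectors per usec, roughly 128 MB/s     */
    double timeout   = 125.0;  /* msec, a plausible sync budget timeout  */

    double max_budget = peak_rate * 1000.0 * timeout;

    printf("max_budget ~ %.0f sectors (~%.0f MiB)\n",
           max_budget, max_budget * 512.0 / (1024 * 1024));
    return 0;
}
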
2630 -+/*
2631 -+ * In addition to updating the peak rate, checks whether the process
2632 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
2633 -+ * to the budget timeout, to reduce the amount of service provided to
2634 -+ * seeky processes, and hence reduce their chances to lower the
2635 -+ * throughput. See the code for more details.
2636 -+ */
2637 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2638 -+ int compensate, enum bfqq_expiration reason)
2639 -+{
2640 -+ u64 bw, usecs, expected, timeout;
2641 -+ ktime_t delta;
2642 -+ int update = 0;
2643 -+
2644 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2645 -+ return 0;
2646 -+
2647 -+ if (compensate)
2648 -+ delta = bfqd->last_idling_start;
2649 -+ else
2650 -+ delta = ktime_get();
2651 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
2652 -+ usecs = ktime_to_us(delta);
2653 -+
2654 -+ /* Don't trust short/unrealistic values. */
2655 -+ if (usecs < 100 || usecs >= LONG_MAX)
2656 -+ return 0;
2657 -+
2658 -+ /*
2659 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
2660 -+ * value to store the peak rate, in sectors per usec in fixed
2661 -+ * point math. We do so to have enough precision in the estimate
2662 -+ * and to avoid overflows.
2663 -+ */
2664 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2665 -+ do_div(bw, (unsigned long)usecs);
2666 -+
2667 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2668 -+
2669 -+ /*
2670 -+ * Use only long (> 20ms) intervals to filter out spikes for
2671 -+ * the peak rate estimation.
2672 -+ */
2673 -+ if (usecs > 20000) {
2674 -+ if (bw > bfqd->peak_rate ||
2675 -+ (!BFQQ_SEEKY(bfqq) &&
2676 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2677 -+ bfq_log(bfqd, "measured bw =%llu", bw);
2678 -+ /*
2679 -+ * To smooth oscillations use a low-pass filter with
2680 -+ * alpha=7/8, i.e.,
2681 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
2682 -+ */
2683 -+ do_div(bw, 8);
2684 -+ if (bw == 0)
2685 -+ return 0;
2686 -+ bfqd->peak_rate *= 7;
2687 -+ do_div(bfqd->peak_rate, 8);
2688 -+ bfqd->peak_rate += bw;
2689 -+ update = 1;
2690 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2691 -+ }
2692 -+
2693 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2694 -+
2695 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2696 -+ bfqd->peak_rate_samples++;
2697 -+
2698 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2699 -+ update && bfqd->bfq_user_max_budget == 0) {
2700 -+ bfqd->bfq_max_budget =
2701 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2702 -+ bfq_log(bfqd, "new max_budget=%lu",
2703 -+ bfqd->bfq_max_budget);
2704 -+ }
2705 -+ }
2706 -+
2707 -+ /*
2708 -+ * If the process has been served for a too short time
2709 -+ * interval to let its possible sequential accesses prevail on
2710 -+ * the initial seek time needed to move the disk head on the
2711 -+ * first sector it requested, then give the process a chance
2712 -+ * and for the moment return false.
2713 -+ */
2714 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2715 -+ return 0;
2716 -+
2717 -+ /*
2718 -+ * A process is considered ``slow'' (i.e., seeky, so that we
2719 -+ * cannot treat it fairly in the service domain, as it would
2720 -+ * slow down the other processes too much) if, when a slice
2721 -+ * ends for whatever reason, it has received service at a
2722 -+ * rate that would not be high enough to complete the budget
2723 -+ * before the budget timeout expiration.
2724 -+ */
2725 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2726 -+
2727 -+ /*
2728 -+ * Caveat: processes doing IO in the slower disk zones will
2729 -+ * tend to be slow(er) even if not seeky. And the estimated
2730 -+ * peak rate will actually be an average over the disk
2731 -+ * surface. Hence, to not be too harsh with unlucky processes,
2732 -+ * we keep a budget/3 margin of safety before declaring a
2733 -+ * process slow.
2734 -+ */
2735 -+ return expected > (4 * bfqq->entity.budget) / 3;
2736 -+}
2737 -+
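The peak-rate update performed above is a low-pass filter, new_rate = (7/8) * old_rate + (1/8) * bw, applied only when the observation interval is long enough to give a meaningful sample. A standalone sketch of just that update, mirroring the do_div sequence, with made-up scaled values:

#include <stdio.h>

int main(void)
{
    unsigned long long peak_rate = 1600;  /* current estimate (scaled)    */
    unsigned long long bw = 800;          /* bandwidth of the last slice  */

    bw /= 8;                 /* 1/8 of the new sample                     */
    peak_rate *= 7;          /* 7/8 of the previous estimate...           */
    peak_rate /= 8;
    peak_rate += bw;         /* ...plus the damped sample                 */

    printf("new peak_rate = %llu\n", peak_rate);   /* 1400 + 100 = 1500 */
    return 0;
}
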
2738 -+/*
2739 -+ * To be deemed as soft real-time, an application must meet two requirements.
2740 -+ * The first is that the application must not require an average bandwidth
2741 -+ * higher than the approximate bandwidth required to playback or record a
2742 -+ * compressed high-definition video.
2743 -+ * The next function is invoked on the completion of the last request of a
2744 -+ * batch, to compute the next-start time instant, soft_rt_next_start, such
2745 -+ * that, if the next request of the application does not arrive before
2746 -+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
2747 -+ *
2748 -+ * The second requirement is that the request pattern of the application is
2749 -+ * isochronous, i.e., that, after issuing a request or a batch of requests, the
2750 -+ * application stops for a while, then issues a new batch, and so on. For this
2751 -+ * reason the next function is invoked to compute soft_rt_next_start only for
2752 -+ * applications that meet this requirement, whereas soft_rt_next_start is set
2753 -+ * to infinity for applications that do not.
2754 -+ *
2755 -+ * Unfortunately, even a greedy application may happen to behave in an
2756 -+ * isochronous way if several processes are competing for the CPUs. In fact,
2757 -+ * in this scenario the application stops issuing requests while the CPUs are
2758 -+ * busy serving other processes, then restarts, then stops again for a while,
2759 -+ * and so on. In addition, if the disk achieves a low enough throughput with
2760 -+ * the request pattern issued by the application, then the above bandwidth
2761 -+ * requirement may happen to be met too. To prevent such a greedy application
2762 -+ * from being deemed soft real-time, a further rule is used in the computation
2763 -+ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
2764 -+ * time plus the maximum time for which the arrival of a request is waited
2765 -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
2766 -+ * filters out greedy applications, as the latter issue instead their next
2767 -+ * request as soon as possible after the last one has been completed (in
2768 -+ * contrast, when a batch of requests is completed, a soft real-time
2769 -+ * application spends some time processing data).
2770 -+ *
2771 -+ * Actually, the last filter may easily generate false positives if: only
2772 -+ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
2773 -+ * both the following two cases occur:
2774 -+ * 1) HZ is so low that the duration of a jiffie is comparable to or higher
2775 -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2776 -+ * HZ=100.
2777 -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2778 -+ * for a while, then suddenly 'jump' by several units to recover the lost
2779 -+ * increments. This seems to happen, e.g., inside virtual machines.
2780 -+ * To address this issue, we do not use as a reference time interval just
2781 -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2782 -+ * particular we add the minimum number of jiffies for which the filter seems
2783 -+ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
2784 -+ */
2785 -+static inline u64 bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2786 -+ struct bfq_queue *bfqq)
2787 -+{
2788 -+ return max(bfqq->last_idle_bklogged +
2789 -+ HZ * bfqq->service_from_backlogged /
2790 -+ bfqd->bfq_raising_max_softrt_rate,
2791 -+ (u64)jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2792 -+}
2793 -+
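Concretely, soft_rt_next_start is the later of (a) the instant at which the just-completed backlog would have been served at exactly the allowed soft real-time rate, and (b) a short guard interval after now. A worked example; HZ, the sector count, the rate limit and slice_idle are illustrative assumptions:

#include <stdio.h>

static unsigned long long max_ull(unsigned long long a, unsigned long long b)
{
    return a > b ? a : b;
}

int main(void)
{
    unsigned long long hz = 250;                    /* jiffies per second (HZ) */
    unsigned long long jiffies = 100000;            /* "now", in jiffies       */
    unsigned long long last_idle_bklogged = 99990;  /* start of the last batch */
    unsigned long long service_from_backlogged = 2048;   /* sectors served    */
    unsigned long long max_softrt_rate = 7000;      /* sectors/sec threshold   */
    unsigned long long slice_idle = 2;              /* jiffies                 */

    unsigned long long next_start =
        max_ull(last_idle_bklogged +
                hz * service_from_backlogged / max_softrt_rate,
                jiffies + slice_idle + 4);

    printf("soft_rt_next_start = %llu (now = %llu)\n", next_start, jiffies);
    return 0;
}
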
2794 -+/**
2795 -+ * bfq_bfqq_expire - expire a queue.
2796 -+ * @bfqd: device owning the queue.
2797 -+ * @bfqq: the queue to expire.
2798 -+ * @compensate: if true, compensate for the time spent idling.
2799 -+ * @reason: the reason causing the expiration.
2800 -+ *
2801 -+ *
2802 -+ * If the process associated to the queue is slow (i.e., seeky), or in
2803 -+ * case of budget timeout, or, finally, if it is async, we
2804 -+ * artificially charge it an entire budget (independently of the
2805 -+ * actual service it received). As a consequence, the queue will get
2806 -+ * higher timestamps than the correct ones upon reactivation, and
2807 -+ * hence it will be rescheduled as if it had received more service
2808 -+ * than what it actually received. In the end, this class of processes
2809 -+ * will receive less service in proportion to how slowly they consume
2810 -+ * their budgets (and hence how seriously they tend to lower the
2811 -+ * throughput).
2812 -+ *
2813 -+ * In contrast, when a queue expires because it has been idling for
2814 -+ * too much or because it exhausted its budget, we do not touch the
2815 -+ * amount of service it has received. Hence when the queue will be
2816 -+ * reactivated and its timestamps updated, the latter will be in sync
2817 -+ * with the actual service received by the queue until expiration.
2818 -+ *
2819 -+ * Charging a full budget to the first type of queues and the exact
2820 -+ * service to the others has the effect of using the WF2Q+ policy to
2821 -+ * schedule the former on a timeslice basis, without violating the
2822 -+ * service domain guarantees of the latter.
2823 -+ */
2824 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
2825 -+ struct bfq_queue *bfqq,
2826 -+ int compensate,
2827 -+ enum bfqq_expiration reason)
2828 -+{
2829 -+ int slow;
2830 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2831 -+
2832 -+ /* Update disk peak rate for autotuning and check whether the
2833 -+ * process is slow (see bfq_update_peak_rate).
2834 -+ */
2835 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2836 -+
2837 -+ /*
2838 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
2839 -+ * and async queues, to favor sequential sync workloads.
2840 -+ *
2841 -+ * Processes doing IO in the slower disk zones will tend to be
2842 -+ * slow(er) even if not seeky. Hence, since the estimated peak
2843 -+ * rate is actually an average over the disk surface, these
2844 -+ * processes may timeout just for bad luck. To avoid punishing
2845 -+ * them we do not charge a full budget to a process that
2846 -+ * succeeded in consuming at least 2/3 of its budget.
2847 -+ */
2848 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2849 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2850 -+ bfq_bfqq_charge_full_budget(bfqq);
2851 -+
2852 -+ bfqq->service_from_backlogged += bfqq->entity.service;
2853 -+
2854 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2855 -+ bfqq->last_rais_start_finish = jiffies;
2856 -+
2857 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2858 -+ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
2859 -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) {
2860 -+ /*
2861 -+ * If we get here, then the request pattern is
2862 -+ * isochronous (see the comments to the function
2863 -+ * bfq_bfqq_softrt_next_start()). However, if the
2864 -+ * queue still has in-flight requests, then it is
2865 -+ * better to postpone the computation of next_start
2866 -+ * to the next request completion. In fact, if we
2867 -+ * computed it now, then the application might pass
2868 -+ * the greedy-application filter improperly, because
2869 -+ * the arrival of its next request may happen to be
2870 -+ * higher than (jiffies + bfqq->bfqd->bfq_slice_idle)
2871 -+ * not because the application is truly soft real-
2872 -+ * time, but just because the application is currently
2873 -+ * waiting for the completion of some request before
2874 -+ * issuing, as quickly as possible, its next request.
2875 -+ */
2876 -+ if (bfqq->dispatched > 0) {
2877 -+ bfqq->soft_rt_next_start = -1;
2878 -+ bfq_mark_bfqq_softrt_update(bfqq);
2879 -+ } else
2880 -+ bfqq->soft_rt_next_start =
2881 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
2882 -+ } else
2883 -+ bfqq->soft_rt_next_start = -1; /* infinity */
2884 -+ }
2885 -+
2886 -+ bfq_log_bfqq(bfqd, bfqq,
2887 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2888 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2889 -+
2890 -+ /* Increase, decrease or leave budget unchanged according to reason */
2891 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2892 -+ __bfq_bfqq_expire(bfqd, bfqq);
2893 -+}
2894 -+
2895 -+/*
2896 -+ * Budget timeout is not implemented through a dedicated timer, but
2897 -+ * just checked on request arrivals and completions, as well as on
2898 -+ * idle timer expirations.
2899 -+ */
2900 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2901 -+{
2902 -+ if (bfq_bfqq_budget_new(bfqq))
2903 -+ return 0;
2904 -+
2905 -+ if (time_before(jiffies, bfqq->budget_timeout))
2906 -+ return 0;
2907 -+
2908 -+ return 1;
2909 -+}
2910 -+
2911 -+/*
2912 -+ * If we expire a queue that is waiting for the arrival of a new
2913 -+ * request, we may prevent the fictitious timestamp backshifting that
2914 -+ * allows the guarantees of the queue to be preserved (see [1] for
2915 -+ * this tricky aspect). Hence we return true only if this condition
2916 -+ * does not hold, or if the queue is slow enough to deserve only to be
2917 -+ * kicked off for preserving a high throughput.
2918 -+*/
2919 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2920 -+{
2921 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
2922 -+ "may_budget_timeout: wr %d left %d timeout %d",
2923 -+ bfq_bfqq_wait_request(bfqq),
2924 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2925 -+ bfq_bfqq_budget_timeout(bfqq));
2926 -+
2927 -+ return (!bfq_bfqq_wait_request(bfqq) ||
2928 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2929 -+ &&
2930 -+ bfq_bfqq_budget_timeout(bfqq);
2931 -+}
2932 -+
2933 -+/*
2934 -+ * For weight-raised queues issuing sync requests, idling is always performed,
2935 -+ * as this is instrumental in guaranteeing a high fraction of the throughput
2936 -+ * to these queues, and hence in guaranteeing a lower latency for their
2937 -+ * requests. See [1] for details.
2938 -+ *
2939 -+ * For non-weight-raised queues, idling is instead disabled if the device is
2940 -+ * NCQ-enabled and non-rotational, as this boosts the throughput on such
2941 -+ * devices.
2942 -+ */
2943 -+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2944 -+{
2945 -+ struct bfq_data *bfqd = bfqq->bfqd;
2946 -+
2947 -+ return bfq_bfqq_sync(bfqq) && (
2948 -+ bfqq->raising_coeff > 1 ||
2949 -+ (bfq_bfqq_idle_window(bfqq) &&
2950 -+ !(bfqd->hw_tag &&
2951 -+ (blk_queue_nonrot(bfqd->queue) ||
2952 -+ /*
2953 -+ * If there are weight-raised busy queues, then do not idle
2954 -+ * the disk for a sync non-weight-raised queue, and hence
2955 -+ * expire the queue immediately if empty. Combined with the
2956 -+ * timestamping rules of BFQ (see [1] for details), this
2957 -+ * causes sync non-weight-raised queues to get a lower
2958 -+ * fraction of the disk throughput, and hence reduces the rate
2959 -+ * at which the processes associated to these queues ask for
2960 -+ * requests from the request pool.
2961 -+ *
2962 -+ * This is beneficial for weight-raised processes, when the
2963 -+ * system operates in request-pool saturation conditions
2964 -+ * (e.g., in the presence of write hogs). In fact, if
2965 -+ * non-weight-raised processes ask for requests at a lower
2966 -+ * rate, then weight-raised processes have a higher
2967 -+ * probability to get a request from the pool immediately
2968 -+ * (or at least soon) when they need one. Hence they have a
2969 -+ * higher probability to actually get a fraction of the disk
2970 -+ * throughput proportional to their high weight. This is
2971 -+ * especially true with NCQ-enabled drives, which enqueue
2972 -+ * several requests in advance and further reorder
2973 -+ * internally-queued requests.
2974 -+ *
2975 -+ * Mistreating non-weight-raised queues in the above-described
2976 -+ * way, when there are busy weight-raised queues, seems to
2977 -+ * mitigate starvation problems in the presence of heavy write
2978 -+ * workloads and NCQ, and hence to guarantee a higher
2979 -+ * application and system responsiveness in these hostile
2980 -+ * scenarios.
2981 -+ */
2982 -+ bfqd->raised_busy_queues > 0)
2983 -+ )
2984 -+ )
2985 -+ );
2986 -+}
2987 -+
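Stripped of the comments, the condition above reduces to: idle only for sync queues that are either weight-raised, or that have the idle window set and are not on an NCQ-capable device that is non-rotational or that currently has weight-raised busy queues. An equivalent standalone restatement, for illustration only; the function and parameter names below are made up:

#include <stdbool.h>
#include <stdio.h>

static bool must_not_expire(bool sync, int raising_coeff, bool idle_window,
                            bool hw_tag, bool nonrot, int raised_busy_queues)
{
    return sync &&
           (raising_coeff > 1 ||
            (idle_window &&
             !(hw_tag && (nonrot || raised_busy_queues > 0))));
}

int main(void)
{
    /* e.g., sync, not raised, idle window set, NCQ SSD -> do not idle */
    printf("%d\n", must_not_expire(true, 1, true, true, true, 0));
    return 0;
}
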
2988 -+/*
2989 -+ * If the in-service queue is empty, but it is sync and either of the following
2990 -+ * conditions holds, then: 1) the queue must remain in service and cannot be
2991 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
2992 -+ * of a new request for the queue. The conditions are:
2993 -+ * - the device is rotational and not performing NCQ, and the queue has its
2994 -+ * idle window set (in this case, waiting for a new request for the queue
2995 -+ * is likely to boost the disk throughput);
2996 -+ * - the queue is weight-raised (waiting for the request is necessary to
2997 -+ * provide the queue with fairness and latency guarantees, see [1] for
2998 -+ * details).
2999 -+ */
3000 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
3001 -+{
3002 -+ struct bfq_data *bfqd = bfqq->bfqd;
3003 -+
3004 -+ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
3005 -+ bfq_bfqq_must_not_expire(bfqq) &&
3006 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
3007 -+}
3008 -+
3009 -+/*
3010 -+ * Select a queue for service. If we have a current queue in service,
3011 -+ * check whether to continue servicing it, or retrieve and set a new one.
3012 -+ */
3013 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3014 -+{
3015 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3016 -+ struct request *next_rq;
3017 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3018 -+
3019 -+ bfqq = bfqd->in_service_queue;
3020 -+ if (bfqq == NULL)
3021 -+ goto new_queue;
3022 -+
3023 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3024 -+
3025 -+ /*
3026 -+ * If another queue has a request waiting within our mean seek
3027 -+ * distance, let it run. The expire code will check for close
3028 -+ * cooperators and put the close queue at the front of the
3029 -+ * service tree. If possible, merge the expiring queue with the
3030 -+ * new bfqq.
3031 -+ */
3032 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
3033 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
3034 -+ bfq_setup_merge(bfqq, new_bfqq);
3035 -+
3036 -+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
3037 -+ !timer_pending(&bfqd->idle_slice_timer) &&
3038 -+ !bfq_bfqq_must_idle(bfqq))
3039 -+ goto expire;
3040 -+
3041 -+ next_rq = bfqq->next_rq;
3042 -+ /*
3043 -+ * If bfqq has requests queued and it has enough budget left to
3044 -+ * serve them, keep the queue, otherwise expire it.
3045 -+ */
3046 -+ if (next_rq != NULL) {
3047 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
3048 -+ bfq_bfqq_budget_left(bfqq)) {
3049 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
3050 -+ goto expire;
3051 -+ } else {
3052 -+ /*
3053 -+ * The idle timer may be pending because we may not
3054 -+ * disable disk idling even when a new request arrives
3055 -+ */
3056 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
3057 -+ /*
3058 -+ * If we get here: 1) at least a new request
3059 -+ * has arrived but we have not disabled the
3060 -+ * timer because the request was too small,
3061 -+ * 2) then the block layer has unplugged the
3062 -+ * device, causing the dispatch to be invoked.
3063 -+ *
3064 -+ * Since the device is unplugged, now the
3065 -+ * requests are probably large enough to
3066 -+ * provide a reasonable throughput.
3067 -+ * So we disable idling.
3068 -+ */
3069 -+ bfq_clear_bfqq_wait_request(bfqq);
3070 -+ del_timer(&bfqd->idle_slice_timer);
3071 -+ }
3072 -+ if (new_bfqq == NULL)
3073 -+ goto keep_queue;
3074 -+ else
3075 -+ goto expire;
3076 -+ }
3077 -+ }
3078 -+
3079 -+ /*
3080 -+ * No requests pending. If the in-service queue has no cooperator and
3081 -+ * still has requests in flight (possibly waiting for a completion)
3082 -+ * or is idling for a new request, then keep it.
3083 -+ */
3084 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
3085 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
3086 -+ bfqq = NULL;
3087 -+ goto keep_queue;
3088 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
3089 -+ /*
3090 -+ * Expiring the queue because there is a close cooperator,
3091 -+ * cancel timer.
3092 -+ */
3093 -+ bfq_clear_bfqq_wait_request(bfqq);
3094 -+ del_timer(&bfqd->idle_slice_timer);
3095 -+ }
3096 -+
3097 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
3098 -+expire:
3099 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
3100 -+new_queue:
3101 -+ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
3102 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
3103 -+ bfqq != NULL ? bfqq->pid : 0);
3104 -+keep_queue:
3105 -+ return bfqq;
3106 -+}
3107 -+
3108 -+static void bfq_update_raising_data(struct bfq_data *bfqd,
3109 -+ struct bfq_queue *bfqq)
3110 -+{
3111 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
3112 -+ struct bfq_entity *entity = &bfqq->entity;
3113 -+
3114 -+ bfq_log_bfqq(bfqd, bfqq,
3115 -+ "raising period dur %u/%u msec, "
3116 -+ "old raising coeff %u, w %d(%d)",
3117 -+ jiffies_to_msecs(jiffies -
3118 -+ bfqq->last_rais_start_finish),
3119 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
3120 -+ bfqq->raising_coeff,
3121 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
3122 -+
3123 -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
3124 -+ entity->orig_weight * bfqq->raising_coeff);
3125 -+ if (entity->ioprio_changed)
3126 -+ bfq_log_bfqq(bfqd, bfqq,
3127 -+ "WARN: pending prio change");
3128 -+ /*
3129 -+ * If too much time has elapsed from the beginning
3130 -+ * of this weight-raising, stop it.
3131 -+ */
3132 -+ if (jiffies - bfqq->last_rais_start_finish >
3133 -+ bfqq->raising_cur_max_time) {
3134 -+ bfqq->last_rais_start_finish = jiffies;
3135 -+ bfq_log_bfqq(bfqd, bfqq,
3136 -+ "wrais ending at %llu msec,"
3137 -+ "rais_max_time %u",
3138 -+ bfqq->last_rais_start_finish,
3139 -+ jiffies_to_msecs(bfqq->
3140 -+ raising_cur_max_time));
3141 -+ bfq_bfqq_end_raising(bfqq);
3142 -+ __bfq_entity_update_weight_prio(
3143 -+ bfq_entity_service_tree(entity),
3144 -+ entity);
3145 -+ }
3146 -+ }
3147 -+}
3148 -+
3149 -+/*
3150 -+ * Dispatch one request from bfqq, moving it to the request queue
3151 -+ * dispatch list.
3152 -+ */
3153 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
3154 -+ struct bfq_queue *bfqq)
3155 -+{
3156 -+ int dispatched = 0;
3157 -+ struct request *rq;
3158 -+ unsigned long service_to_charge;
3159 -+
3160 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
3161 -+
3162 -+ /* Follow expired path, else get first next available. */
3163 -+ rq = bfq_check_fifo(bfqq);
3164 -+ if (rq == NULL)
3165 -+ rq = bfqq->next_rq;
3166 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
3167 -+
3168 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
3169 -+ /*
3170 -+ * This may happen if the next rq is chosen
3171 -+ * in fifo order instead of sector order.
3172 -+ * The budget is properly dimensioned
3173 -+ * to be always sufficient to serve the next request
3174 -+ * only if it is chosen in sector order. The reason is
3175 -+ * that it would be quite inefficient and of little use
3176 -+ * to always make sure that the budget is large enough
3177 -+ * to serve even the possible next rq in fifo order.
3178 -+ * In fact, requests are seldom served in fifo order.
3179 -+ *
3180 -+ * Expire the queue for budget exhaustion, and
3181 -+ * make sure that the next act_budget is enough
3182 -+ * to serve the next request, even if it comes
3183 -+ * from the fifo expired path.
3184 -+ */
3185 -+ bfqq->next_rq = rq;
3186 -+ /*
3187 -+ * Since this dispatch is failed, make sure that
3188 -+ * a new one will be performed
3189 -+ */
3190 -+ if (!bfqd->rq_in_driver)
3191 -+ bfq_schedule_dispatch(bfqd);
3192 -+ goto expire;
3193 -+ }
3194 -+
3195 -+ /* Finally, insert request into driver dispatch list. */
3196 -+ bfq_bfqq_served(bfqq, service_to_charge);
3197 -+ bfq_dispatch_insert(bfqd->queue, rq);
3198 -+
3199 -+ bfq_update_raising_data(bfqd, bfqq);
3200 -+
3201 -+ bfq_log_bfqq(bfqd, bfqq,
3202 -+ "dispatched %u sec req (%llu), budg left %lu",
3203 -+ blk_rq_sectors(rq),
3204 -+ (long long unsigned)blk_rq_pos(rq),
3205 -+ bfq_bfqq_budget_left(bfqq));
3206 -+
3207 -+ dispatched++;
3208 -+
3209 -+ if (bfqd->in_service_bic == NULL) {
3210 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3211 -+ bfqd->in_service_bic = RQ_BIC(rq);
3212 -+ }
3213 -+
3214 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3215 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3216 -+ bfq_class_idle(bfqq)))
3217 -+ goto expire;
3218 -+
3219 -+ return dispatched;
3220 -+
3221 -+expire:
3222 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3223 -+ return dispatched;
3224 -+}
3225 -+
3226 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3227 -+{
3228 -+ int dispatched = 0;
3229 -+
3230 -+ while (bfqq->next_rq != NULL) {
3231 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3232 -+ dispatched++;
3233 -+ }
3234 -+
3235 -+ BUG_ON(!list_empty(&bfqq->fifo));
3236 -+ return dispatched;
3237 -+}
3238 -+
3239 -+/*
3240 -+ * Drain our current requests. Used for barriers and when switching
3241 -+ * io schedulers on-the-fly.
3242 -+ */
3243 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
3244 -+{
3245 -+ struct bfq_queue *bfqq, *n;
3246 -+ struct bfq_service_tree *st;
3247 -+ int dispatched = 0;
3248 -+
3249 -+ bfqq = bfqd->in_service_queue;
3250 -+ if (bfqq != NULL)
3251 -+ __bfq_bfqq_expire(bfqd, bfqq);
3252 -+
3253 -+ /*
3254 -+ * Loop through classes, and be careful to leave the scheduler
3255 -+ * in a consistent state, as feedback mechanisms and vtime
3256 -+ * updates cannot be disabled during the process.
3257 -+ */
3258 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3259 -+ st = bfq_entity_service_tree(&bfqq->entity);
3260 -+
3261 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3262 -+ bfqq->max_budget = bfq_max_budget(bfqd);
3263 -+
3264 -+ bfq_forget_idle(st);
3265 -+ }
3266 -+
3267 -+ BUG_ON(bfqd->busy_queues != 0);
3268 -+
3269 -+ return dispatched;
3270 -+}
3271 -+
3272 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
3273 -+{
3274 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3275 -+ struct bfq_queue *bfqq;
3276 -+ int max_dispatch;
3277 -+
3278 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3279 -+ if (bfqd->busy_queues == 0)
3280 -+ return 0;
3281 -+
3282 -+ if (unlikely(force))
3283 -+ return bfq_forced_dispatch(bfqd);
3284 -+
3285 -+ bfqq = bfq_select_queue(bfqd);
3286 -+ if (bfqq == NULL)
3287 -+ return 0;
3288 -+
3289 -+ max_dispatch = bfqd->bfq_quantum;
3290 -+ if (bfq_class_idle(bfqq))
3291 -+ max_dispatch = 1;
3292 -+
3293 -+ if (!bfq_bfqq_sync(bfqq))
3294 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
3295 -+
3296 -+ if (bfqq->dispatched >= max_dispatch) {
3297 -+ if (bfqd->busy_queues > 1)
3298 -+ return 0;
3299 -+ if (bfqq->dispatched >= 4 * max_dispatch)
3300 -+ return 0;
3301 -+ }
3302 -+
3303 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3304 -+ return 0;
3305 -+
3306 -+ bfq_clear_bfqq_wait_request(bfqq);
3307 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3308 -+
3309 -+ if (!bfq_dispatch_request(bfqd, bfqq))
3310 -+ return 0;
3311 -+
3312 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
3313 -+ bfqq->pid, max_dispatch);
3314 -+
3315 -+ return 1;
3316 -+}
3317 -+
3318 -+/*
3319 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
3320 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
3321 -+ *
3322 -+ * Queue lock must be held here.
3323 -+ */
3324 -+static void bfq_put_queue(struct bfq_queue *bfqq)
3325 -+{
3326 -+ struct bfq_data *bfqd = bfqq->bfqd;
3327 -+
3328 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3329 -+
3330 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3331 -+ atomic_read(&bfqq->ref));
3332 -+ if (!atomic_dec_and_test(&bfqq->ref))
3333 -+ return;
3334 -+
3335 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3336 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3337 -+ BUG_ON(bfqq->entity.tree != NULL);
3338 -+ BUG_ON(bfq_bfqq_busy(bfqq));
3339 -+ BUG_ON(bfqd->in_service_queue == bfqq);
3340 -+
3341 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3342 -+
3343 -+ kmem_cache_free(bfq_pool, bfqq);
3344 -+}
3345 -+
3346 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
3347 -+{
3348 -+ struct bfq_queue *__bfqq, *next;
3349 -+
3350 -+ /*
3351 -+ * If this queue was scheduled to merge with another queue, be
3352 -+ * sure to drop the reference taken on that queue (and others in
3353 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3354 -+ */
3355 -+ __bfqq = bfqq->new_bfqq;
3356 -+ while (__bfqq) {
3357 -+ if (__bfqq == bfqq) {
3358 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
3359 -+ break;
3360 -+ }
3361 -+ next = __bfqq->new_bfqq;
3362 -+ bfq_put_queue(__bfqq);
3363 -+ __bfqq = next;
3364 -+ }
3365 -+}
3366 -+
3367 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3368 -+{
3369 -+ if (bfqq == bfqd->in_service_queue) {
3370 -+ __bfq_bfqq_expire(bfqd, bfqq);
3371 -+ bfq_schedule_dispatch(bfqd);
3372 -+ }
3373 -+
3374 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3375 -+ atomic_read(&bfqq->ref));
3376 -+
3377 -+ bfq_put_cooperator(bfqq);
3378 -+
3379 -+ bfq_put_queue(bfqq);
3380 -+}
3381 -+
3382 -+static void bfq_init_icq(struct io_cq *icq)
3383 -+{
3384 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3385 -+
3386 -+ bic->ttime.last_end_request = jiffies;
3387 -+}
3388 -+
3389 -+static void bfq_exit_icq(struct io_cq *icq)
3390 -+{
3391 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3392 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
3393 -+
3394 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
3395 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3396 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
3397 -+ }
3398 -+
3399 -+ if (bic->bfqq[BLK_RW_SYNC]) {
3400 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3401 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
3402 -+ }
3403 -+}
3404 -+
3405 -+/*
3406 -+ * Update the entity prio values; note that the new values will not
3407 -+ * be used until the next (re)activation.
3408 -+ */
3409 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3410 -+{
3411 -+ struct task_struct *tsk = current;
3412 -+ int ioprio_class;
3413 -+
3414 -+ if (!bfq_bfqq_prio_changed(bfqq))
3415 -+ return;
3416 -+
3417 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3418 -+ switch (ioprio_class) {
3419 -+ default:
3420 -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3421 -+ "bfq: bad prio %x\n", ioprio_class);
3422 -+ case IOPRIO_CLASS_NONE:
3423 -+ /*
3424 -+ * No prio set, inherit CPU scheduling settings.
3425 -+ */
3426 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3427 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3428 -+ break;
3429 -+ case IOPRIO_CLASS_RT:
3430 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3431 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3432 -+ break;
3433 -+ case IOPRIO_CLASS_BE:
3434 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3435 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3436 -+ break;
3437 -+ case IOPRIO_CLASS_IDLE:
3438 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3439 -+ bfqq->entity.new_ioprio = 7;
3440 -+ bfq_clear_bfqq_idle_window(bfqq);
3441 -+ break;
3442 -+ }
3443 -+
3444 -+ bfqq->entity.ioprio_changed = 1;
3445 -+
3446 -+ /*
3447 -+ * Keep track of original prio settings in case we have to temporarily
3448 -+ * elevate the priority of this queue.
3449 -+ */
3450 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3451 -+ bfq_clear_bfqq_prio_changed(bfqq);
3452 -+}
3453 -+
3454 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3455 -+{
3456 -+ struct bfq_data *bfqd;
3457 -+ struct bfq_queue *bfqq, *new_bfqq;
3458 -+ struct bfq_group *bfqg;
3459 -+ unsigned long uninitialized_var(flags);
3460 -+ int ioprio = bic->icq.ioc->ioprio;
3461 -+
3462 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3463 -+ &flags);
3464 -+ /*
3465 -+ * This condition may trigger on a newly created bic, be sure to drop
3466 -+ * the lock before returning.
3467 -+ */
3468 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3469 -+ goto out;
3470 -+
3471 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
3472 -+ if (bfqq != NULL) {
3473 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3474 -+ sched_data);
3475 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3476 -+ GFP_ATOMIC);
3477 -+ if (new_bfqq != NULL) {
3478 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3479 -+ bfq_log_bfqq(bfqd, bfqq,
3480 -+ "changed_ioprio: bfqq %p %d",
3481 -+ bfqq, atomic_read(&bfqq->ref));
3482 -+ bfq_put_queue(bfqq);
3483 -+ }
3484 -+ }
3485 -+
3486 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
3487 -+ if (bfqq != NULL)
3488 -+ bfq_mark_bfqq_prio_changed(bfqq);
3489 -+
3490 -+ bic->ioprio = ioprio;
3491 -+
3492 -+out:
3493 -+ bfq_put_bfqd_unlock(bfqd, &flags);
3494 -+}
3495 -+
3496 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3497 -+ pid_t pid, int is_sync)
3498 -+{
3499 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3500 -+ INIT_LIST_HEAD(&bfqq->fifo);
3501 -+
3502 -+ atomic_set(&bfqq->ref, 0);
3503 -+ bfqq->bfqd = bfqd;
3504 -+
3505 -+ bfq_mark_bfqq_prio_changed(bfqq);
3506 -+
3507 -+ if (is_sync) {
3508 -+ if (!bfq_class_idle(bfqq))
3509 -+ bfq_mark_bfqq_idle_window(bfqq);
3510 -+ bfq_mark_bfqq_sync(bfqq);
3511 -+ }
3512 -+
3513 -+ /* Tentative initial value to trade off between thr and lat */
3514 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3515 -+ bfqq->pid = pid;
3516 -+
3517 -+ bfqq->raising_coeff = 1;
3518 -+ bfqq->last_rais_start_finish = 0;
3519 -+ bfqq->soft_rt_next_start = -1;
3520 -+}
3521 -+
3522 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3523 -+ struct bfq_group *bfqg,
3524 -+ int is_sync,
3525 -+ struct bfq_io_cq *bic,
3526 -+ gfp_t gfp_mask)
3527 -+{
3528 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3529 -+
3530 -+retry:
3531 -+ /* bic always exists here */
3532 -+ bfqq = bic_to_bfqq(bic, is_sync);
3533 -+
3534 -+ /*
3535 -+ * Always try a new alloc if we fall back to the OOM bfqq
3536 -+ * originally, since it should just be a temporary situation.
3537 -+ */
3538 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3539 -+ bfqq = NULL;
3540 -+ if (new_bfqq != NULL) {
3541 -+ bfqq = new_bfqq;
3542 -+ new_bfqq = NULL;
3543 -+ } else if (gfp_mask & __GFP_WAIT) {
3544 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3545 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3546 -+ gfp_mask | __GFP_ZERO,
3547 -+ bfqd->queue->node);
3548 -+ spin_lock_irq(bfqd->queue->queue_lock);
3549 -+ if (new_bfqq != NULL)
3550 -+ goto retry;
3551 -+ } else {
3552 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
3553 -+ gfp_mask | __GFP_ZERO,
3554 -+ bfqd->queue->node);
3555 -+ }
3556 -+
3557 -+ if (bfqq != NULL) {
3558 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3559 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
3560 -+ } else {
3561 -+ bfqq = &bfqd->oom_bfqq;
3562 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3563 -+ }
3564 -+
3565 -+ bfq_init_prio_data(bfqq, bic);
3566 -+ bfq_init_entity(&bfqq->entity, bfqg);
3567 -+ }
3568 -+
3569 -+ if (new_bfqq != NULL)
3570 -+ kmem_cache_free(bfq_pool, new_bfqq);
3571 -+
3572 -+ return bfqq;
3573 -+}
3574 -+
3575 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3576 -+ struct bfq_group *bfqg,
3577 -+ int ioprio_class, int ioprio)
3578 -+{
3579 -+ switch (ioprio_class) {
3580 -+ case IOPRIO_CLASS_RT:
3581 -+ return &bfqg->async_bfqq[0][ioprio];
3582 -+ case IOPRIO_CLASS_NONE:
3583 -+ ioprio = IOPRIO_NORM;
3584 -+ /* fall through */
3585 -+ case IOPRIO_CLASS_BE:
3586 -+ return &bfqg->async_bfqq[1][ioprio];
3587 -+ case IOPRIO_CLASS_IDLE:
3588 -+ return &bfqg->async_idle_bfqq;
3589 -+ default:
3590 -+ BUG();
3591 -+ }
3592 -+}
3593 -+
3594 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3595 -+ struct bfq_group *bfqg, int is_sync,
3596 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
3597 -+{
3598 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3599 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3600 -+ struct bfq_queue **async_bfqq = NULL;
3601 -+ struct bfq_queue *bfqq = NULL;
3602 -+
3603 -+ if (!is_sync) {
3604 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3605 -+ ioprio);
3606 -+ bfqq = *async_bfqq;
3607 -+ }
3608 -+
3609 -+ if (bfqq == NULL)
3610 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3611 -+
3612 -+ /*
3613 -+ * Pin the queue now that it's allocated, scheduler exit will prune it.
3614 -+ */
3615 -+ if (!is_sync && *async_bfqq == NULL) {
3616 -+ atomic_inc(&bfqq->ref);
3617 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3618 -+ bfqq, atomic_read(&bfqq->ref));
3619 -+ *async_bfqq = bfqq;
3620 -+ }
3621 -+
3622 -+ atomic_inc(&bfqq->ref);
3623 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3624 -+ atomic_read(&bfqq->ref));
3625 -+ return bfqq;
3626 -+}
3627 -+
3628 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3629 -+ struct bfq_io_cq *bic)
3630 -+{
3631 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3632 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3633 -+
3634 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3635 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3636 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3637 -+ bic->ttime.ttime_samples;
3638 -+}
3639 -+
3640 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
3641 -+ struct bfq_queue *bfqq,
3642 -+ struct request *rq)
3643 -+{
3644 -+ sector_t sdist;
3645 -+ u64 total;
3646 -+
3647 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
3648 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3649 -+ else
3650 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3651 -+
3652 -+ /*
3653 -+ * Don't allow the seek distance to get too large from the
3654 -+ * odd fragment, pagein, etc.
3655 -+ */
3656 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3657 -+ sdist = 0;
3658 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
3659 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3660 -+ else
3661 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3662 -+
3663 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3664 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3665 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
3666 -+ do_div(total, bfqq->seek_samples);
3667 -+ bfqq->seek_mean = (sector_t)total;
3668 -+
3669 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3670 -+ (u64)bfqq->seek_mean);
3671 -+}
3672 -+
3673 -+/*
3674 -+ * Disable idle window if the process thinks too long or seeks so much that
3675 -+ * it doesn't matter.
3676 -+ */
3677 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
3678 -+ struct bfq_queue *bfqq,
3679 -+ struct bfq_io_cq *bic)
3680 -+{
3681 -+ int enable_idle;
3682 -+
3683 -+ /* Don't idle for async or idle io prio class. */
3684 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3685 -+ return;
3686 -+
3687 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
3688 -+
3689 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3690 -+ bfqd->bfq_slice_idle == 0 ||
3691 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3692 -+ bfqq->raising_coeff == 1))
3693 -+ enable_idle = 0;
3694 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3695 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3696 -+ bfqq->raising_coeff == 1)
3697 -+ enable_idle = 0;
3698 -+ else
3699 -+ enable_idle = 1;
3700 -+ }
3701 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3702 -+ enable_idle);
3703 -+
3704 -+ if (enable_idle)
3705 -+ bfq_mark_bfqq_idle_window(bfqq);
3706 -+ else
3707 -+ bfq_clear_bfqq_idle_window(bfqq);
3708 -+}
3709 -+
3710 -+/*
3711 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
3712 -+ * something we should do about it.
3713 -+ */
3714 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3715 -+ struct request *rq)
3716 -+{
3717 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
3718 -+
3719 -+ if (rq->cmd_flags & REQ_META)
3720 -+ bfqq->meta_pending++;
3721 -+
3722 -+ bfq_update_io_thinktime(bfqd, bic);
3723 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
3724 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3725 -+ !BFQQ_SEEKY(bfqq))
3726 -+ bfq_update_idle_window(bfqd, bfqq, bic);
3727 -+
3728 -+ bfq_log_bfqq(bfqd, bfqq,
3729 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3730 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3731 -+ (long long unsigned)bfqq->seek_mean);
3732 -+
3733 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3734 -+
3735 -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3736 -+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3737 -+ blk_rq_sectors(rq) < 32;
3738 -+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3739 -+
3740 -+ /*
3741 -+ * There is just this request queued: if the request
3742 -+ * is small and the queue is not to be expired, then
3743 -+ * just exit.
3744 -+ *
3745 -+ * In this way, if the disk is being idled to wait for
3746 -+ * a new request from the in-service queue, we avoid
3747 -+ * unplugging the device and committing the disk to serve
3748 -+ * just a small request. On the contrary, we wait for
3749 -+ * the block layer to decide when to unplug the device:
3750 -+ * hopefully, new requests will be merged to this one
3751 -+ * quickly, then the device will be unplugged and
3752 -+ * larger requests will be dispatched.
3753 -+ */
3754 -+ if (small_req && !budget_timeout)
3755 -+ return;
3756 -+
3757 -+ /*
3758 -+ * A large enough request arrived, or the queue is to
3759 -+ * be expired: in both cases disk idling is to be
3760 -+ * stopped, so clear wait_request flag and reset
3761 -+ * timer.
3762 -+ */
3763 -+ bfq_clear_bfqq_wait_request(bfqq);
3764 -+ del_timer(&bfqd->idle_slice_timer);
3765 -+
3766 -+ /*
3767 -+ * The queue is not empty, because a new request just
3768 -+ * arrived. Hence we can safely expire the queue, in
3769 -+ * case of budget timeout, without risking that the
3770 -+ * timestamps of the queue are not updated correctly.
3771 -+ * See [1] for more details.
3772 -+ */
3773 -+ if (budget_timeout)
3774 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3775 -+
3776 -+ /*
3777 -+ * Let the request rip immediately, or let a new queue be
3778 -+ * selected if bfqq has just been expired.
3779 -+ */
3780 -+ __blk_run_queue(bfqd->queue);
3781 -+ }
3782 -+}
3783 -+
3784 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
3785 -+{
3786 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3787 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3788 -+
3789 -+ assert_spin_locked(bfqd->queue->queue_lock);
3790 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3791 -+
3792 -+ bfq_add_rq_rb(rq);
3793 -+
3794 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3795 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
3796 -+
3797 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
3798 -+}
3799 -+
3800 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
3801 -+{
3802 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3803 -+ bfqd->rq_in_driver);
3804 -+
3805 -+ if (bfqd->hw_tag == 1)
3806 -+ return;
3807 -+
3808 -+ /*
3809 -+ * This sample is valid if the number of outstanding requests
3810 -+ * is large enough to allow a queueing behavior. Note that the
3811 -+ * sum is not exact, as it's not taking into account deactivated
3812 -+ * requests.
3813 -+ */
3814 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3815 -+ return;
3816 -+
3817 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3818 -+ return;
3819 -+
3820 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3821 -+ bfqd->max_rq_in_driver = 0;
3822 -+ bfqd->hw_tag_samples = 0;
3823 -+}
3824 -+
3825 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
3826 -+{
3827 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3828 -+ struct bfq_data *bfqd = bfqq->bfqd;
3829 -+ const int sync = rq_is_sync(rq);
3830 -+
3831 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3832 -+ blk_rq_sectors(rq), sync);
3833 -+
3834 -+ bfq_update_hw_tag(bfqd);
3835 -+
3836 -+ WARN_ON(!bfqd->rq_in_driver);
3837 -+ WARN_ON(!bfqq->dispatched);
3838 -+ bfqd->rq_in_driver--;
3839 -+ bfqq->dispatched--;
3840 -+
3841 -+ if (bfq_bfqq_sync(bfqq))
3842 -+ bfqd->sync_flight--;
3843 -+
3844 -+ if (sync)
3845 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3846 -+
3847 -+ /*
3848 -+ * The computation of softrt_next_start was scheduled for the next
3849 -+ * request completion: it is now time to compute it.
3850 -+ */
3851 -+ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
3852 -+ bfqq->soft_rt_next_start =
3853 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3854 -+
3855 -+ /*
3856 -+ * If this is the in-service queue, check if it needs to be expired,
3857 -+ * or if we want to idle in case it has no pending requests.
3858 -+ */
3859 -+ if (bfqd->in_service_queue == bfqq) {
3860 -+ if (bfq_bfqq_budget_new(bfqq))
3861 -+ bfq_set_budget_timeout(bfqd);
3862 -+
3863 -+ if (bfq_bfqq_must_idle(bfqq)) {
3864 -+ bfq_arm_slice_timer(bfqd);
3865 -+ goto out;
3866 -+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
3867 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3868 -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3869 -+ (bfqq->dispatched == 0 ||
3870 -+ !bfq_bfqq_must_not_expire(bfqq)))
3871 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
3872 -+ BFQ_BFQQ_NO_MORE_REQUESTS);
3873 -+ }
3874 -+
3875 -+ if (!bfqd->rq_in_driver)
3876 -+ bfq_schedule_dispatch(bfqd);
3877 -+
3878 -+out:
3879 -+ return;
3880 -+}
3881 -+
3882 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3883 -+{
3884 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3885 -+ bfq_clear_bfqq_must_alloc(bfqq);
3886 -+ return ELV_MQUEUE_MUST;
3887 -+ }
3888 -+
3889 -+ return ELV_MQUEUE_MAY;
3890 -+}
3891 -+
3892 -+static int bfq_may_queue(struct request_queue *q, int rw)
3893 -+{
3894 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3895 -+ struct task_struct *tsk = current;
3896 -+ struct bfq_io_cq *bic;
3897 -+ struct bfq_queue *bfqq;
3898 -+
3899 -+ /*
3900 -+ * Don't force setup of a queue from here, as a call to may_queue
3901 -+ * does not necessarily imply that a request actually will be queued.
3902 -+ * So just lookup a possibly existing queue, or return 'may queue'
3903 -+ * if that fails.
3904 -+ */
3905 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3906 -+ if (bic == NULL)
3907 -+ return ELV_MQUEUE_MAY;
3908 -+
3909 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3910 -+ if (bfqq != NULL) {
3911 -+ bfq_init_prio_data(bfqq, bic);
3912 -+
3913 -+ return __bfq_may_queue(bfqq);
3914 -+ }
3915 -+
3916 -+ return ELV_MQUEUE_MAY;
3917 -+}
3918 -+
3919 -+/*
3920 -+ * Queue lock held here.
3921 -+ */
3922 -+static void bfq_put_request(struct request *rq)
3923 -+{
3924 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3925 -+
3926 -+ if (bfqq != NULL) {
3927 -+ const int rw = rq_data_dir(rq);
3928 -+
3929 -+ BUG_ON(!bfqq->allocated[rw]);
3930 -+ bfqq->allocated[rw]--;
3931 -+
3932 -+ rq->elv.priv[0] = NULL;
3933 -+ rq->elv.priv[1] = NULL;
3934 -+
3935 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3936 -+ bfqq, atomic_read(&bfqq->ref));
3937 -+ bfq_put_queue(bfqq);
3938 -+ }
3939 -+}
3940 -+
3941 -+static struct bfq_queue *
3942 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3943 -+ struct bfq_queue *bfqq)
3944 -+{
3945 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3946 -+ (long unsigned)bfqq->new_bfqq->pid);
3947 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3948 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3949 -+ bfq_put_queue(bfqq);
3950 -+ return bic_to_bfqq(bic, 1);
3951 -+}
3952 -+
3953 -+/*
3954 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3955 -+ * was the last process referring to said bfqq.
3956 -+ */
3957 -+static struct bfq_queue *
3958 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3959 -+{
3960 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3961 -+ if (bfqq_process_refs(bfqq) == 1) {
3962 -+ bfqq->pid = current->pid;
3963 -+ bfq_clear_bfqq_coop(bfqq);
3964 -+ bfq_clear_bfqq_split_coop(bfqq);
3965 -+ return bfqq;
3966 -+ }
3967 -+
3968 -+ bic_set_bfqq(bic, NULL, 1);
3969 -+
3970 -+ bfq_put_cooperator(bfqq);
3971 -+
3972 -+ bfq_put_queue(bfqq);
3973 -+ return NULL;
3974 -+}
3975 -+
3976 -+/*
3977 -+ * Allocate bfq data structures associated with this request.
3978 -+ */
3979 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
3980 -+ struct bio *bio, gfp_t gfp_mask)
3981 -+{
3982 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3983 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3984 -+ const int rw = rq_data_dir(rq);
3985 -+ const int is_sync = rq_is_sync(rq);
3986 -+ struct bfq_queue *bfqq;
3987 -+ struct bfq_group *bfqg;
3988 -+ unsigned long flags;
3989 -+
3990 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
3991 -+
3992 -+ bfq_changed_ioprio(bic);
3993 -+
3994 -+ spin_lock_irqsave(q->queue_lock, flags);
3995 -+
3996 -+ if (bic == NULL)
3997 -+ goto queue_fail;
3998 -+
3999 -+ bfqg = bfq_bic_update_cgroup(bic);
4000 -+
4001 -+new_queue:
4002 -+ bfqq = bic_to_bfqq(bic, is_sync);
4003 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
4004 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
4005 -+ bic_set_bfqq(bic, bfqq, is_sync);
4006 -+ } else {
4007 -+ /*
4008 -+ * If the queue was seeky for too long, break it apart.
4009 -+ */
4010 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4011 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4012 -+ bfqq = bfq_split_bfqq(bic, bfqq);
4013 -+ if (!bfqq)
4014 -+ goto new_queue;
4015 -+ }
4016 -+
4017 -+ /*
4018 -+ * Check to see if this queue is scheduled to merge with
4019 -+ * another closely cooperating queue. The merging of queues
4020 -+ * happens here as it must be done in process context.
4021 -+ * The reference on new_bfqq was taken in merge_bfqqs.
4022 -+ */
4023 -+ if (bfqq->new_bfqq != NULL)
4024 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
4025 -+ }
4026 -+
4027 -+ bfqq->allocated[rw]++;
4028 -+ atomic_inc(&bfqq->ref);
4029 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
4030 -+ atomic_read(&bfqq->ref));
4031 -+
4032 -+ rq->elv.priv[0] = bic;
4033 -+ rq->elv.priv[1] = bfqq;
4034 -+
4035 -+ spin_unlock_irqrestore(q->queue_lock, flags);
4036 -+
4037 -+ return 0;
4038 -+
4039 -+queue_fail:
4040 -+ bfq_schedule_dispatch(bfqd);
4041 -+ spin_unlock_irqrestore(q->queue_lock, flags);
4042 -+
4043 -+ return 1;
4044 -+}
4045 -+
4046 -+static void bfq_kick_queue(struct work_struct *work)
4047 -+{
4048 -+ struct bfq_data *bfqd =
4049 -+ container_of(work, struct bfq_data, unplug_work);
4050 -+ struct request_queue *q = bfqd->queue;
4051 -+
4052 -+ spin_lock_irq(q->queue_lock);
4053 -+ __blk_run_queue(q);
4054 -+ spin_unlock_irq(q->queue_lock);
4055 -+}
4056 -+
4057 -+/*
4058 -+ * Handler of the expiration of the timer running if the in-service queue
4059 -+ * is idling inside its time slice.
4060 -+ */
4061 -+static void bfq_idle_slice_timer(unsigned long data)
4062 -+{
4063 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
4064 -+ struct bfq_queue *bfqq;
4065 -+ unsigned long flags;
4066 -+ enum bfqq_expiration reason;
4067 -+
4068 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
4069 -+
4070 -+ bfqq = bfqd->in_service_queue;
4071 -+ /*
4072 -+ * Theoretical race here: the in-service queue can be NULL or different
4073 -+ * from the queue that was idling if the timer handler spins on
4074 -+ * the queue_lock and a new request arrives for the current
4075 -+ * queue and there is a full dispatch cycle that changes the
4076 -+ * in-service queue. This can hardly happen, but in the worst case
4077 -+ * we just expire a queue too early.
4078 -+ */
4079 -+ if (bfqq != NULL) {
4080 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
4081 -+ if (bfq_bfqq_budget_timeout(bfqq))
4082 -+ /*
4083 -+ * Also here the queue can be safely expired
4084 -+ * for budget timeout without wasting
4085 -+ * guarantees
4086 -+ */
4087 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
4088 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4089 -+ /*
4090 -+ * The queue may not be empty upon timer expiration,
4091 -+ * because we may not disable the timer when the first
4092 -+ * request of the in-service queue arrives during
4093 -+ * disk idling
4094 -+ */
4095 -+ reason = BFQ_BFQQ_TOO_IDLE;
4096 -+ else
4097 -+ goto schedule_dispatch;
4098 -+
4099 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
4100 -+ }
4101 -+
4102 -+schedule_dispatch:
4103 -+ bfq_schedule_dispatch(bfqd);
4104 -+
4105 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
4106 -+}
4107 -+
4108 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
4109 -+{
4110 -+ del_timer_sync(&bfqd->idle_slice_timer);
4111 -+ cancel_work_sync(&bfqd->unplug_work);
4112 -+}
4113 -+
4114 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4115 -+ struct bfq_queue **bfqq_ptr)
4116 -+{
4117 -+ struct bfq_group *root_group = bfqd->root_group;
4118 -+ struct bfq_queue *bfqq = *bfqq_ptr;
4119 -+
4120 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4121 -+ if (bfqq != NULL) {
4122 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
4123 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4124 -+ bfqq, atomic_read(&bfqq->ref));
4125 -+ bfq_put_queue(bfqq);
4126 -+ *bfqq_ptr = NULL;
4127 -+ }
4128 -+}
4129 -+
4130 -+/*
4131 -+ * Release all the bfqg references to its async queues. If we are
4132 -+ * deallocating the group these queues may still contain requests, so
4133 -+ * we reparent them to the root cgroup (i.e., the only one that will
4134 -+ * exist for sure until all the requests on a device are gone).
4135 -+ */
4136 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4137 -+{
4138 -+ int i, j;
4139 -+
4140 -+ for (i = 0; i < 2; i++)
4141 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
4142 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4143 -+
4144 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4145 -+}
4146 -+
4147 -+static void bfq_exit_queue(struct elevator_queue *e)
4148 -+{
4149 -+ struct bfq_data *bfqd = e->elevator_data;
4150 -+ struct request_queue *q = bfqd->queue;
4151 -+ struct bfq_queue *bfqq, *n;
4152 -+
4153 -+ bfq_shutdown_timer_wq(bfqd);
4154 -+
4155 -+ spin_lock_irq(q->queue_lock);
4156 -+
4157 -+ BUG_ON(bfqd->in_service_queue != NULL);
4158 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4159 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
4160 -+
4161 -+ bfq_disconnect_groups(bfqd);
4162 -+ spin_unlock_irq(q->queue_lock);
4163 -+
4164 -+ bfq_shutdown_timer_wq(bfqd);
4165 -+
4166 -+ synchronize_rcu();
4167 -+
4168 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
4169 -+
4170 -+ bfq_free_root_group(bfqd);
4171 -+ kfree(bfqd);
4172 -+}
4173 -+
4174 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4175 -+{
4176 -+ struct bfq_group *bfqg;
4177 -+ struct bfq_data *bfqd;
4178 -+ struct elevator_queue *eq;
4179 -+
4180 -+ eq = elevator_alloc(q, e);
4181 -+ if (eq == NULL)
4182 -+ return -ENOMEM;
4183 -+
4184 -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4185 -+ if (bfqd == NULL) {
4186 -+ kobject_put(&eq->kobj);
4187 -+ return -ENOMEM;
4188 -+ }
4189 -+ eq->elevator_data = bfqd;
4190 -+
4191 -+ /*
4192 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4193 -+ * Grab a permanent reference to it, so that the normal code flow
4194 -+ * will not attempt to free it.
4195 -+ */
4196 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
4197 -+ atomic_inc(&bfqd->oom_bfqq.ref);
4198 -+
4199 -+ bfqd->queue = q;
4200 -+
4201 -+ spin_lock_irq(q->queue_lock);
4202 -+ q->elevator = eq;
4203 -+ spin_unlock_irq(q->queue_lock);
4204 -+
4205 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
4206 -+ if (bfqg == NULL) {
4207 -+ kfree(bfqd);
4208 -+ kobject_put(&eq->kobj);
4209 -+ return -ENOMEM;
4210 -+ }
4211 -+
4212 -+ bfqd->root_group = bfqg;
4213 -+
4214 -+ init_timer(&bfqd->idle_slice_timer);
4215 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4216 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4217 -+
4218 -+ bfqd->rq_pos_tree = RB_ROOT;
4219 -+
4220 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4221 -+
4222 -+ INIT_LIST_HEAD(&bfqd->active_list);
4223 -+ INIT_LIST_HEAD(&bfqd->idle_list);
4224 -+
4225 -+ bfqd->hw_tag = -1;
4226 -+
4227 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
4228 -+
4229 -+ bfqd->bfq_quantum = bfq_quantum;
4230 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4231 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4232 -+ bfqd->bfq_back_max = bfq_back_max;
4233 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
4234 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
4235 -+ bfqd->bfq_class_idle_last_service = 0;
4236 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4237 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4238 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4239 -+
4240 -+ bfqd->low_latency = true;
4241 -+
4242 -+ bfqd->bfq_raising_coeff = 20;
4243 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4244 -+ bfqd->bfq_raising_max_time = 0;
4245 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4246 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4247 -+ bfqd->bfq_raising_max_softrt_rate = 7000; /*
4248 -+ * Approximate rate required
4249 -+ * to playback or record a
4250 -+ * high-definition compressed
4251 -+ * video.
4252 -+ */
4253 -+ bfqd->raised_busy_queues = 0;
4254 -+
4255 -+ /* Initially estimate the device's peak rate as the reference rate */
4256 -+ if (blk_queue_nonrot(bfqd->queue)) {
4257 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
4258 -+ bfqd->peak_rate = R_nonrot;
4259 -+ } else {
4260 -+ bfqd->RT_prod = R_rot * T_rot;
4261 -+ bfqd->peak_rate = R_rot;
4262 -+ }
4263 -+
4264 -+ return 0;
4265 -+}
4266 -+
4267 -+static void bfq_slab_kill(void)
4268 -+{
4269 -+ if (bfq_pool != NULL)
4270 -+ kmem_cache_destroy(bfq_pool);
4271 -+}
4272 -+
4273 -+static int __init bfq_slab_setup(void)
4274 -+{
4275 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4276 -+ if (bfq_pool == NULL)
4277 -+ return -ENOMEM;
4278 -+ return 0;
4279 -+}
4280 -+
4281 -+static ssize_t bfq_var_show(unsigned int var, char *page)
4282 -+{
4283 -+ return sprintf(page, "%d\n", var);
4284 -+}
4285 -+
4286 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4287 -+{
4288 -+ unsigned long new_val;
4289 -+ int ret = kstrtoul(page, 10, &new_val);
4290 -+
4291 -+ if (ret == 0)
4292 -+ *var = new_val;
4293 -+
4294 -+ return count;
4295 -+}
4296 -+
4297 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4298 -+{
4299 -+ struct bfq_data *bfqd = e->elevator_data;
4300 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4301 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4302 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4303 -+}
4304 -+
4305 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4306 -+{
4307 -+ struct bfq_queue *bfqq;
4308 -+ struct bfq_data *bfqd = e->elevator_data;
4309 -+ ssize_t num_char = 0;
4310 -+
4311 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4312 -+ bfqd->queued);
4313 -+
4314 -+ spin_lock_irq(bfqd->queue->queue_lock);
4315 -+
4316 -+ num_char += sprintf(page + num_char, "Active:\n");
4317 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4318 -+ num_char += sprintf(page + num_char,
4319 -+ "pid%d: weight %hu, nr_queued %d %d,"
4320 -+ " dur %d/%u\n",
4321 -+ bfqq->pid,
4322 -+ bfqq->entity.weight,
4323 -+ bfqq->queued[0],
4324 -+ bfqq->queued[1],
4325 -+ jiffies_to_msecs(jiffies -
4326 -+ bfqq->last_rais_start_finish),
4327 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4328 -+ }
4329 -+
4330 -+ num_char += sprintf(page + num_char, "Idle:\n");
4331 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4332 -+ num_char += sprintf(page + num_char,
4333 -+ "pid%d: weight %hu, dur %d/%u\n",
4334 -+ bfqq->pid,
4335 -+ bfqq->entity.weight,
4336 -+ jiffies_to_msecs(jiffies -
4337 -+ bfqq->last_rais_start_finish),
4338 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4339 -+ }
4340 -+
4341 -+ spin_unlock_irq(bfqd->queue->queue_lock);
4342 -+
4343 -+ return num_char;
4344 -+}
4345 -+
4346 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4347 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4348 -+{ \
4349 -+ struct bfq_data *bfqd = e->elevator_data; \
4350 -+ unsigned int __data = __VAR; \
4351 -+ if (__CONV) \
4352 -+ __data = jiffies_to_msecs(__data); \
4353 -+ return bfq_var_show(__data, (page)); \
4354 -+}
4355 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4356 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4357 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4358 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4359 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4360 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4361 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4362 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4363 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4364 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4365 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4366 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4367 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4368 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4369 -+ 1);
4370 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4371 -+ bfqd->bfq_raising_min_inter_arr_async,
4372 -+ 1);
4373 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4374 -+ bfqd->bfq_raising_max_softrt_rate, 0);
4375 -+#undef SHOW_FUNCTION
4376 -+
4377 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4378 -+static ssize_t \
4379 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4380 -+{ \
4381 -+ struct bfq_data *bfqd = e->elevator_data; \
4382 -+ unsigned long uninitialized_var(__data); \
4383 -+ int ret = bfq_var_store(&__data, (page), count); \
4384 -+ if (__data < (MIN)) \
4385 -+ __data = (MIN); \
4386 -+ else if (__data > (MAX)) \
4387 -+ __data = (MAX); \
4388 -+ if (__CONV) \
4389 -+ *(__PTR) = msecs_to_jiffies(__data); \
4390 -+ else \
4391 -+ *(__PTR) = __data; \
4392 -+ return ret; \
4393 -+}
4394 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4395 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4396 -+ INT_MAX, 1);
4397 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4398 -+ INT_MAX, 1);
4399 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4400 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4401 -+ INT_MAX, 0);
4402 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4403 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4404 -+ 1, INT_MAX, 0);
4405 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4406 -+ INT_MAX, 1);
4407 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4408 -+ INT_MAX, 0);
4409 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4410 -+ INT_MAX, 1);
4411 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4412 -+ INT_MAX, 1);
4413 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
4414 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4415 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4416 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4417 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4418 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4419 -+#undef STORE_FUNCTION
4420 -+
4421 -+/* do nothing for the moment */
4422 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
4423 -+ const char *page, size_t count)
4424 -+{
4425 -+ return count;
4426 -+}
4427 -+
4428 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4429 -+{
4430 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4431 -+
4432 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4433 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4434 -+ else
4435 -+ return bfq_default_max_budget;
4436 -+}
4437 -+
4438 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4439 -+ const char *page, size_t count)
4440 -+{
4441 -+ struct bfq_data *bfqd = e->elevator_data;
4442 -+ unsigned long uninitialized_var(__data);
4443 -+ int ret = bfq_var_store(&__data, (page), count);
4444 -+
4445 -+ if (__data == 0)
4446 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4447 -+ else {
4448 -+ if (__data > INT_MAX)
4449 -+ __data = INT_MAX;
4450 -+ bfqd->bfq_max_budget = __data;
4451 -+ }
4452 -+
4453 -+ bfqd->bfq_user_max_budget = __data;
4454 -+
4455 -+ return ret;
4456 -+}
4457 -+
4458 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4459 -+ const char *page, size_t count)
4460 -+{
4461 -+ struct bfq_data *bfqd = e->elevator_data;
4462 -+ unsigned long uninitialized_var(__data);
4463 -+ int ret = bfq_var_store(&__data, (page), count);
4464 -+
4465 -+ if (__data < 1)
4466 -+ __data = 1;
4467 -+ else if (__data > INT_MAX)
4468 -+ __data = INT_MAX;
4469 -+
4470 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4471 -+ if (bfqd->bfq_user_max_budget == 0)
4472 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4473 -+
4474 -+ return ret;
4475 -+}
4476 -+
4477 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4478 -+ const char *page, size_t count)
4479 -+{
4480 -+ struct bfq_data *bfqd = e->elevator_data;
4481 -+ unsigned long uninitialized_var(__data);
4482 -+ int ret = bfq_var_store(&__data, (page), count);
4483 -+
4484 -+ if (__data > 1)
4485 -+ __data = 1;
4486 -+ if (__data == 0 && bfqd->low_latency != 0)
4487 -+ bfq_end_raising(bfqd);
4488 -+ bfqd->low_latency = __data;
4489 -+
4490 -+ return ret;
4491 -+}
4492 -+
4493 -+#define BFQ_ATTR(name) \
4494 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4495 -+
4496 -+static struct elv_fs_entry bfq_attrs[] = {
4497 -+ BFQ_ATTR(quantum),
4498 -+ BFQ_ATTR(fifo_expire_sync),
4499 -+ BFQ_ATTR(fifo_expire_async),
4500 -+ BFQ_ATTR(back_seek_max),
4501 -+ BFQ_ATTR(back_seek_penalty),
4502 -+ BFQ_ATTR(slice_idle),
4503 -+ BFQ_ATTR(max_budget),
4504 -+ BFQ_ATTR(max_budget_async_rq),
4505 -+ BFQ_ATTR(timeout_sync),
4506 -+ BFQ_ATTR(timeout_async),
4507 -+ BFQ_ATTR(low_latency),
4508 -+ BFQ_ATTR(raising_coeff),
4509 -+ BFQ_ATTR(raising_max_time),
4510 -+ BFQ_ATTR(raising_rt_max_time),
4511 -+ BFQ_ATTR(raising_min_idle_time),
4512 -+ BFQ_ATTR(raising_min_inter_arr_async),
4513 -+ BFQ_ATTR(raising_max_softrt_rate),
4514 -+ BFQ_ATTR(weights),
4515 -+ __ATTR_NULL
4516 -+};
4517 -+
4518 -+static struct elevator_type iosched_bfq = {
4519 -+ .ops = {
4520 -+ .elevator_merge_fn = bfq_merge,
4521 -+ .elevator_merged_fn = bfq_merged_request,
4522 -+ .elevator_merge_req_fn = bfq_merged_requests,
4523 -+ .elevator_allow_merge_fn = bfq_allow_merge,
4524 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
4525 -+ .elevator_add_req_fn = bfq_insert_request,
4526 -+ .elevator_activate_req_fn = bfq_activate_request,
4527 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
4528 -+ .elevator_completed_req_fn = bfq_completed_request,
4529 -+ .elevator_former_req_fn = elv_rb_former_request,
4530 -+ .elevator_latter_req_fn = elv_rb_latter_request,
4531 -+ .elevator_init_icq_fn = bfq_init_icq,
4532 -+ .elevator_exit_icq_fn = bfq_exit_icq,
4533 -+ .elevator_set_req_fn = bfq_set_request,
4534 -+ .elevator_put_req_fn = bfq_put_request,
4535 -+ .elevator_may_queue_fn = bfq_may_queue,
4536 -+ .elevator_init_fn = bfq_init_queue,
4537 -+ .elevator_exit_fn = bfq_exit_queue,
4538 -+ },
4539 -+ .icq_size = sizeof(struct bfq_io_cq),
4540 -+ .icq_align = __alignof__(struct bfq_io_cq),
4541 -+ .elevator_attrs = bfq_attrs,
4542 -+ .elevator_name = "bfq",
4543 -+ .elevator_owner = THIS_MODULE,
4544 -+};
4545 -+
4546 -+static int __init bfq_init(void)
4547 -+{
4548 -+ /*
4549 -+ * Can be 0 on HZ < 1000 setups.
4550 -+ */
4551 -+ if (bfq_slice_idle == 0)
4552 -+ bfq_slice_idle = 1;
4553 -+
4554 -+ if (bfq_timeout_async == 0)
4555 -+ bfq_timeout_async = 1;
4556 -+
4557 -+ if (bfq_slab_setup())
4558 -+ return -ENOMEM;
4559 -+
4560 -+ elv_register(&iosched_bfq);
4561 -+ printk(KERN_INFO "BFQ I/O-scheduler version: v7");
4562 -+
4563 -+ return 0;
4564 -+}
4565 -+
4566 -+static void __exit bfq_exit(void)
4567 -+{
4568 -+ elv_unregister(&iosched_bfq);
4569 -+ bfq_slab_kill();
4570 -+}
4571 -+
4572 -+module_init(bfq_init);
4573 -+module_exit(bfq_exit);
4574 -+
4575 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4576 -+MODULE_LICENSE("GPL");
4577 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4578 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4579 -new file mode 100644
4580 -index 0000000..30df81c
4581 ---- /dev/null
4582 -+++ b/block/bfq-sched.c
4583 -@@ -0,0 +1,1077 @@
4584 -+/*
4585 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
4586 -+ *
4587 -+ * Based on ideas and code from CFQ:
4588 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4589 -+ *
4590 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4591 -+ * Paolo Valente <paolo.valente@×××××××.it>
4592 -+ *
4593 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4594 -+ */
4595 -+
4596 -+#ifdef CONFIG_CGROUP_BFQIO
4597 -+#define for_each_entity(entity) \
4598 -+ for (; entity != NULL; entity = entity->parent)
4599 -+
4600 -+#define for_each_entity_safe(entity, parent) \
4601 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4602 -+
4603 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4604 -+ int extract,
4605 -+ struct bfq_data *bfqd);
4606 -+
4607 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4608 -+{
4609 -+ struct bfq_entity *bfqg_entity;
4610 -+ struct bfq_group *bfqg;
4611 -+ struct bfq_sched_data *group_sd;
4612 -+
4613 -+ BUG_ON(next_active == NULL);
4614 -+
4615 -+ group_sd = next_active->sched_data;
4616 -+
4617 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4618 -+ /*
4619 -+ * bfq_group's my_entity field is not NULL only if the group
4620 -+ * is not the root group. We must not touch the root entity
4621 -+ * as it must never become an active entity.
4622 -+ */
4623 -+ bfqg_entity = bfqg->my_entity;
4624 -+ if (bfqg_entity != NULL)
4625 -+ bfqg_entity->budget = next_active->budget;
4626 -+}
4627 -+
4628 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
4629 -+{
4630 -+ struct bfq_entity *next_active;
4631 -+
4632 -+ if (sd->active_entity != NULL)
4633 -+ /* will update/requeue at the end of service */
4634 -+ return 0;
4635 -+
4636 -+ /*
4637 -+ * NOTE: this can be improved in many ways, such as returning
4638 -+ * 1 (and thus propagating upwards the update) only when the
4639 -+ * budget changes, or caching the bfqq that will be scheduled
4640 -+ * next from this subtree. By now we worry more about
4641 -+ * correctness than about performance...
4642 -+ */
4643 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4644 -+ sd->next_active = next_active;
4645 -+
4646 -+ if (next_active != NULL)
4647 -+ bfq_update_budget(next_active);
4648 -+
4649 -+ return 1;
4650 -+}
4651 -+
4652 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4653 -+ struct bfq_entity *entity)
4654 -+{
4655 -+ BUG_ON(sd->next_active != entity);
4656 -+}
4657 -+#else
4658 -+#define for_each_entity(entity) \
4659 -+ for (; entity != NULL; entity = NULL)
4660 -+
4661 -+#define for_each_entity_safe(entity, parent) \
4662 -+ for (parent = NULL; entity != NULL; entity = parent)
4663 -+
4664 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4665 -+{
4666 -+ return 0;
4667 -+}
4668 -+
4669 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4670 -+ struct bfq_entity *entity)
4671 -+{
4672 -+}
4673 -+
4674 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4675 -+{
4676 -+}
4677 -+#endif
4678 -+
4679 -+/*
4680 -+ * Shift for timestamp calculations. This actually limits the maximum
4681 -+ * service allowed in one timestamp delta (small shift values increase it),
4682 -+ * the maximum total weight that can be used for the queues in the system
4683 -+ * (big shift values increase it), and the period of virtual time wraparounds.
4684 -+ */
4685 -+#define WFQ_SERVICE_SHIFT 22
4686 -+
4687 -+/**
4688 -+ * bfq_gt - compare two timestamps.
4689 -+ * @a: first ts.
4690 -+ * @b: second ts.
4691 -+ *
4692 -+ * Return @a > @b, dealing with wrapping correctly.
4693 -+ */
4694 -+static inline int bfq_gt(u64 a, u64 b)
4695 -+{
4696 -+ return (s64)(a - b) > 0;
4697 -+}
4698 -+
4699 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4700 -+{
4701 -+ struct bfq_queue *bfqq = NULL;
4702 -+
4703 -+ BUG_ON(entity == NULL);
4704 -+
4705 -+ if (entity->my_sched_data == NULL)
4706 -+ bfqq = container_of(entity, struct bfq_queue, entity);
4707 -+
4708 -+ return bfqq;
4709 -+}
4710 -+
4711 -+
4712 -+/**
4713 -+ * bfq_delta - map service into the virtual time domain.
4714 -+ * @service: amount of service.
4715 -+ * @weight: scale factor (weight of an entity or weight sum).
4716 -+ */
4717 -+static inline u64 bfq_delta(unsigned long service,
4718 -+ unsigned long weight)
4719 -+{
4720 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4721 -+
4722 -+ do_div(d, weight);
4723 -+ return d;
4724 -+}
4725 -+
4726 -+/**
4727 -+ * bfq_calc_finish - assign the finish time to an entity.
4728 -+ * @entity: the entity to act upon.
4729 -+ * @service: the service to be charged to the entity.
4730 -+ */
4731 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
4732 -+ unsigned long service)
4733 -+{
4734 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4735 -+
4736 -+ BUG_ON(entity->weight == 0);
4737 -+
4738 -+ entity->finish = entity->start +
4739 -+ bfq_delta(service, entity->weight);
4740 -+
4741 -+ if (bfqq != NULL) {
4742 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4743 -+ "calc_finish: serv %lu, w %d",
4744 -+ service, entity->weight);
4745 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4746 -+ "calc_finish: start %llu, finish %llu, delta %llu",
4747 -+ entity->start, entity->finish,
4748 -+ bfq_delta(service, entity->weight));
4749 -+ }
4750 -+}
4751 -+
4752 -+/**
4753 -+ * bfq_entity_of - get an entity from a node.
4754 -+ * @node: the node field of the entity.
4755 -+ *
4756 -+ * Convert a node pointer to the relative entity. This is used only
4757 -+ * to simplify the logic of some functions and not as the generic
4758 -+ * conversion mechanism because, e.g., in the tree walking functions,
4759 -+ * the check for a %NULL value would be redundant.
4760 -+ */
4761 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4762 -+{
4763 -+ struct bfq_entity *entity = NULL;
4764 -+
4765 -+ if (node != NULL)
4766 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
4767 -+
4768 -+ return entity;
4769 -+}
4770 -+
4771 -+/**
4772 -+ * bfq_extract - remove an entity from a tree.
4773 -+ * @root: the tree root.
4774 -+ * @entity: the entity to remove.
4775 -+ */
4776 -+static inline void bfq_extract(struct rb_root *root,
4777 -+ struct bfq_entity *entity)
4778 -+{
4779 -+ BUG_ON(entity->tree != root);
4780 -+
4781 -+ entity->tree = NULL;
4782 -+ rb_erase(&entity->rb_node, root);
4783 -+}
4784 -+
4785 -+/**
4786 -+ * bfq_idle_extract - extract an entity from the idle tree.
4787 -+ * @st: the service tree of the owning @entity.
4788 -+ * @entity: the entity being removed.
4789 -+ */
4790 -+static void bfq_idle_extract(struct bfq_service_tree *st,
4791 -+ struct bfq_entity *entity)
4792 -+{
4793 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4794 -+ struct rb_node *next;
4795 -+
4796 -+ BUG_ON(entity->tree != &st->idle);
4797 -+
4798 -+ if (entity == st->first_idle) {
4799 -+ next = rb_next(&entity->rb_node);
4800 -+ st->first_idle = bfq_entity_of(next);
4801 -+ }
4802 -+
4803 -+ if (entity == st->last_idle) {
4804 -+ next = rb_prev(&entity->rb_node);
4805 -+ st->last_idle = bfq_entity_of(next);
4806 -+ }
4807 -+
4808 -+ bfq_extract(&st->idle, entity);
4809 -+
4810 -+ if (bfqq != NULL)
4811 -+ list_del(&bfqq->bfqq_list);
4812 -+}
4813 -+
4814 -+/**
4815 -+ * bfq_insert - generic tree insertion.
4816 -+ * @root: tree root.
4817 -+ * @entity: entity to insert.
4818 -+ *
4819 -+ * This is used for the idle and the active tree, since they are both
4820 -+ * ordered by finish time.
4821 -+ */
4822 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4823 -+{
4824 -+ struct bfq_entity *entry;
4825 -+ struct rb_node **node = &root->rb_node;
4826 -+ struct rb_node *parent = NULL;
4827 -+
4828 -+ BUG_ON(entity->tree != NULL);
4829 -+
4830 -+ while (*node != NULL) {
4831 -+ parent = *node;
4832 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
4833 -+
4834 -+ if (bfq_gt(entry->finish, entity->finish))
4835 -+ node = &parent->rb_left;
4836 -+ else
4837 -+ node = &parent->rb_right;
4838 -+ }
4839 -+
4840 -+ rb_link_node(&entity->rb_node, parent, node);
4841 -+ rb_insert_color(&entity->rb_node, root);
4842 -+
4843 -+ entity->tree = root;
4844 -+}
4845 -+
4846 -+/**
4847 -+ * bfq_update_min - update the min_start field of an entity.
4848 -+ * @entity: the entity to update.
4849 -+ * @node: one of its children.
4850 -+ *
4851 -+ * This function is called when @entity may store an invalid value for
4852 -+ * min_start due to updates to the active tree. The function assumes
4853 -+ * that the subtree rooted at @node (which may be its left or its right
4854 -+ * child) has a valid min_start value.
4855 -+ */
4856 -+static inline void bfq_update_min(struct bfq_entity *entity,
4857 -+ struct rb_node *node)
4858 -+{
4859 -+ struct bfq_entity *child;
4860 -+
4861 -+ if (node != NULL) {
4862 -+ child = rb_entry(node, struct bfq_entity, rb_node);
4863 -+ if (bfq_gt(entity->min_start, child->min_start))
4864 -+ entity->min_start = child->min_start;
4865 -+ }
4866 -+}
4867 -+
4868 -+/**
4869 -+ * bfq_update_active_node - recalculate min_start.
4870 -+ * @node: the node to update.
4871 -+ *
4872 -+ * @node may have changed position or one of its children may have moved,
4873 -+ * this function updates its min_start value. The left and right subtrees
4874 -+ * are assumed to hold a correct min_start value.
4875 -+ */
4876 -+static inline void bfq_update_active_node(struct rb_node *node)
4877 -+{
4878 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4879 -+
4880 -+ entity->min_start = entity->start;
4881 -+ bfq_update_min(entity, node->rb_right);
4882 -+ bfq_update_min(entity, node->rb_left);
4883 -+}
4884 -+
4885 -+/**
4886 -+ * bfq_update_active_tree - update min_start for the whole active tree.
4887 -+ * @node: the starting node.
4888 -+ *
4889 -+ * @node must be the deepest modified node after an update. This function
4890 -+ * updates its min_start using the values held by its children, assuming
4891 -+ * that they did not change, and then updates all the nodes that may have
4892 -+ * changed in the path to the root. The only nodes that may have changed
4893 -+ * are the ones in the path or their siblings.
4894 -+ */
4895 -+static void bfq_update_active_tree(struct rb_node *node)
4896 -+{
4897 -+ struct rb_node *parent;
4898 -+
4899 -+up:
4900 -+ bfq_update_active_node(node);
4901 -+
4902 -+ parent = rb_parent(node);
4903 -+ if (parent == NULL)
4904 -+ return;
4905 -+
4906 -+ if (node == parent->rb_left && parent->rb_right != NULL)
4907 -+ bfq_update_active_node(parent->rb_right);
4908 -+ else if (parent->rb_left != NULL)
4909 -+ bfq_update_active_node(parent->rb_left);
4910 -+
4911 -+ node = parent;
4912 -+ goto up;
4913 -+}
4914 -+
4915 -+/**
4916 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
4917 -+ * @st: the service tree of the entity.
4918 -+ * @entity: the entity being inserted.
4919 -+ *
4920 -+ * The active tree is ordered by finish time, but an extra key is kept
4921 -+ * in each node, containing the minimum value for the start times of
4922 -+ * its children (and the node itself), so it's possible to search for
4923 -+ * the eligible node with the lowest finish time in logarithmic time.
4924 -+ */
4925 -+static void bfq_active_insert(struct bfq_service_tree *st,
4926 -+ struct bfq_entity *entity)
4927 -+{
4928 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4929 -+ struct rb_node *node = &entity->rb_node;
4930 -+
4931 -+ bfq_insert(&st->active, entity);
4932 -+
4933 -+ if (node->rb_left != NULL)
4934 -+ node = node->rb_left;
4935 -+ else if (node->rb_right != NULL)
4936 -+ node = node->rb_right;
4937 -+
4938 -+ bfq_update_active_tree(node);
4939 -+
4940 -+ if (bfqq != NULL)
4941 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4942 -+}
4943 -+
4944 -+/**
4945 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4946 -+ * @ioprio: the ioprio value to convert.
4947 -+ */
4948 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
4949 -+{
4950 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4951 -+ return IOPRIO_BE_NR - ioprio;
4952 -+}
4953 -+
4954 -+/**
4955 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4956 -+ * @weight: the weight value to convert.
4957 -+ *
4958 -+ * To preserve as much as possible the old only-ioprio user interface,
4959 -+ * 0 is used as an escape ioprio value for weights (numerically) equal
4960 -+ * to or larger than IOPRIO_BE_NR.
4961 -+ */
4962 -+static unsigned short bfq_weight_to_ioprio(int weight)
4963 -+{
4964 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4965 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4966 -+}
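A quick standalone sketch (not part of the patch) of how the two mappings above behave, assuming the kernel's IOPRIO_BE_NR value of 8; the function names are local to the example:

#include <stdio.h>

#define IOPRIO_BE_NR 8 /* number of best-effort ioprio levels */

static int ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

static int weight_to_ioprio(int weight)
{
	/* Weights at or above IOPRIO_BE_NR map to the escape ioprio 0. */
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d -> ioprio %d\n", ioprio,
		       ioprio_to_weight(ioprio),
		       weight_to_ioprio(ioprio_to_weight(ioprio)));

	/* A cgroup-style weight of 100 has no ioprio equivalent. */
	printf("weight 100 -> ioprio %d\n", weight_to_ioprio(100));
	return 0;
}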
4967 -+
4968 -+static inline void bfq_get_entity(struct bfq_entity *entity)
4969 -+{
4970 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4971 -+ struct bfq_sched_data *sd;
4972 -+
4973 -+ if (bfqq != NULL) {
4974 -+ sd = entity->sched_data;
4975 -+ atomic_inc(&bfqq->ref);
4976 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4977 -+ bfqq, atomic_read(&bfqq->ref));
4978 -+ }
4979 -+}
4980 -+
4981 -+/**
4982 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
4983 -+ * @node: the node being removed.
4984 -+ *
4985 -+ * Do the first step of an extraction in an rb tree, looking for the
4986 -+ * node that will replace @node, and returning the deepest node that
4987 -+ * the following modifications to the tree can touch. If @node is the
4988 -+ * last node in the tree return %NULL.
4989 -+ */
4990 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
4991 -+{
4992 -+ struct rb_node *deepest;
4993 -+
4994 -+ if (node->rb_right == NULL && node->rb_left == NULL)
4995 -+ deepest = rb_parent(node);
4996 -+ else if (node->rb_right == NULL)
4997 -+ deepest = node->rb_left;
4998 -+ else if (node->rb_left == NULL)
4999 -+ deepest = node->rb_right;
5000 -+ else {
5001 -+ deepest = rb_next(node);
5002 -+ if (deepest->rb_right != NULL)
5003 -+ deepest = deepest->rb_right;
5004 -+ else if (rb_parent(deepest) != node)
5005 -+ deepest = rb_parent(deepest);
5006 -+ }
5007 -+
5008 -+ return deepest;
5009 -+}
5010 -+
5011 -+/**
5012 -+ * bfq_active_extract - remove an entity from the active tree.
5013 -+ * @st: the service_tree containing the tree.
5014 -+ * @entity: the entity being removed.
5015 -+ */
5016 -+static void bfq_active_extract(struct bfq_service_tree *st,
5017 -+ struct bfq_entity *entity)
5018 -+{
5019 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5020 -+ struct rb_node *node;
5021 -+
5022 -+ node = bfq_find_deepest(&entity->rb_node);
5023 -+ bfq_extract(&st->active, entity);
5024 -+
5025 -+ if (node != NULL)
5026 -+ bfq_update_active_tree(node);
5027 -+
5028 -+ if (bfqq != NULL)
5029 -+ list_del(&bfqq->bfqq_list);
5030 -+}
5031 -+
5032 -+/**
5033 -+ * bfq_idle_insert - insert an entity into the idle tree.
5034 -+ * @st: the service tree containing the tree.
5035 -+ * @entity: the entity to insert.
5036 -+ */
5037 -+static void bfq_idle_insert(struct bfq_service_tree *st,
5038 -+ struct bfq_entity *entity)
5039 -+{
5040 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5041 -+ struct bfq_entity *first_idle = st->first_idle;
5042 -+ struct bfq_entity *last_idle = st->last_idle;
5043 -+
5044 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
5045 -+ st->first_idle = entity;
5046 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
5047 -+ st->last_idle = entity;
5048 -+
5049 -+ bfq_insert(&st->idle, entity);
5050 -+
5051 -+ if (bfqq != NULL)
5052 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
5053 -+}
5054 -+
5055 -+/**
5056 -+ * bfq_forget_entity - remove an entity from the wfq trees.
5057 -+ * @st: the service tree.
5058 -+ * @entity: the entity being removed.
5059 -+ *
5060 -+ * Update the device status and forget everything about @entity, putting
5061 -+ * the device reference to it, if it is a queue. Entities belonging to
5062 -+ * groups are not refcounted.
5063 -+ */
5064 -+static void bfq_forget_entity(struct bfq_service_tree *st,
5065 -+ struct bfq_entity *entity)
5066 -+{
5067 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5068 -+ struct bfq_sched_data *sd;
5069 -+
5070 -+ BUG_ON(!entity->on_st);
5071 -+
5072 -+ entity->on_st = 0;
5073 -+ st->wsum -= entity->weight;
5074 -+ if (bfqq != NULL) {
5075 -+ sd = entity->sched_data;
5076 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
5077 -+ bfqq, atomic_read(&bfqq->ref));
5078 -+ bfq_put_queue(bfqq);
5079 -+ }
5080 -+}
5081 -+
5082 -+/**
5083 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
5084 -+ * @st: service tree for the entity.
5085 -+ * @entity: the entity being released.
5086 -+ */
5087 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
5088 -+ struct bfq_entity *entity)
5089 -+{
5090 -+ bfq_idle_extract(st, entity);
5091 -+ bfq_forget_entity(st, entity);
5092 -+}
5093 -+
5094 -+/**
5095 -+ * bfq_forget_idle - update the idle tree if necessary.
5096 -+ * @st: the service tree to act upon.
5097 -+ *
5098 -+ * To preserve the global O(log N) complexity we only remove one entry here;
5099 -+ * as the idle tree will not grow indefinitely this can be done safely.
5100 -+ */
5101 -+static void bfq_forget_idle(struct bfq_service_tree *st)
5102 -+{
5103 -+ struct bfq_entity *first_idle = st->first_idle;
5104 -+ struct bfq_entity *last_idle = st->last_idle;
5105 -+
5106 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
5107 -+ !bfq_gt(last_idle->finish, st->vtime)) {
5108 -+ /*
5109 -+ * Forget the whole idle tree, increasing the vtime past
5110 -+ * the last finish time of idle entities.
5111 -+ */
5112 -+ st->vtime = last_idle->finish;
5113 -+ }
5114 -+
5115 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
5116 -+ bfq_put_idle_entity(st, first_idle);
5117 -+}
5118 -+
5119 -+static struct bfq_service_tree *
5120 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
5121 -+ struct bfq_entity *entity)
5122 -+{
5123 -+ struct bfq_service_tree *new_st = old_st;
5124 -+
5125 -+ if (entity->ioprio_changed) {
5126 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5127 -+
5128 -+ BUG_ON(old_st->wsum < entity->weight);
5129 -+ old_st->wsum -= entity->weight;
5130 -+
5131 -+ if (entity->new_weight != entity->orig_weight) {
5132 -+ entity->orig_weight = entity->new_weight;
5133 -+ entity->ioprio =
5134 -+ bfq_weight_to_ioprio(entity->orig_weight);
5135 -+ } else if (entity->new_ioprio != entity->ioprio) {
5136 -+ entity->ioprio = entity->new_ioprio;
5137 -+ entity->orig_weight =
5138 -+ bfq_ioprio_to_weight(entity->ioprio);
5139 -+ } else
5140 -+ entity->new_weight = entity->orig_weight =
5141 -+ bfq_ioprio_to_weight(entity->ioprio);
5142 -+
5143 -+ entity->ioprio_class = entity->new_ioprio_class;
5144 -+ entity->ioprio_changed = 0;
5145 -+
5146 -+ /*
5147 -+ * NOTE: here we may be changing the weight too early,
5148 -+ * this will cause unfairness. The correct approach
5149 -+ * would have required additional complexity to defer
5150 -+ * weight changes to the proper time instants (i.e.,
5151 -+ * when entity->finish <= old_st->vtime).
5152 -+ */
5153 -+ new_st = bfq_entity_service_tree(entity);
5154 -+ entity->weight = entity->orig_weight *
5155 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
5156 -+ new_st->wsum += entity->weight;
5157 -+
5158 -+ if (new_st != old_st)
5159 -+ entity->start = new_st->vtime;
5160 -+ }
5161 -+
5162 -+ return new_st;
5163 -+}
5164 -+
5165 -+/**
5166 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
5167 -+ * @bfqq: the queue being served.
5168 -+ * @served: bytes to transfer.
5169 -+ *
5170 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
5171 -+ * are synchronized every time a new bfqq is selected for service. By now,
5172 -+ * we keep it to better check consistency.
5173 -+ */
5174 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
5175 -+{
5176 -+ struct bfq_entity *entity = &bfqq->entity;
5177 -+ struct bfq_service_tree *st;
5178 -+
5179 -+ for_each_entity(entity) {
5180 -+ st = bfq_entity_service_tree(entity);
5181 -+
5182 -+ entity->service += served;
5183 -+ BUG_ON(entity->service > entity->budget);
5184 -+ BUG_ON(st->wsum == 0);
5185 -+
5186 -+ st->vtime += bfq_delta(served, st->wsum);
5187 -+ bfq_forget_idle(st);
5188 -+ }
5189 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
5190 -+}
5191 -+
5192 -+/**
5193 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
5194 -+ * @bfqq: the queue that needs a service update.
5195 -+ *
5196 -+ * When it's not possible to be fair in the service domain, because
5197 -+ * a queue is not consuming its budget fast enough (the meaning of
5198 -+ * fast depends on the timeout parameter), we charge it a full
5199 -+ * budget. In this way we should obtain a sort of time-domain
5200 -+ * fairness among all the seeky/slow queues.
5201 -+ */
5202 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5203 -+{
5204 -+ struct bfq_entity *entity = &bfqq->entity;
5205 -+
5206 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5207 -+
5208 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5209 -+}
5210 -+
5211 -+/**
5212 -+ * __bfq_activate_entity - activate an entity.
5213 -+ * @entity: the entity being activated.
5214 -+ *
5215 -+ * Called whenever an entity is activated, i.e., it is not active and one
5216 -+ * of its children receives a new request, or has to be reactivated due to
5217 -+ * budget exhaustion. It uses the current budget of the entity (and the
5218 -+ * service received if @entity is active) of the queue to calculate its
5219 -+ * timestamps.
5220 -+ */
5221 -+static void __bfq_activate_entity(struct bfq_entity *entity)
5222 -+{
5223 -+ struct bfq_sched_data *sd = entity->sched_data;
5224 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5225 -+
5226 -+ if (entity == sd->active_entity) {
5227 -+ BUG_ON(entity->tree != NULL);
5228 -+ /*
5229 -+ * If we are requeueing the current entity we have
5230 -+ * to take care of not charging to it service it has
5231 -+ * not received.
5232 -+ */
5233 -+ bfq_calc_finish(entity, entity->service);
5234 -+ entity->start = entity->finish;
5235 -+ sd->active_entity = NULL;
5236 -+ } else if (entity->tree == &st->active) {
5237 -+ /*
5238 -+ * Requeueing an entity due to a change of some
5239 -+ * next_active entity below it. We reuse the old
5240 -+ * start time.
5241 -+ */
5242 -+ bfq_active_extract(st, entity);
5243 -+ } else if (entity->tree == &st->idle) {
5244 -+ /*
5245 -+ * Must be on the idle tree, bfq_idle_extract() will
5246 -+ * check for that.
5247 -+ */
5248 -+ bfq_idle_extract(st, entity);
5249 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
5250 -+ st->vtime : entity->finish;
5251 -+ } else {
5252 -+ /*
5253 -+ * The finish time of the entity may be invalid, and
5254 -+ * it is in the past for sure, otherwise the queue
5255 -+ * would have been on the idle tree.
5256 -+ */
5257 -+ entity->start = st->vtime;
5258 -+ st->wsum += entity->weight;
5259 -+ bfq_get_entity(entity);
5260 -+
5261 -+ BUG_ON(entity->on_st);
5262 -+ entity->on_st = 1;
5263 -+ }
5264 -+
5265 -+ st = __bfq_entity_update_weight_prio(st, entity);
5266 -+ bfq_calc_finish(entity, entity->budget);
5267 -+ bfq_active_insert(st, entity);
5268 -+}
5269 -+
5270 -+/**
5271 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5272 -+ * @entity: the entity to activate.
5273 -+ *
5274 -+ * Activate @entity and all the entities on the path from it to the root.
5275 -+ */
5276 -+static void bfq_activate_entity(struct bfq_entity *entity)
5277 -+{
5278 -+ struct bfq_sched_data *sd;
5279 -+
5280 -+ for_each_entity(entity) {
5281 -+ __bfq_activate_entity(entity);
5282 -+
5283 -+ sd = entity->sched_data;
5284 -+ if (!bfq_update_next_active(sd))
5285 -+ /*
5286 -+ * No need to propagate the activation to the
5287 -+ * upper entities, as they will be updated when
5288 -+ * the active entity is rescheduled.
5289 -+ */
5290 -+ break;
5291 -+ }
5292 -+}
5293 -+
5294 -+/**
5295 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5296 -+ * @entity: the entity to deactivate.
5297 -+ * @requeue: if false, the entity will not be put into the idle tree.
5298 -+ *
5299 -+ * Deactivate an entity, independently from its previous state. If the
5300 -+ * entity was not on a service tree just return, otherwise if it is on
5301 -+ * any scheduler tree, extract it from that tree, and if necessary
5302 -+ * and if the caller did not specify @requeue, put it on the idle tree.
5303 -+ *
5304 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
5305 -+ * if the entity was under service or if it was the next_active for
5306 -+ * its sched_data; return %0 otherwise.
5307 -+ */
5308 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5309 -+{
5310 -+ struct bfq_sched_data *sd = entity->sched_data;
5311 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5312 -+ int was_active = entity == sd->active_entity;
5313 -+ int ret = 0;
5314 -+
5315 -+ if (!entity->on_st)
5316 -+ return 0;
5317 -+
5318 -+ BUG_ON(was_active && entity->tree != NULL);
5319 -+
5320 -+ if (was_active) {
5321 -+ bfq_calc_finish(entity, entity->service);
5322 -+ sd->active_entity = NULL;
5323 -+ } else if (entity->tree == &st->active)
5324 -+ bfq_active_extract(st, entity);
5325 -+ else if (entity->tree == &st->idle)
5326 -+ bfq_idle_extract(st, entity);
5327 -+ else if (entity->tree != NULL)
5328 -+ BUG();
5329 -+
5330 -+ if (was_active || sd->next_active == entity)
5331 -+ ret = bfq_update_next_active(sd);
5332 -+
5333 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5334 -+ bfq_forget_entity(st, entity);
5335 -+ else
5336 -+ bfq_idle_insert(st, entity);
5337 -+
5338 -+ BUG_ON(sd->active_entity == entity);
5339 -+ BUG_ON(sd->next_active == entity);
5340 -+
5341 -+ return ret;
5342 -+}
5343 -+
5344 -+/**
5345 -+ * bfq_deactivate_entity - deactivate an entity.
5346 -+ * @entity: the entity to deactivate.
5347 -+ * @requeue: true if the entity can be put on the idle tree
5348 -+ */
5349 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5350 -+{
5351 -+ struct bfq_sched_data *sd;
5352 -+ struct bfq_entity *parent;
5353 -+
5354 -+ for_each_entity_safe(entity, parent) {
5355 -+ sd = entity->sched_data;
5356 -+
5357 -+ if (!__bfq_deactivate_entity(entity, requeue))
5358 -+ /*
5359 -+ * The parent entity is still backlogged, and
5360 -+ * we don't need to update it as it is still
5361 -+ * under service.
5362 -+ */
5363 -+ break;
5364 -+
5365 -+ if (sd->next_active != NULL)
5366 -+ /*
5367 -+ * The parent entity is still backlogged and
5368 -+ * the budgets on the path towards the root
5369 -+ * need to be updated.
5370 -+ */
5371 -+ goto update;
5372 -+
5373 -+ /*
5374 -+ * If we reach this point, the parent is no longer backlogged and
5375 -+ * we want to propagate the dequeue upwards.
5376 -+ */
5377 -+ requeue = 1;
5378 -+ }
5379 -+
5380 -+ return;
5381 -+
5382 -+update:
5383 -+ entity = parent;
5384 -+ for_each_entity(entity) {
5385 -+ __bfq_activate_entity(entity);
5386 -+
5387 -+ sd = entity->sched_data;
5388 -+ if (!bfq_update_next_active(sd))
5389 -+ break;
5390 -+ }
5391 -+}
5392 -+
5393 -+/**
5394 -+ * bfq_update_vtime - update vtime if necessary.
5395 -+ * @st: the service tree to act upon.
5396 -+ *
5397 -+ * If necessary update the service tree vtime to have at least one
5398 -+ * eligible entity, skipping to its start time. Assumes that the
5399 -+ * active tree of the device is not empty.
5400 -+ *
5401 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
5402 -+ * we may end up with reactivated tasks getting timestamps after a
5403 -+ * vtime skip done because we needed a ->first_active entity on some
5404 -+ * intermediate node.
5405 -+ */
5406 -+static void bfq_update_vtime(struct bfq_service_tree *st)
5407 -+{
5408 -+ struct bfq_entity *entry;
5409 -+ struct rb_node *node = st->active.rb_node;
5410 -+
5411 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5412 -+ if (bfq_gt(entry->min_start, st->vtime)) {
5413 -+ st->vtime = entry->min_start;
5414 -+ bfq_forget_idle(st);
5415 -+ }
5416 -+}
5417 -+
5418 -+/**
5419 -+ * bfq_first_active - find the eligible entity with the smallest finish time
5420 -+ * @st: the service tree to select from.
5421 -+ *
5422 -+ * This function searches the first schedulable entity, starting from the
5423 -+ * root of the tree and going on the left every time on this side there is
5424 -+ * a subtree with at least one eligible (start >= vtime) entity. The path
5425 -+ * on the right is followed only if a) the left subtree contains no eligible
5426 -+ * entities and b) no eligible entity has been found yet.
5427 -+ */
5428 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5429 -+{
5430 -+ struct bfq_entity *entry, *first = NULL;
5431 -+ struct rb_node *node = st->active.rb_node;
5432 -+
5433 -+ while (node != NULL) {
5434 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5435 -+left:
5436 -+ if (!bfq_gt(entry->start, st->vtime))
5437 -+ first = entry;
5438 -+
5439 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5440 -+
5441 -+ if (node->rb_left != NULL) {
5442 -+ entry = rb_entry(node->rb_left,
5443 -+ struct bfq_entity, rb_node);
5444 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
5445 -+ node = node->rb_left;
5446 -+ goto left;
5447 -+ }
5448 -+ }
5449 -+ if (first != NULL)
5450 -+ break;
5451 -+ node = node->rb_right;
5452 -+ }
5453 -+
5454 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5455 -+ return first;
5456 -+}
5457 -+
5458 -+/**
5459 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5460 -+ * @st: the service tree.
5461 -+ *
5462 -+ * Update the virtual time in @st and return the first eligible entity
5463 -+ * it contains.
5464 -+ */
5465 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5466 -+ bool force)
5467 -+{
5468 -+ struct bfq_entity *entity, *new_next_active = NULL;
5469 -+
5470 -+ if (RB_EMPTY_ROOT(&st->active))
5471 -+ return NULL;
5472 -+
5473 -+ bfq_update_vtime(st);
5474 -+ entity = bfq_first_active_entity(st);
5475 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
5476 -+
5477 -+ /*
5478 -+ * If the chosen entity does not match with the sched_data's
5479 -+ * next_active and we are forcedly serving the IDLE priority
5480 -+ * class tree, bubble up budget update.
5481 -+ */
5482 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
5483 -+ new_next_active = entity;
5484 -+ for_each_entity(new_next_active)
5485 -+ bfq_update_budget(new_next_active);
5486 -+ }
5487 -+
5488 -+ return entity;
5489 -+}
5490 -+
5491 -+/**
5492 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5493 -+ * @sd: the sched_data.
5494 -+ * @extract: if true the returned entity will be also extracted from @sd.
5495 -+ *
5496 -+ * NOTE: since we cache the next_active entity at each level of the
5497 -+ * hierarchy, the complexity of the lookup can be decreased with
5498 -+ * absolutely no effort just returning the cached next_active value;
5499 -+ * we prefer to do full lookups to test the consistency of the data
5500 -+ * structures.
5501 -+ */
5502 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5503 -+ int extract,
5504 -+ struct bfq_data *bfqd)
5505 -+{
5506 -+ struct bfq_service_tree *st = sd->service_tree;
5507 -+ struct bfq_entity *entity;
5508 -+ int i = 0;
5509 -+
5510 -+ BUG_ON(sd->active_entity != NULL);
5511 -+
5512 -+ if (bfqd != NULL &&
5513 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5514 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
5515 -+ true);
5516 -+ if (entity != NULL) {
5517 -+ i = BFQ_IOPRIO_CLASSES - 1;
5518 -+ bfqd->bfq_class_idle_last_service = jiffies;
5519 -+ sd->next_active = entity;
5520 -+ }
5521 -+ }
5522 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5523 -+ entity = __bfq_lookup_next_entity(st + i, false);
5524 -+ if (entity != NULL) {
5525 -+ if (extract) {
5526 -+ bfq_check_next_active(sd, entity);
5527 -+ bfq_active_extract(st + i, entity);
5528 -+ sd->active_entity = entity;
5529 -+ sd->next_active = NULL;
5530 -+ }
5531 -+ break;
5532 -+ }
5533 -+ }
5534 -+
5535 -+ return entity;
5536 -+}
5537 -+
5538 -+/*
5539 -+ * Get next queue for service.
5540 -+ */
5541 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5542 -+{
5543 -+ struct bfq_entity *entity = NULL;
5544 -+ struct bfq_sched_data *sd;
5545 -+ struct bfq_queue *bfqq;
5546 -+
5547 -+ BUG_ON(bfqd->in_service_queue != NULL);
5548 -+
5549 -+ if (bfqd->busy_queues == 0)
5550 -+ return NULL;
5551 -+
5552 -+ sd = &bfqd->root_group->sched_data;
5553 -+ for (; sd != NULL; sd = entity->my_sched_data) {
5554 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5555 -+ BUG_ON(entity == NULL);
5556 -+ entity->service = 0;
5557 -+ }
5558 -+
5559 -+ bfqq = bfq_entity_to_bfqq(entity);
5560 -+ BUG_ON(bfqq == NULL);
5561 -+
5562 -+ return bfqq;
5563 -+}
5564 -+
5565 -+/*
5566 -+ * Forced extraction of the given queue.
5567 -+ */
5568 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5569 -+ struct bfq_queue *bfqq)
5570 -+{
5571 -+ struct bfq_entity *entity;
5572 -+ struct bfq_sched_data *sd;
5573 -+
5574 -+ BUG_ON(bfqd->in_service_queue != NULL);
5575 -+
5576 -+ entity = &bfqq->entity;
5577 -+ /*
5578 -+ * Bubble up extraction/update from the leaf to the root.
5579 -+ */
5580 -+ for_each_entity(entity) {
5581 -+ sd = entity->sched_data;
5582 -+ bfq_update_budget(entity);
5583 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
5584 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5585 -+ sd->active_entity = entity;
5586 -+ sd->next_active = NULL;
5587 -+ entity->service = 0;
5588 -+ }
5589 -+
5590 -+ return;
5591 -+}
5592 -+
5593 -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
5594 -+{
5595 -+ if (bfqd->in_service_bic != NULL) {
5596 -+ put_io_context(bfqd->in_service_bic->icq.ioc);
5597 -+ bfqd->in_service_bic = NULL;
5598 -+ }
5599 -+
5600 -+ bfqd->in_service_queue = NULL;
5601 -+ del_timer(&bfqd->idle_slice_timer);
5602 -+}
5603 -+
5604 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5605 -+ int requeue)
5606 -+{
5607 -+ struct bfq_entity *entity = &bfqq->entity;
5608 -+
5609 -+ if (bfqq == bfqd->in_service_queue)
5610 -+ __bfq_bfqd_reset_in_service(bfqd);
5611 -+
5612 -+ bfq_deactivate_entity(entity, requeue);
5613 -+}
5614 -+
5615 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5616 -+{
5617 -+ struct bfq_entity *entity = &bfqq->entity;
5618 -+
5619 -+ bfq_activate_entity(entity);
5620 -+}
5621 -+
5622 -+/*
5623 -+ * Called when the bfqq no longer has requests pending, remove it from
5624 -+ * the service tree.
5625 -+ */
5626 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5627 -+ int requeue)
5628 -+{
5629 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
5630 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5631 -+
5632 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5633 -+
5634 -+ bfq_clear_bfqq_busy(bfqq);
5635 -+
5636 -+ BUG_ON(bfqd->busy_queues == 0);
5637 -+ bfqd->busy_queues--;
5638 -+ if (bfqq->raising_coeff > 1)
5639 -+ bfqd->raised_busy_queues--;
5640 -+
5641 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5642 -+}
5643 -+
5644 -+/*
5645 -+ * Called when an inactive queue receives a new request.
5646 -+ */
5647 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5648 -+{
5649 -+ BUG_ON(bfq_bfqq_busy(bfqq));
5650 -+ BUG_ON(bfqq == bfqd->in_service_queue);
5651 -+
5652 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5653 -+
5654 -+ bfq_activate_bfqq(bfqd, bfqq);
5655 -+
5656 -+ bfq_mark_bfqq_busy(bfqq);
5657 -+ bfqd->busy_queues++;
5658 -+ if (bfqq->raising_coeff > 1)
5659 -+ bfqd->raised_busy_queues++;
5660 -+}
5661 -diff --git a/block/bfq.h b/block/bfq.h
5662 -new file mode 100644
5663 -index 0000000..68b28e3
5664 ---- /dev/null
5665 -+++ b/block/bfq.h
5666 -@@ -0,0 +1,614 @@
5667 -+/*
5668 -+ * BFQ-v7 for 3.13.0: data structures and common functions prototypes.
5669 -+ *
5670 -+ * Based on ideas and code from CFQ:
5671 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5672 -+ *
5673 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5674 -+ * Paolo Valente <paolo.valente@×××××××.it>
5675 -+ *
5676 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5677 -+ */
5678 -+
5679 -+#ifndef _BFQ_H
5680 -+#define _BFQ_H
5681 -+
5682 -+#include <linux/blktrace_api.h>
5683 -+#include <linux/hrtimer.h>
5684 -+#include <linux/ioprio.h>
5685 -+#include <linux/rbtree.h>
5686 -+
5687 -+#define BFQ_IOPRIO_CLASSES 3
5688 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5689 -+
5690 -+#define BFQ_MIN_WEIGHT 1
5691 -+#define BFQ_MAX_WEIGHT 1000
5692 -+
5693 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
5694 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
5695 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5696 -+
5697 -+struct bfq_entity;
5698 -+
5699 -+/**
5700 -+ * struct bfq_service_tree - per ioprio_class service tree.
5701 -+ * @active: tree for active entities (i.e., those backlogged).
5702 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5703 -+ * @first_idle: idle entity with minimum F_i.
5704 -+ * @last_idle: idle entity with maximum F_i.
5705 -+ * @vtime: scheduler virtual time.
5706 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5707 -+ *
5708 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5709 -+ * ioprio_class has its own independent scheduler, and so its own
5710 -+ * bfq_service_tree. All the fields are protected by the queue lock
5711 -+ * of the containing bfqd.
5712 -+ */
5713 -+struct bfq_service_tree {
5714 -+ struct rb_root active;
5715 -+ struct rb_root idle;
5716 -+
5717 -+ struct bfq_entity *first_idle;
5718 -+ struct bfq_entity *last_idle;
5719 -+
5720 -+ u64 vtime;
5721 -+ unsigned long wsum;
5722 -+};
5723 -+
5724 -+/**
5725 -+ * struct bfq_sched_data - multi-class scheduler.
5726 -+ * @active_entity: entity under service.
5727 -+ * @next_active: head-of-the-line entity in the scheduler.
5728 -+ * @service_tree: array of service trees, one per ioprio_class.
5729 -+ *
5730 -+ * bfq_sched_data is the basic scheduler queue. It supports three
5731 -+ * ioprio_classes, and can be used either as a toplevel queue or as
5732 -+ * an intermediate queue on a hierarchical setup.
5733 -+ * @next_active points to the active entity of the sched_data service
5734 -+ * trees that will be scheduled next.
5735 -+ *
5736 -+ * The supported ioprio_classes are the same as in CFQ, in descending
5737 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5738 -+ * Requests from higher priority queues are served before all the
5739 -+ * requests from lower priority queues; among requests of the same
5740 -+ * queue requests are served according to B-WF2Q+.
5741 -+ * All the fields are protected by the queue lock of the containing bfqd.
5742 -+ */
5743 -+struct bfq_sched_data {
5744 -+ struct bfq_entity *active_entity;
5745 -+ struct bfq_entity *next_active;
5746 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5747 -+};
5748 -+
5749 -+/**
5750 -+ * struct bfq_entity - schedulable entity.
5751 -+ * @rb_node: service_tree member.
5752 -+ * @on_st: flag, true if the entity is on a tree (either the active or
5753 -+ * the idle one of its service_tree).
5754 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5755 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
5756 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5757 -+ * @min_start: minimum start time of the (active) subtree rooted at
5758 -+ * this entity; used for O(log N) lookups into active trees.
5759 -+ * @service: service received during the last round of service.
5760 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5761 -+ * @weight: weight of the queue
5762 -+ * @parent: parent entity, for hierarchical scheduling.
5763 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5764 -+ * associated scheduler queue, %NULL on leaf nodes.
5765 -+ * @sched_data: the scheduler queue this entity belongs to.
5766 -+ * @ioprio: the ioprio in use.
5767 -+ * @new_weight: when a weight change is requested, the new weight value.
5768 -+ * @orig_weight: original weight, used to implement weight boosting
5769 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5770 -+ * @ioprio_class: the ioprio_class in use.
5771 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
5772 -+ * ioprio_class value.
5773 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5774 -+ * ioprio_class change.
5775 -+ *
5776 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5777 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5778 -+ * entity belongs to the sched_data of the parent group in the cgroup
5779 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
5780 -+ * in @my_sched_data.
5781 -+ *
5782 -+ * Each entity stores independently its priority values; this would
5783 -+ * allow different weights on different devices, but this
5784 -+ * functionality is not exported to userspace by now. Priorities and
5785 -+ * weights are updated lazily, first storing the new values into the
5786 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
5787 -+ * there is a transition in the entity state that allows the priority
5788 -+ * update to take place the effective and the requested priority
5789 -+ * values are synchronized.
5790 -+ *
5791 -+ * Unless cgroups are used, the weight value is calculated from the
5792 -+ * ioprio to export the same interface as CFQ. When dealing with
5793 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
5794 -+ * time to consume their budget and have true sequential behavior, and
5795 -+ * when there are no external factors breaking anticipation) the
5796 -+ * relative weights at each level of the cgroups hierarchy should be
5797 -+ * guaranteed. All the fields are protected by the queue lock of the
5798 -+ * containing bfqd.
5799 -+ */
5800 -+struct bfq_entity {
5801 -+ struct rb_node rb_node;
5802 -+
5803 -+ int on_st;
5804 -+
5805 -+ u64 finish;
5806 -+ u64 start;
5807 -+
5808 -+ struct rb_root *tree;
5809 -+
5810 -+ u64 min_start;
5811 -+
5812 -+ unsigned long service, budget;
5813 -+ unsigned short weight, new_weight;
5814 -+ unsigned short orig_weight;
5815 -+
5816 -+ struct bfq_entity *parent;
5817 -+
5818 -+ struct bfq_sched_data *my_sched_data;
5819 -+ struct bfq_sched_data *sched_data;
5820 -+
5821 -+ unsigned short ioprio, new_ioprio;
5822 -+ unsigned short ioprio_class, new_ioprio_class;
5823 -+
5824 -+ int ioprio_changed;
5825 -+};
5826 -+
5827 -+struct bfq_group;
5828 -+
5829 -+/**
5830 -+ * struct bfq_queue - leaf schedulable entity.
5831 -+ * @ref: reference counter.
5832 -+ * @bfqd: parent bfq_data.
5833 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
5834 -+ * one or more other queues.
5835 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5836 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5837 -+ * @sort_list: sorted list of pending requests.
5838 -+ * @next_rq: if fifo isn't expired, next request to serve.
5839 -+ * @queued: nr of requests queued in @sort_list.
5840 -+ * @allocated: currently allocated requests.
5841 -+ * @meta_pending: pending metadata requests.
5842 -+ * @fifo: fifo list of requests in sort_list.
5843 -+ * @entity: entity representing this queue in the scheduler.
5844 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
5845 -+ * @budget_timeout: budget expiration (in jiffies).
5846 -+ * @dispatched: number of requests on the dispatch list or inside driver.
5847 -+ * @org_ioprio: saved ioprio during boosted periods.
5848 -+ * @flags: status flags.
5849 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5850 -+ * @seek_samples: number of seeks sampled
5851 -+ * @seek_total: sum of the distances of the seeks sampled
5852 -+ * @seek_mean: mean seek distance
5853 -+ * @last_request_pos: position of the last request enqueued
5854 -+ * @pid: pid of the process owning the queue, used for logging purposes.
5855 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
5856 -+ * @raising_cur_max_time: current max raising time for this queue
5857 -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
5858 -+ * idle to backlogged
5859 -+ * @service_from_backlogged: cumulative service received from the @bfq_queue
5860 -+ * since the last transition from idle to backlogged
5861 -+ *
5862 -+ * A bfq_queue is a leaf request queue; it can be associated to an io_context
5863 -+ * or more (if it is an async one). @cgroup holds a reference to the
5864 -+ * cgroup, to be sure that it does not disappear while a bfqq still
5865 -+ * references it (mostly to avoid races between request issuing and task
5866 -+ * migration followed by cgroup destruction).
5867 -+ * All the fields are protected by the queue lock of the containing bfqd.
5868 -+ */
5869 -+struct bfq_queue {
5870 -+ atomic_t ref;
5871 -+ struct bfq_data *bfqd;
5872 -+
5873 -+ /* fields for cooperating queues handling */
5874 -+ struct bfq_queue *new_bfqq;
5875 -+ struct rb_node pos_node;
5876 -+ struct rb_root *pos_root;
5877 -+
5878 -+ struct rb_root sort_list;
5879 -+ struct request *next_rq;
5880 -+ int queued[2];
5881 -+ int allocated[2];
5882 -+ int meta_pending;
5883 -+ struct list_head fifo;
5884 -+
5885 -+ struct bfq_entity entity;
5886 -+
5887 -+ unsigned long max_budget;
5888 -+ unsigned long budget_timeout;
5889 -+
5890 -+ int dispatched;
5891 -+
5892 -+ unsigned short org_ioprio;
5893 -+
5894 -+ unsigned int flags;
5895 -+
5896 -+ struct list_head bfqq_list;
5897 -+
5898 -+ unsigned int seek_samples;
5899 -+ u64 seek_total;
5900 -+ sector_t seek_mean;
5901 -+ sector_t last_request_pos;
5902 -+
5903 -+ pid_t pid;
5904 -+
5905 -+ /* weight-raising fields */
5906 -+ unsigned int raising_cur_max_time;
5907 -+ unsigned long soft_rt_next_start;
5908 -+ u64 last_rais_start_finish;
5909 -+ unsigned int raising_coeff;
5910 -+ u64 last_idle_bklogged;
5911 -+ unsigned long service_from_backlogged;
5912 -+};
5913 -+
5914 -+/**
5915 -+ * struct bfq_ttime - per process thinktime stats.
5916 -+ * @ttime_total: total process thinktime
5917 -+ * @ttime_samples: number of thinktime samples
5918 -+ * @ttime_mean: average process thinktime
5919 -+ */
5920 -+struct bfq_ttime {
5921 -+ unsigned long last_end_request;
5922 -+
5923 -+ unsigned long ttime_total;
5924 -+ unsigned long ttime_samples;
5925 -+ unsigned long ttime_mean;
5926 -+};
5927 -+
5928 -+/**
5929 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
5930 -+ * @icq: associated io_cq structure
5931 -+ * @bfqq: array of two process queues, the sync and the async
5932 -+ * @ttime: associated @bfq_ttime struct
5933 -+ */
5934 -+struct bfq_io_cq {
5935 -+ struct io_cq icq; /* must be the first member */
5936 -+ struct bfq_queue *bfqq[2];
5937 -+ struct bfq_ttime ttime;
5938 -+ int ioprio;
5939 -+};
5940 -+
5941 -+/**
5942 -+ * struct bfq_data - per device data structure.
5943 -+ * @queue: request queue for the managed device.
5944 -+ * @root_group: root bfq_group for the device.
5945 -+ * @rq_pos_tree: rbtree sorted by next_request position,
5946 -+ * used when determining if two or more queues
5947 -+ * have interleaving requests (see bfq_close_cooperator).
5948 -+ * @busy_queues: number of bfq_queues containing requests (including the
5949 -+ * queue under service, even if it is idling).
5950 -+ * @raised_busy_queues: number of weight-raised busy bfq_queues.
5951 -+ * @queued: number of queued requests.
5952 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
5953 -+ * @sync_flight: number of sync requests in the driver.
5954 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5955 -+ * completed requests.
5956 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5957 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5958 -+ * @budgets_assigned: number of budgets assigned.
5959 -+ * @idle_slice_timer: timer set when idling for the next sequential request
5960 -+ * from the queue under service.
5961 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
5962 -+ * @in_service_queue: bfq_queue under service.
5963 -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
5964 -+ * @last_position: on-disk position of the last served request.
5965 -+ * @last_budget_start: beginning of the last budget.
5966 -+ * @last_idling_start: beginning of the last idle slice.
5967 -+ * @peak_rate: peak transfer rate observed for a budget.
5968 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5969 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5970 -+ * @group_list: list of all the bfq_groups active on the device.
5971 -+ * @active_list: list of all the bfq_queues active on the device.
5972 -+ * @idle_list: list of all the bfq_queues idle on the device.
5973 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
5974 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5975 -+ * requests are served in fifo order.
5976 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5977 -+ * @bfq_back_max: maximum allowed backward seek.
5978 -+ * @bfq_slice_idle: maximum idling time.
5979 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5980 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5981 -+ * async queues.
5982 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5983 -+ * prevent seeky queues from imposing long latencies on well
5984 -+ * behaved ones (this also implies that seeky queues cannot
5985 -+ * receive guarantees in the service domain; after a timeout
5986 -+ * they are charged for the whole allocated budget, to try
5987 -+ * to preserve a behavior reasonably fair among them, but
5988 -+ * without service-domain guarantees).
5989 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5990 -+ * queue is multiplied
5991 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5992 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5993 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5994 -+ * may be reactivated for a queue (in jiffies)
5995 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5996 -+ * after which weight-raising may be
5997 -+ * reactivated for an already busy queue
5998 -+ * (in jiffies)
5999 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
6000 -+ * sectors per second
6001 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
6002 -+ * duration of the weight raising automatically
6003 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
6004 -+ *
6005 -+ * All the fields are protected by the @queue lock.
6006 -+ */
6007 -+struct bfq_data {
6008 -+ struct request_queue *queue;
6009 -+
6010 -+ struct bfq_group *root_group;
6011 -+
6012 -+ struct rb_root rq_pos_tree;
6013 -+
6014 -+ int busy_queues;
6015 -+ int raised_busy_queues;
6016 -+ int queued;
6017 -+ int rq_in_driver;
6018 -+ int sync_flight;
6019 -+
6020 -+ int max_rq_in_driver;
6021 -+ int hw_tag_samples;
6022 -+ int hw_tag;
6023 -+
6024 -+ int budgets_assigned;
6025 -+
6026 -+ struct timer_list idle_slice_timer;
6027 -+ struct work_struct unplug_work;
6028 -+
6029 -+ struct bfq_queue *in_service_queue;
6030 -+ struct bfq_io_cq *in_service_bic;
6031 -+
6032 -+ sector_t last_position;
6033 -+
6034 -+ ktime_t last_budget_start;
6035 -+ ktime_t last_idling_start;
6036 -+ int peak_rate_samples;
6037 -+ u64 peak_rate;
6038 -+ unsigned long bfq_max_budget;
6039 -+
6040 -+ struct hlist_head group_list;
6041 -+ struct list_head active_list;
6042 -+ struct list_head idle_list;
6043 -+
6044 -+ unsigned int bfq_quantum;
6045 -+ unsigned int bfq_fifo_expire[2];
6046 -+ unsigned int bfq_back_penalty;
6047 -+ unsigned int bfq_back_max;
6048 -+ unsigned int bfq_slice_idle;
6049 -+ u64 bfq_class_idle_last_service;
6050 -+
6051 -+ unsigned int bfq_user_max_budget;
6052 -+ unsigned int bfq_max_budget_async_rq;
6053 -+ unsigned int bfq_timeout[2];
6054 -+
6055 -+ bool low_latency;
6056 -+
6057 -+ /* parameters of the low_latency heuristics */
6058 -+ unsigned int bfq_raising_coeff;
6059 -+ unsigned int bfq_raising_max_time;
6060 -+ unsigned int bfq_raising_rt_max_time;
6061 -+ unsigned int bfq_raising_min_idle_time;
6062 -+ unsigned long bfq_raising_min_inter_arr_async;
6063 -+ unsigned int bfq_raising_max_softrt_rate;
6064 -+ u64 RT_prod;
6065 -+
6066 -+ struct bfq_queue oom_bfqq;
6067 -+};
6068 -+
6069 -+enum bfqq_state_flags {
6070 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
6071 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
6072 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
6073 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
6074 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
6075 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
6076 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
6077 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
6078 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6079 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
6080 -+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
6081 -+};
6082 -+
6083 -+#define BFQ_BFQQ_FNS(name) \
6084 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
6085 -+{ \
6086 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
6087 -+} \
6088 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
6089 -+{ \
6090 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
6091 -+} \
6092 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
6093 -+{ \
6094 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
6095 -+}
6096 -+
6097 -+BFQ_BFQQ_FNS(busy);
6098 -+BFQ_BFQQ_FNS(wait_request);
6099 -+BFQ_BFQQ_FNS(must_alloc);
6100 -+BFQ_BFQQ_FNS(fifo_expire);
6101 -+BFQ_BFQQ_FNS(idle_window);
6102 -+BFQ_BFQQ_FNS(prio_changed);
6103 -+BFQ_BFQQ_FNS(sync);
6104 -+BFQ_BFQQ_FNS(budget_new);
6105 -+BFQ_BFQQ_FNS(coop);
6106 -+BFQ_BFQQ_FNS(split_coop);
6107 -+BFQ_BFQQ_FNS(softrt_update);
6108 -+#undef BFQ_BFQQ_FNS
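For readability, this is what a single invocation such as BFQ_BFQQ_FNS(busy) expands to, hand-expanded here purely as an illustration (it is not additional patch content):

/* Expansion of BFQ_BFQQ_FNS(busy): three helpers setting, clearing and
 * testing one bit of bfqq->flags. */
static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}

/* Typical call sites, as in bfq_add_bfqq_busy()/bfq_del_bfqq_busy() earlier
 * in this patch:
 *	bfq_mark_bfqq_busy(bfqq);
 *	if (bfq_bfqq_busy(bfqq))
 *		bfq_clear_bfqq_busy(bfqq);
 */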
6109 -+
6110 -+/* Logging facilities. */
6111 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
6112 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
6113 -+
6114 -+#define bfq_log(bfqd, fmt, args...) \
6115 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
6116 -+
6117 -+/* Expiration reasons. */
6118 -+enum bfqq_expiration {
6119 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
6120 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
6121 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
6122 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
6123 -+};
6124 -+
6125 -+#ifdef CONFIG_CGROUP_BFQIO
6126 -+/**
6127 -+ * struct bfq_group - per (device, cgroup) data structure.
6128 -+ * @entity: schedulable entity to insert into the parent group sched_data.
6129 -+ * @sched_data: own sched_data, to contain child entities (they may be
6130 -+ * both bfq_queues and bfq_groups).
6131 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
6132 -+ * list of the containing cgroup's bfqio_cgroup.
6133 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
6134 -+ * of the groups active on the same device; used for cleanup.
6135 -+ * @bfqd: the bfq_data for the device this group acts upon.
6136 -+ * @async_bfqq: array of async queues for all the tasks belonging to
6137 -+ * the group, one queue per ioprio value per ioprio_class,
6138 -+ * except for the idle class that has only one queue.
6139 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
6140 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
6141 -+ * to avoid too many special cases during group creation/migration.
6142 -+ *
6143 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
6144 -+ * there is a set of bfq_groups, each one collecting the lower-level
6145 -+ * entities belonging to the group that are acting on the same device.
6146 -+ *
6147 -+ * Locking works as follows:
6148 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
6149 -+ * via RCU from its readers.
6150 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
6151 -+ * from the readers.
6152 -+ * o All the other fields are protected by the @bfqd queue lock.
6153 -+ */
6154 -+struct bfq_group {
6155 -+ struct bfq_entity entity;
6156 -+ struct bfq_sched_data sched_data;
6157 -+
6158 -+ struct hlist_node group_node;
6159 -+ struct hlist_node bfqd_node;
6160 -+
6161 -+ void *bfqd;
6162 -+
6163 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6164 -+ struct bfq_queue *async_idle_bfqq;
6165 -+
6166 -+ struct bfq_entity *my_entity;
6167 -+};
6168 -+
6169 -+/**
6170 -+ * struct bfqio_cgroup - bfq cgroup data structure.
6171 -+ * @css: subsystem state for bfq in the containing cgroup.
6172 -+ * @online: flag marked when the subsystem is inserted.
6173 -+ * @weight: cgroup weight.
6174 -+ * @ioprio: cgroup ioprio.
6175 -+ * @ioprio_class: cgroup ioprio_class.
6176 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
6177 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
6178 -+ *
6179 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
6180 -+ * @ioprio and @ioprio_class are protected by @lock.
6181 -+ */
6182 -+struct bfqio_cgroup {
6183 -+ struct cgroup_subsys_state css;
6184 -+ bool online;
6185 -+
6186 -+ unsigned short weight, ioprio, ioprio_class;
6187 -+
6188 -+ spinlock_t lock;
6189 -+ struct hlist_head group_data;
6190 -+};
6191 -+#else
6192 -+struct bfq_group {
6193 -+ struct bfq_sched_data sched_data;
6194 -+
6195 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6196 -+ struct bfq_queue *async_idle_bfqq;
6197 -+};
6198 -+#endif
6199 -+
6200 -+static inline struct bfq_service_tree *
6201 -+bfq_entity_service_tree(struct bfq_entity *entity)
6202 -+{
6203 -+ struct bfq_sched_data *sched_data = entity->sched_data;
6204 -+ unsigned int idx = entity->ioprio_class - 1;
6205 -+
6206 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
6207 -+ BUG_ON(sched_data == NULL);
6208 -+
6209 -+ return sched_data->service_tree + idx;
6210 -+}
6211 -+
6212 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
6213 -+ int is_sync)
6214 -+{
6215 -+ return bic->bfqq[!!is_sync];
6216 -+}
6217 -+
6218 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
6219 -+ struct bfq_queue *bfqq, int is_sync)
6220 -+{
6221 -+ bic->bfqq[!!is_sync] = bfqq;
6222 -+}
6223 -+
6224 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
6225 -+{
6226 -+ return bic->icq.q->elevator->elevator_data;
6227 -+}
6228 -+
6229 -+/**
6230 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
6231 -+ * @ptr: a pointer to a bfqd.
6232 -+ * @flags: storage for the flags to be saved.
6233 -+ *
6234 -+ * This function allows bfqg->bfqd to be protected by the
6235 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
6236 -+ * under RCU, so the storage for bfqd is assured to be safe as long
6237 -+ * as the RCU read side critical section does not end. After the
6238 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
6239 -+ * sure that no other writer accessed it. If we raced with a writer,
6240 -+ * the function returns NULL, with the queue unlocked, otherwise it
6241 -+ * returns the dereferenced pointer, with the queue locked.
6242 -+ */
6243 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
6244 -+ unsigned long *flags)
6245 -+{
6246 -+ struct bfq_data *bfqd;
6247 -+
6248 -+ rcu_read_lock();
6249 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
6250 -+
6251 -+ if (bfqd != NULL) {
6252 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
6253 -+ if (*ptr == bfqd)
6254 -+ goto out;
6255 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6256 -+ }
6257 -+
6258 -+ bfqd = NULL;
6259 -+out:
6260 -+ rcu_read_unlock();
6261 -+ return bfqd;
6262 -+}
6263 -+
6264 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6265 -+ unsigned long *flags)
6266 -+{
6267 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6268 -+}
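A hypothetical caller sketch (not part of the patch) showing how the two helpers above are meant to be paired: dereference the RCU-protected bfqg->bfqd pointer, do the work under the queue lock, then release it.

/* Illustration only; the function name is made up. */
static void example_touch_bfqd(struct bfq_group *bfqg)
{
	unsigned long flags;
	struct bfq_data *bfqd;

	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
	if (bfqd == NULL)
		return; /* raced with a writer; the queue lock is not held */

	/* ... safely access bfqd-protected state here ... */

	bfq_put_bfqd_unlock(bfqd, &flags);
}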
6269 -+
6270 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6271 -+static void bfq_put_queue(struct bfq_queue *bfqq);
6272 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6273 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6274 -+ struct bfq_group *bfqg, int is_sync,
6275 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
6276 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6277 -+ struct bfq_group *bfqg);
6278 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6279 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6280 -+#endif
6281 ---
6282 -1.8.5.2
6283 -
6284
6285 Added: genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
6286 ===================================================================
6287 --- genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 (rev 0)
6288 +++ genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666)
6289 @@ -0,0 +1,6040 @@
6290 +From be5107dc591f7ae692ca7cceecbba72e4c174c37 Mon Sep 17 00:00:00 2001
6291 +From: Paolo Valente <paolo.valente@×××××××.it>
6292 +Date: Thu, 9 May 2013 19:10:02 +0200
6293 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r1 I/O sched for 3.13
6294 +
6295 +Add the BFQ-v7r1 I/O scheduler to 3.13.
6296 +The general structure is borrowed from CFQ, as much of the code for
6297 +handling I/O contexts. Over time, several useful features have been
6298 +ported from CFQ as well (details in the changelog in README.BFQ). A
6299 +(bfq_)queue is associated to each task doing I/O on a device, and each
6300 +time a scheduling decision has to be made a queue is selected and served
6301 +until it expires.
6302 +
6303 + - Slices are given in the service domain: tasks are assigned
6304 + budgets, measured in number of sectors. Once granted the disk, a task
6305 + must nevertheless consume its assigned budget within a configurable
6306 + maximum time (by default, the maximum possible value of the
6307 + budgets is automatically computed to comply with this timeout).
6308 + This allows the desired latency vs "throughput boosting" tradeoff
6309 + to be set.
6310 +
6311 + - Budgets are scheduled according to a variant of WF2Q+, implemented
6312 + using an augmented rb-tree to take eligibility into account while
6313 + preserving an O(log N) overall complexity.
6314 +
6315 + - A low-latency tunable is provided; if enabled, both interactive
6316 + and soft real-time applications are guaranteed a very low latency.
6317 +
6318 + - Latency guarantees are preserved also in the presence of NCQ.
6319 +
6320 + - Also with flash-based devices, a high throughput is achieved
6321 + while still preserving latency guarantees.
6322 +
6323 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
6324 + cooperating-queue-merging and the preemption mechanisms present
6325 + in CFQ. EQM is in fact a unified mechanism that tries to get a
6326 + sequential read pattern, and hence a high throughput, with any
6327 + set of processes performing interleaved I/O over a contiguous
6328 + sequence of sectors.
6329 +
6330 + - BFQ supports full hierarchical scheduling, exporting a cgroups
6331 + interface. Since each node has a full scheduler, each group can
6332 + be assigned its own weight.
6333 +
6334 + - If the cgroups interface is not used, only I/O priorities can be
6335 + assigned to processes, with ioprio values mapped to weights
6336 + with the relation weight = IOPRIO_BE_NR - ioprio.
6337 +
6338 + - ioprio classes are served in strict priority order, i.e., lower
6339 + priority queues are not served as long as there are higher
6340 + priority queues. Among queues in the same class the bandwidth is
6341 + distributed in proportion to the weight of each queue. A very
6342 + thin extra bandwidth is however guaranteed to the Idle class, to
6343 + prevent it from starving.
6344 +
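A worked example of the ioprio-to-weight mapping above (illustrative; it assumes the mainline value IOPRIO_BE_NR == 8): ioprio 0 maps to weight 8, ioprio 4 to weight 4 and ioprio 7 to weight 1, so numerically lower (i.e., higher-priority) ioprio values receive proportionally more bandwidth within their class.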
6345 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
6346 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6347 +---
6348 + block/bfq-cgroup.c | 911 ++++++++++++++
6349 + block/bfq-ioc.c | 36 +
6350 + block/bfq-iosched.c | 3298 +++++++++++++++++++++++++++++++++++++++++++++++++++
6351 + block/bfq-sched.c | 1078 +++++++++++++++++
6352 + block/bfq.h | 614 ++++++++++
6353 + 5 files changed, 5937 insertions(+)
6354 + create mode 100644 block/bfq-cgroup.c
6355 + create mode 100644 block/bfq-ioc.c
6356 + create mode 100644 block/bfq-iosched.c
6357 + create mode 100644 block/bfq-sched.c
6358 + create mode 100644 block/bfq.h
6359 +
6360 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
6361 +new file mode 100644
6362 +index 0000000..79a288a
6363 +--- /dev/null
6364 ++++ b/block/bfq-cgroup.c
6365 +@@ -0,0 +1,911 @@
6366 ++/*
6367 ++ * BFQ: CGROUPS support.
6368 ++ *
6369 ++ * Based on ideas and code from CFQ:
6370 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6371 ++ *
6372 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6373 ++ * Paolo Valente <paolo.valente@×××××××.it>
6374 ++ *
6375 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6376 ++ *
6377 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
6378 ++ */
6379 ++
6380 ++#ifdef CONFIG_CGROUP_BFQIO
6381 ++
6382 ++static DEFINE_MUTEX(bfqio_mutex);
6383 ++
6384 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
6385 ++{
6386 ++ return bgrp ? !bgrp->online : false;
6387 ++}
6388 ++
6389 ++static struct bfqio_cgroup bfqio_root_cgroup = {
6390 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
6391 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
6392 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
6393 ++};
6394 ++
6395 ++static inline void bfq_init_entity(struct bfq_entity *entity,
6396 ++ struct bfq_group *bfqg)
6397 ++{
6398 ++ entity->weight = entity->new_weight;
6399 ++ entity->orig_weight = entity->new_weight;
6400 ++ entity->ioprio = entity->new_ioprio;
6401 ++ entity->ioprio_class = entity->new_ioprio_class;
6402 ++ entity->parent = bfqg->my_entity;
6403 ++ entity->sched_data = &bfqg->sched_data;
6404 ++}
6405 ++
6406 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
6407 ++{
6408 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
6409 ++}
6410 ++
6411 ++/*
6412 ++ * Search the hash table (for now only a list) of bgrp for the bfq_group
6413 ++ * associated with bfqd. Must be called under rcu_read_lock().
6414 ++ */
6415 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
6416 ++ struct bfq_data *bfqd)
6417 ++{
6418 ++ struct bfq_group *bfqg;
6419 ++ void *key;
6420 ++
6421 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
6422 ++ key = rcu_dereference(bfqg->bfqd);
6423 ++ if (key == bfqd)
6424 ++ return bfqg;
6425 ++ }
6426 ++
6427 ++ return NULL;
6428 ++}
6429 ++
6430 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
6431 ++ struct bfq_group *bfqg)
6432 ++{
6433 ++ struct bfq_entity *entity = &bfqg->entity;
6434 ++
6435 ++ /*
6436 ++ * If the weight of the entity has never been set via the sysfs
6437 ++ * interface, then bgrp->weight == 0. In this case we initialize
6438 ++ * the weight from the current ioprio value. Otherwise, the group
6439 ++ * weight, if set, has priority over the ioprio value.
6440 ++ */
6441 ++ if (bgrp->weight == 0) {
6442 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
6443 ++ entity->new_ioprio = bgrp->ioprio;
6444 ++ } else {
6445 ++ entity->new_weight = bgrp->weight;
6446 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
6447 ++ }
6448 ++ entity->orig_weight = entity->weight = entity->new_weight;
6449 ++ entity->ioprio = entity->new_ioprio;
6450 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
6451 ++ entity->my_sched_data = &bfqg->sched_data;
6452 ++}
6453 ++
6454 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
6455 ++ struct bfq_group *parent)
6456 ++{
6457 ++ struct bfq_entity *entity;
6458 ++
6459 ++ BUG_ON(parent == NULL);
6460 ++ BUG_ON(bfqg == NULL);
6461 ++
6462 ++ entity = &bfqg->entity;
6463 ++ entity->parent = parent->my_entity;
6464 ++ entity->sched_data = &parent->sched_data;
6465 ++}
6466 ++
6467 ++/**
6468 ++ * bfq_group_chain_alloc - allocate a chain of groups.
6469 ++ * @bfqd: queue descriptor.
6470 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
6471 ++ *
6472 ++ * Allocate a chain of groups starting from the one belonging to
6473 ++ * @css up to the root cgroup. Stop if a cgroup on the chain
6474 ++ * to the root already has an allocated group on @bfqd.
6475 ++ */
6476 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
6477 ++ struct cgroup_subsys_state *css)
6478 ++{
6479 ++ struct bfqio_cgroup *bgrp;
6480 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
6481 ++
6482 ++ for (; css != NULL; css = css->parent) {
6483 ++ bgrp = css_to_bfqio(css);
6484 ++
6485 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6486 ++ if (bfqg != NULL) {
6487 ++ /*
6488 ++ * All the cgroups in the path from there to the
6489 ++ * root must have a bfq_group for bfqd, so we don't
6490 ++ * need any more allocations.
6491 ++ */
6492 ++ break;
6493 ++ }
6494 ++
6495 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
6496 ++ if (bfqg == NULL)
6497 ++ goto cleanup;
6498 ++
6499 ++ bfq_group_init_entity(bgrp, bfqg);
6500 ++ bfqg->my_entity = &bfqg->entity;
6501 ++
6502 ++ if (leaf == NULL) {
6503 ++ leaf = bfqg;
6504 ++ prev = leaf;
6505 ++ } else {
6506 ++ bfq_group_set_parent(prev, bfqg);
6507 ++ /*
6508 ++ * Build a list of allocated nodes using the bfqd
6509 ++			 * field, which is still unused and will be initialized
6510 ++			 * only after the node is connected.
6511 ++ */
6512 ++ prev->bfqd = bfqg;
6513 ++ prev = bfqg;
6514 ++ }
6515 ++ }
6516 ++
6517 ++ return leaf;
6518 ++
6519 ++cleanup:
6520 ++ while (leaf != NULL) {
6521 ++ prev = leaf;
6522 ++ leaf = leaf->bfqd;
6523 ++ kfree(prev);
6524 ++ }
6525 ++
6526 ++ return NULL;
6527 ++}
6528 ++
6529 ++/**
6530 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
6531 ++ * @bfqd: the queue descriptor.
6532 ++ * @css: the leaf cgroup_subsys_state to start from.
6533 ++ * @leaf: the leaf group (to be associated to @css).
6534 ++ *
6535 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
6536 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
6537 ++ * hierarchy that already has a group associated to @bfqd all the nodes
6538 ++ * in the path to the root cgroup have one too.
6539 ++ *
6540 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
6541 ++ * per device) while the bfqio_cgroup lock protects the list of groups
6542 ++ * belonging to the same cgroup.
6543 ++ */
6544 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
6545 ++ struct cgroup_subsys_state *css,
6546 ++ struct bfq_group *leaf)
6547 ++{
6548 ++ struct bfqio_cgroup *bgrp;
6549 ++ struct bfq_group *bfqg, *next, *prev = NULL;
6550 ++ unsigned long flags;
6551 ++
6552 ++ assert_spin_locked(bfqd->queue->queue_lock);
6553 ++
6554 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
6555 ++ bgrp = css_to_bfqio(css);
6556 ++ next = leaf->bfqd;
6557 ++
6558 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6559 ++ BUG_ON(bfqg != NULL);
6560 ++
6561 ++ spin_lock_irqsave(&bgrp->lock, flags);
6562 ++
6563 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
6564 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
6565 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
6566 ++
6567 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
6568 ++
6569 ++ prev = leaf;
6570 ++ leaf = next;
6571 ++ }
6572 ++
6573 ++ BUG_ON(css == NULL && leaf != NULL);
6574 ++ if (css != NULL && prev != NULL) {
6575 ++ bgrp = css_to_bfqio(css);
6576 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6577 ++ bfq_group_set_parent(prev, bfqg);
6578 ++ }
6579 ++}
6580 ++
6581 ++/**
6582 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @css.
6583 ++ * @bfqd: queue descriptor.
6584 ++ * @css: the cgroup_subsys_state being searched for.
6585 ++ *
6586 ++ * Return a group associated to @bfqd in @css, allocating one if
6587 ++ * necessary. When a group is returned all the cgroups in the path
6588 ++ * to the root have a group associated to @bfqd.
6589 ++ *
6590 ++ * If the allocation fails, return the root group: this breaks guarantees
6591 ++ * but is a safe fallback. If this loss becomes a problem it can be
6592 ++ * mitigated using the equivalent weight (given by the product of the
6593 ++ * weights of the groups in the path from @group to the root) in the
6594 ++ * root scheduler.
6595 ++ *
6596 ++ * We allocate all the missing nodes in the path from the leaf cgroup
6597 ++ * to the root and we connect the nodes only after all the allocations
6598 ++ * have been successful.
6599 ++ */
6600 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
6601 ++ struct cgroup_subsys_state *css)
6602 ++{
6603 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
6604 ++ struct bfq_group *bfqg;
6605 ++
6606 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6607 ++ if (bfqg != NULL)
6608 ++ return bfqg;
6609 ++
6610 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
6611 ++ if (bfqg != NULL)
6612 ++ bfq_group_chain_link(bfqd, css, bfqg);
6613 ++ else
6614 ++ bfqg = bfqd->root_group;
6615 ++
6616 ++ return bfqg;
6617 ++}
6618 ++
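/*
 * Recap of the two-phase scheme implemented above (summary only, no new
 * code): bfq_find_alloc_group() first lets bfq_group_chain_alloc() allocate
 * every missing group from the leaf css up to the root, temporarily chaining
 * the new nodes through the still-unused bfqg->bfqd field; only if all
 * allocations succeed does bfq_group_chain_link() attach the chain to the
 * cgroup hierarchy, otherwise the partial chain is freed and the root group
 * is used as a fallback.
 */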
6619 ++/**
6620 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
6621 ++ * @bfqd: queue descriptor.
6622 ++ * @bfqq: the queue to move.
6623 ++ * @entity: @bfqq's entity.
6624 ++ * @bfqg: the group to move to.
6625 ++ *
6626 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
6627 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
6628 ++ *
6629 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
6630 ++ * not disappear (for now this just means that we are called under
6631 ++ * rcu_read_lock()).
6632 ++ */
6633 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6634 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
6635 ++{
6636 ++ int busy, resume;
6637 ++
6638 ++ busy = bfq_bfqq_busy(bfqq);
6639 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
6640 ++
6641 ++ BUG_ON(resume && !entity->on_st);
6642 ++ BUG_ON(busy && !resume && entity->on_st &&
6643 ++ bfqq != bfqd->in_service_queue);
6644 ++
6645 ++ if (busy) {
6646 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
6647 ++
6648 ++ if (!resume)
6649 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
6650 ++ else
6651 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
6652 ++ } else if (entity->on_st)
6653 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
6654 ++
6655 ++ /*
6656 ++ * Here we use a reference to bfqg. We don't need a refcounter
6657 ++ * as the cgroup reference will not be dropped, so that its
6658 ++ * destroy() callback will not be invoked.
6659 ++ */
6660 ++ entity->parent = bfqg->my_entity;
6661 ++ entity->sched_data = &bfqg->sched_data;
6662 ++
6663 ++ if (busy && resume)
6664 ++ bfq_activate_bfqq(bfqd, bfqq);
6665 ++
6666 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
6667 ++ bfq_schedule_dispatch(bfqd);
6668 ++}
6669 ++
6670 ++/**
6671 ++ * __bfq_bic_change_cgroup - move @bic to @css.
6672 ++ * @bfqd: the queue descriptor.
6673 ++ * @bic: the bic to move.
6674 ++ * @css: the cgroup_subsys_state to move to.
6675 ++ *
6676 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
6677 ++ * has to make sure that the reference to cgroup is valid across the call.
6678 ++ *
6679 ++ * NOTE: an alternative approach might have been to store the current
6680 ++ * cgroup in bfqq and get a reference to it, reducing the lookup
6681 ++ * time here, at the price of slightly more complex code.
6682 ++ */
6683 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
6684 ++ struct bfq_io_cq *bic,
6685 ++ struct cgroup_subsys_state *css)
6686 ++{
6687 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
6688 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
6689 ++ struct bfq_entity *entity;
6690 ++ struct bfq_group *bfqg;
6691 ++ struct bfqio_cgroup *bgrp;
6692 ++
6693 ++ bgrp = css_to_bfqio(css);
6694 ++
6695 ++ bfqg = bfq_find_alloc_group(bfqd, css);
6696 ++ if (async_bfqq != NULL) {
6697 ++ entity = &async_bfqq->entity;
6698 ++
6699 ++ if (entity->sched_data != &bfqg->sched_data) {
6700 ++ bic_set_bfqq(bic, NULL, 0);
6701 ++ bfq_log_bfqq(bfqd, async_bfqq,
6702 ++ "bic_change_group: %p %d",
6703 ++ async_bfqq, atomic_read(&async_bfqq->ref));
6704 ++ bfq_put_queue(async_bfqq);
6705 ++ }
6706 ++ }
6707 ++
6708 ++ if (sync_bfqq != NULL) {
6709 ++ entity = &sync_bfqq->entity;
6710 ++ if (entity->sched_data != &bfqg->sched_data)
6711 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
6712 ++ }
6713 ++
6714 ++ return bfqg;
6715 ++}
6716 ++
6717 ++/**
6718 ++ * bfq_bic_change_cgroup - move @bic to @css.
6719 ++ * @bic: the bic being migrated.
6720 ++ * @css: the destination cgroup_subsys_state.
6721 ++ *
6722 ++ * When the task owning @bic is moved to @css, @bic is immediately
6723 ++ * moved into its new parent group.
6724 ++ */
6725 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
6726 ++ struct cgroup_subsys_state *css)
6727 ++{
6728 ++ struct bfq_data *bfqd;
6729 ++ unsigned long uninitialized_var(flags);
6730 ++
6731 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
6732 ++ &flags);
6733 ++ if (bfqd != NULL) {
6734 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
6735 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6736 ++ }
6737 ++}
6738 ++
6739 ++/**
6740 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
6741 ++ * @bic: the @bic to update.
6742 ++ *
6743 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
6744 ++ * We need this in addition to moving bics during the cgroup attach
6745 ++ * phase because the task owning @bic could be at its first disk
6746 ++ * access or we may end up in the root cgroup as the result of a
6747 ++ * memory allocation failure and here we try to move to the right
6748 ++ * group.
6749 ++ *
6750 ++ * Must be called under the queue lock. It is safe to use the returned
6751 ++ * value even after the rcu_read_unlock() as the migration/destruction
6752 ++ * paths act under the queue lock too. IOW it is impossible to race with
6753 ++ * group migration/destruction and end up with an invalid group as:
6754 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
6755 ++ * has started execution, as current holds a reference to it,
6756 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
6757 ++ * migrated to a different cgroup] its attach() callback will have
6758 ++ * taken care of removing all the references to the old cgroup data.
6759 ++ */
6760 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
6761 ++{
6762 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
6763 ++ struct bfq_group *bfqg;
6764 ++ struct cgroup_subsys_state *css;
6765 ++
6766 ++ BUG_ON(bfqd == NULL);
6767 ++
6768 ++ rcu_read_lock();
6769 ++ css = task_css(current, bfqio_subsys_id);
6770 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
6771 ++ rcu_read_unlock();
6772 ++
6773 ++ return bfqg;
6774 ++}
6775 ++
6776 ++/**
6777 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
6778 ++ * @st: the service tree being flushed.
6779 ++ */
6780 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
6781 ++{
6782 ++ struct bfq_entity *entity = st->first_idle;
6783 ++
6784 ++ for (; entity != NULL; entity = st->first_idle)
6785 ++ __bfq_deactivate_entity(entity, 0);
6786 ++}
6787 ++
6788 ++/**
6789 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
6790 ++ * @bfqd: the device data structure with the root group.
6791 ++ * @entity: the entity to move.
6792 ++ */
6793 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
6794 ++ struct bfq_entity *entity)
6795 ++{
6796 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
6797 ++
6798 ++ BUG_ON(bfqq == NULL);
6799 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
6800 ++ return;
6801 ++}
6802 ++
6803 ++/**
6804 ++ * bfq_reparent_active_entities - move to the root group all active entities.
6805 ++ * @bfqd: the device data structure with the root group.
6806 ++ * @bfqg: the group to move from.
6807 ++ * @st: the service tree with the entities.
6808 ++ *
6809 ++ * Needs queue_lock to be taken and reference to be valid over the call.
6810 ++ */
6811 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
6812 ++ struct bfq_group *bfqg,
6813 ++ struct bfq_service_tree *st)
6814 ++{
6815 ++ struct rb_root *active = &st->active;
6816 ++ struct bfq_entity *entity = NULL;
6817 ++
6818 ++ if (!RB_EMPTY_ROOT(&st->active))
6819 ++ entity = bfq_entity_of(rb_first(active));
6820 ++
6821 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
6822 ++ bfq_reparent_leaf_entity(bfqd, entity);
6823 ++
6824 ++ if (bfqg->sched_data.in_service_entity != NULL)
6825 ++ bfq_reparent_leaf_entity(bfqd,
6826 ++ bfqg->sched_data.in_service_entity);
6827 ++
6828 ++ return;
6829 ++}
6830 ++
6831 ++/**
6832 ++ * bfq_destroy_group - destroy @bfqg.
6833 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
6834 ++ * @bfqg: the group being destroyed.
6835 ++ *
6836 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
6837 ++ */
6838 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
6839 ++{
6840 ++ struct bfq_data *bfqd;
6841 ++ struct bfq_service_tree *st;
6842 ++ struct bfq_entity *entity = bfqg->my_entity;
6843 ++ unsigned long uninitialized_var(flags);
6844 ++ int i;
6845 ++
6846 ++ hlist_del(&bfqg->group_node);
6847 ++
6848 ++ /*
6849 ++ * Empty all service_trees belonging to this group before deactivating
6850 ++ * the group itself.
6851 ++ */
6852 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
6853 ++ st = bfqg->sched_data.service_tree + i;
6854 ++
6855 ++ /*
6856 ++ * The idle tree may still contain bfq_queues belonging
6857 ++		 * to exited tasks because they never migrated to a different
6858 ++		 * cgroup from the one being destroyed now. No one else
6859 ++ * can access them so it's safe to act without any lock.
6860 ++ */
6861 ++ bfq_flush_idle_tree(st);
6862 ++
6863 ++ /*
6864 ++ * It may happen that some queues are still active
6865 ++ * (busy) upon group destruction (if the corresponding
6866 ++ * processes have been forced to terminate). We move
6867 ++ * all the leaf entities corresponding to these queues
6868 ++ * to the root_group.
6869 ++ * Also, it may happen that the group has an entity
6870 ++ * under service, which is disconnected from the active
6871 ++ * tree: it must be moved, too.
6872 ++ * There is no need to put the sync queues, as the
6873 ++ * scheduler has taken no reference.
6874 ++ */
6875 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6876 ++ if (bfqd != NULL) {
6877 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
6878 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6879 ++ }
6880 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
6881 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
6882 ++ }
6883 ++ BUG_ON(bfqg->sched_data.next_in_service != NULL);
6884 ++ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
6885 ++
6886 ++ /*
6887 ++ * We may race with device destruction, take extra care when
6888 ++ * dereferencing bfqg->bfqd.
6889 ++ */
6890 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6891 ++ if (bfqd != NULL) {
6892 ++ hlist_del(&bfqg->bfqd_node);
6893 ++ __bfq_deactivate_entity(entity, 0);
6894 ++ bfq_put_async_queues(bfqd, bfqg);
6895 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6896 ++ }
6897 ++ BUG_ON(entity->tree != NULL);
6898 ++
6899 ++ /*
6900 ++ * No need to defer the kfree() to the end of the RCU grace
6901 ++ * period: we are called from the destroy() callback of our
6902 ++	 * cgroup, so we can be sure that no one is a) still using
6903 ++ * this cgroup or b) doing lookups in it.
6904 ++ */
6905 ++ kfree(bfqg);
6906 ++}
6907 ++
6908 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
6909 ++{
6910 ++ struct hlist_node *tmp;
6911 ++ struct bfq_group *bfqg;
6912 ++
6913 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
6914 ++ bfq_end_raising_async_queues(bfqd, bfqg);
6915 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
6916 ++}
6917 ++
6918 ++/**
6919 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
6920 ++ * @bfqd: the device descriptor being exited.
6921 ++ *
6922 ++ * When the device exits we just make sure that no lookup can return
6923 ++ * the now unused group structures. They will be deallocated on cgroup
6924 ++ * destruction.
6925 ++ */
6926 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
6927 ++{
6928 ++ struct hlist_node *tmp;
6929 ++ struct bfq_group *bfqg;
6930 ++
6931 ++ bfq_log(bfqd, "disconnect_groups beginning");
6932 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
6933 ++ hlist_del(&bfqg->bfqd_node);
6934 ++
6935 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
6936 ++
6937 ++ /*
6938 ++ * Don't remove from the group hash, just set an
6939 ++ * invalid key. No lookups can race with the
6940 ++ * assignment as bfqd is being destroyed; this
6941 ++ * implies also that new elements cannot be added
6942 ++ * to the list.
6943 ++ */
6944 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
6945 ++
6946 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
6947 ++ bfqg);
6948 ++ bfq_put_async_queues(bfqd, bfqg);
6949 ++ }
6950 ++}
6951 ++
6952 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
6953 ++{
6954 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
6955 ++ struct bfq_group *bfqg = bfqd->root_group;
6956 ++
6957 ++ bfq_put_async_queues(bfqd, bfqg);
6958 ++
6959 ++ spin_lock_irq(&bgrp->lock);
6960 ++ hlist_del_rcu(&bfqg->group_node);
6961 ++ spin_unlock_irq(&bgrp->lock);
6962 ++
6963 ++ /*
6964 ++ * No need to synchronize_rcu() here: since the device is gone
6965 ++ * there cannot be any read-side access to its root_group.
6966 ++ */
6967 ++ kfree(bfqg);
6968 ++}
6969 ++
6970 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
6971 ++{
6972 ++ struct bfq_group *bfqg;
6973 ++ struct bfqio_cgroup *bgrp;
6974 ++ int i;
6975 ++
6976 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
6977 ++ if (bfqg == NULL)
6978 ++ return NULL;
6979 ++
6980 ++ bfqg->entity.parent = NULL;
6981 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6982 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6983 ++
6984 ++ bgrp = &bfqio_root_cgroup;
6985 ++ spin_lock_irq(&bgrp->lock);
6986 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
6987 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
6988 ++ spin_unlock_irq(&bgrp->lock);
6989 ++
6990 ++ return bfqg;
6991 ++}
6992 ++
6993 ++#define SHOW_FUNCTION(__VAR) \
6994 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
6995 ++ struct cftype *cftype) \
6996 ++{ \
6997 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
6998 ++ u64 ret = -ENODEV; \
6999 ++ \
7000 ++ mutex_lock(&bfqio_mutex); \
7001 ++ if (bfqio_is_removed(bgrp)) \
7002 ++ goto out_unlock; \
7003 ++ \
7004 ++ spin_lock_irq(&bgrp->lock); \
7005 ++ ret = bgrp->__VAR; \
7006 ++ spin_unlock_irq(&bgrp->lock); \
7007 ++ \
7008 ++out_unlock: \
7009 ++ mutex_unlock(&bfqio_mutex); \
7010 ++ return ret; \
7011 ++}
7012 ++
7013 ++SHOW_FUNCTION(weight);
7014 ++SHOW_FUNCTION(ioprio);
7015 ++SHOW_FUNCTION(ioprio_class);
7016 ++#undef SHOW_FUNCTION
7017 ++
7018 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
7019 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
7020 ++ struct cftype *cftype, \
7021 ++ u64 val) \
7022 ++{ \
7023 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
7024 ++ struct bfq_group *bfqg; \
7025 ++ int ret = -EINVAL; \
7026 ++ \
7027 ++ if (val < (__MIN) || val > (__MAX)) \
7028 ++ return ret; \
7029 ++ \
7030 ++ ret = -ENODEV; \
7031 ++ mutex_lock(&bfqio_mutex); \
7032 ++ if (bfqio_is_removed(bgrp)) \
7033 ++ goto out_unlock; \
7034 ++ ret = 0; \
7035 ++ \
7036 ++ spin_lock_irq(&bgrp->lock); \
7037 ++ bgrp->__VAR = (unsigned short)val; \
7038 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
7039 ++ /* \
7040 ++ * Setting the ioprio_changed flag of the entity \
7041 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
7042 ++ * the value of the weight to its ioprio mapping. \
7043 ++ * Set the flag only if necessary. \
7044 ++ */ \
7045 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
7046 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
7047 ++ smp_wmb(); \
7048 ++ bfqg->entity.ioprio_changed = 1; \
7049 ++ } \
7050 ++ } \
7051 ++ spin_unlock_irq(&bgrp->lock); \
7052 ++ \
7053 ++out_unlock: \
7054 ++ mutex_unlock(&bfqio_mutex); \
7055 ++ return ret; \
7056 ++}
7057 ++
7058 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
7059 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
7060 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
7061 ++#undef STORE_FUNCTION
7062 ++
7063 ++static struct cftype bfqio_files[] = {
7064 ++ {
7065 ++ .name = "weight",
7066 ++ .read_u64 = bfqio_cgroup_weight_read,
7067 ++ .write_u64 = bfqio_cgroup_weight_write,
7068 ++ },
7069 ++ {
7070 ++ .name = "ioprio",
7071 ++ .read_u64 = bfqio_cgroup_ioprio_read,
7072 ++ .write_u64 = bfqio_cgroup_ioprio_write,
7073 ++ },
7074 ++ {
7075 ++ .name = "ioprio_class",
7076 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
7077 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
7078 ++ },
7079 ++ { }, /* terminate */
7080 ++};
7081 ++
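/*
 * Illustration (an assumption based on the usual cgroup naming convention,
 * not something stated by this patch): with the subsystem registered as
 * "bfqio" below, the cftypes above are expected to appear in the cgroup
 * filesystem as bfqio.weight, bfqio.ioprio and bfqio.ioprio_class; writes
 * outside the [__MIN, __MAX] range passed to STORE_FUNCTION() return -EINVAL.
 */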
7082 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
7083 ++ *parent_css)
7084 ++{
7085 ++ struct bfqio_cgroup *bgrp;
7086 ++
7087 ++ if (parent_css != NULL) {
7088 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
7089 ++ if (bgrp == NULL)
7090 ++ return ERR_PTR(-ENOMEM);
7091 ++ } else
7092 ++ bgrp = &bfqio_root_cgroup;
7093 ++
7094 ++ spin_lock_init(&bgrp->lock);
7095 ++ INIT_HLIST_HEAD(&bgrp->group_data);
7096 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
7097 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
7098 ++
7099 ++ return &bgrp->css;
7100 ++}
7101 ++
7102 ++/*
7103 ++ * We cannot support shared io contexts, as we have no means to support
7104 ++ * two tasks with the same ioc in two different groups without major rework
7105 ++ * of the main bic/bfqq data structures. For now we allow a task to change
7106 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
7107 ++ * behavior is that a group containing a task that forked using CLONE_IO
7108 ++ * will not be destroyed until the tasks sharing the ioc die.
7109 ++ */
7110 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
7111 ++ struct cgroup_taskset *tset)
7112 ++{
7113 ++ struct task_struct *task;
7114 ++ struct io_context *ioc;
7115 ++ int ret = 0;
7116 ++
7117 ++ cgroup_taskset_for_each(task, css, tset) {
7118 ++ /*
7119 ++ * task_lock() is needed to avoid races with
7120 ++ * exit_io_context()
7121 ++ */
7122 ++ task_lock(task);
7123 ++ ioc = task->io_context;
7124 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
7125 ++ /*
7126 ++ * ioc == NULL means that the task is either too young
7127 ++			 * or exiting: if it still has no ioc the ioc can't be
7128 ++ * shared, if the task is exiting the attach will fail
7129 ++ * anyway, no matter what we return here.
7130 ++ */
7131 ++ ret = -EINVAL;
7132 ++ task_unlock(task);
7133 ++ if (ret)
7134 ++ break;
7135 ++ }
7136 ++
7137 ++ return ret;
7138 ++}
7139 ++
7140 ++static void bfqio_attach(struct cgroup_subsys_state *css,
7141 ++ struct cgroup_taskset *tset)
7142 ++{
7143 ++ struct task_struct *task;
7144 ++ struct io_context *ioc;
7145 ++ struct io_cq *icq;
7146 ++
7147 ++ /*
7148 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
7149 ++ * new group has not yet been tested.
7150 ++ */
7151 ++ cgroup_taskset_for_each(task, css, tset) {
7152 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
7153 ++ if (ioc) {
7154 ++ /*
7155 ++ * Handle cgroup change here.
7156 ++ */
7157 ++ rcu_read_lock();
7158 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
7159 ++ if (!strncmp(
7160 ++ icq->q->elevator->type->elevator_name,
7161 ++ "bfq", ELV_NAME_MAX))
7162 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
7163 ++ css);
7164 ++ rcu_read_unlock();
7165 ++ put_io_context(ioc);
7166 ++ }
7167 ++ }
7168 ++}
7169 ++
7170 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
7171 ++{
7172 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7173 ++ struct hlist_node *tmp;
7174 ++ struct bfq_group *bfqg;
7175 ++
7176 ++ /*
7177 ++ * Since we are destroying the cgroup, there are no more tasks
7178 ++ * referencing it, and all the RCU grace periods that may have
7179 ++ * referenced it are ended (as the destruction of the parent
7180 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
7181 ++ * anything else and we don't need any synchronization.
7182 ++ */
7183 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
7184 ++ bfq_destroy_group(bgrp, bfqg);
7185 ++
7186 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
7187 ++
7188 ++ kfree(bgrp);
7189 ++}
7190 ++
7191 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
7192 ++{
7193 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7194 ++
7195 ++ mutex_lock(&bfqio_mutex);
7196 ++ bgrp->online = true;
7197 ++ mutex_unlock(&bfqio_mutex);
7198 ++
7199 ++ return 0;
7200 ++}
7201 ++
7202 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
7203 ++{
7204 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7205 ++
7206 ++ mutex_lock(&bfqio_mutex);
7207 ++ bgrp->online = false;
7208 ++ mutex_unlock(&bfqio_mutex);
7209 ++}
7210 ++
7211 ++struct cgroup_subsys bfqio_subsys = {
7212 ++ .name = "bfqio",
7213 ++ .css_alloc = bfqio_create,
7214 ++ .css_online = bfqio_css_online,
7215 ++ .css_offline = bfqio_css_offline,
7216 ++ .can_attach = bfqio_can_attach,
7217 ++ .attach = bfqio_attach,
7218 ++ .css_free = bfqio_destroy,
7219 ++ .subsys_id = bfqio_subsys_id,
7220 ++ .base_cftypes = bfqio_files,
7221 ++};
7222 ++#else
7223 ++static inline void bfq_init_entity(struct bfq_entity *entity,
7224 ++ struct bfq_group *bfqg)
7225 ++{
7226 ++ entity->weight = entity->new_weight;
7227 ++ entity->orig_weight = entity->new_weight;
7228 ++ entity->ioprio = entity->new_ioprio;
7229 ++ entity->ioprio_class = entity->new_ioprio_class;
7230 ++ entity->sched_data = &bfqg->sched_data;
7231 ++}
7232 ++
7233 ++static inline struct bfq_group *
7234 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7235 ++{
7236 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
7237 ++ return bfqd->root_group;
7238 ++}
7239 ++
7240 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
7241 ++ struct bfq_queue *bfqq,
7242 ++ struct bfq_entity *entity,
7243 ++ struct bfq_group *bfqg)
7244 ++{
7245 ++}
7246 ++
7247 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
7248 ++{
7249 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
7250 ++}
7251 ++
7252 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
7253 ++{
7254 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
7255 ++}
7256 ++
7257 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
7258 ++{
7259 ++ kfree(bfqd->root_group);
7260 ++}
7261 ++
7262 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7263 ++{
7264 ++ struct bfq_group *bfqg;
7265 ++ int i;
7266 ++
7267 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7268 ++ if (bfqg == NULL)
7269 ++ return NULL;
7270 ++
7271 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7272 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7273 ++
7274 ++ return bfqg;
7275 ++}
7276 ++#endif
7277 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
7278 +new file mode 100644
7279 +index 0000000..7f6b000
7280 +--- /dev/null
7281 ++++ b/block/bfq-ioc.c
7282 +@@ -0,0 +1,36 @@
7283 ++/*
7284 ++ * BFQ: I/O context handling.
7285 ++ *
7286 ++ * Based on ideas and code from CFQ:
7287 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7288 ++ *
7289 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7290 ++ * Paolo Valente <paolo.valente@×××××××.it>
7291 ++ *
7292 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7293 ++ */
7294 ++
7295 ++/**
7296 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
7297 ++ * @icq: the iocontext queue.
7298 ++ */
7299 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
7300 ++{
7301 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
7302 ++ return container_of(icq, struct bfq_io_cq, icq);
7303 ++}
7304 ++
7305 ++/**
7306 ++ * bfq_bic_lookup - search @ioc for a bic associated to @bfqd.
7307 ++ * @bfqd: the lookup key.
7308 ++ * @ioc: the io_context of the process doing I/O.
7309 ++ *
7310 ++ * Queue lock must be held.
7311 ++ */
7312 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
7313 ++ struct io_context *ioc)
7314 ++{
7315 ++ if (ioc)
7316 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
7317 ++ return NULL;
7318 ++}
7319 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7320 +new file mode 100644
7321 +index 0000000..eb760de
7322 +--- /dev/null
7323 ++++ b/block/bfq-iosched.c
7324 +@@ -0,0 +1,3298 @@
7325 ++/*
7326 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
7327 ++ *
7328 ++ * Based on ideas and code from CFQ:
7329 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7330 ++ *
7331 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7332 ++ * Paolo Valente <paolo.valente@×××××××.it>
7333 ++ *
7334 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7335 ++ *
7336 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7337 ++ *
7338 ++ * BFQ is a proportional share disk scheduling algorithm based on the
7339 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
7340 ++ * number of sectors, to tasks instead of time slices. The disk is not granted
7341 ++ * to the in-service task for a given time slice, but until it has exhausted
7342 ++ * its assigned budget. This change from the time to the service domain allows
7343 ++ * BFQ to distribute the disk bandwidth among tasks as desired, without any
7344 ++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
7345 ++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
7346 ++ * their budgets (more precisely BFQ schedules queues associated to tasks).
7347 ++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
7348 ++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
7349 ++ * latencies to interactive and soft real-time applications.
7350 ++ *
7351 ++ * BFQ is described in [1], which also contains a reference to the initial,
7352 ++ * more theoretical paper on BFQ. The interested reader can find in
7353 ++ * the latter paper full details on the main algorithm as well as formulas of
7354 ++ * the guarantees, plus formal proofs of all the properties. With respect to
7355 ++ * the version of BFQ presented in these papers, this implementation adds a
7356 ++ * few more heuristics, such as the one that guarantees a low latency to soft
7357 ++ * real-time applications, and a hierarchical extension based on H-WF2Q+.
7358 ++ *
7359 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
7360 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
7361 ++ * complexity derives from the one introduced with EEVDF in [3].
7362 ++ *
7363 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
7364 ++ * with the BFQ Disk I/O Scheduler'',
7365 ++ * Proceedings of the 5th Annual International Systems and Storage
7366 ++ * Conference (SYSTOR '12), June 2012.
7367 ++ *
7368 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
7369 ++ *
7370 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
7371 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
7372 ++ * Oct 1997.
7373 ++ *
7374 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
7375 ++ *
7376 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
7377 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
7378 ++ * Resource Allocation,'' technical report.
7379 ++ *
7380 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
7381 ++ */
7382 ++#include <linux/module.h>
7383 ++#include <linux/slab.h>
7384 ++#include <linux/blkdev.h>
7385 ++#include <linux/cgroup.h>
7386 ++#include <linux/elevator.h>
7387 ++#include <linux/jiffies.h>
7388 ++#include <linux/rbtree.h>
7389 ++#include <linux/ioprio.h>
7390 ++#include "bfq.h"
7391 ++#include "blk.h"
7392 ++
7393 ++/* Max number of dispatches in one round of service. */
7394 ++static const int bfq_quantum = 4;
7395 ++
7396 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
7397 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
7398 ++
7399 ++/* Maximum backwards seek, in KiB. */
7400 ++static const int bfq_back_max = 16 * 1024;
7401 ++
7402 ++/* Penalty of a backwards seek, in number of sectors. */
7403 ++static const int bfq_back_penalty = 2;
7404 ++
7405 ++/* Idling period duration, in jiffies. */
7406 ++static int bfq_slice_idle = HZ / 125;
7407 ++
7408 ++/* Default maximum budget values, in sectors and number of requests. */
7409 ++static const int bfq_default_max_budget = 16 * 1024;
7410 ++static const int bfq_max_budget_async_rq = 4;
7411 ++
7412 ++/*
7413 ++ * Async to sync throughput distribution is controlled as follows:
7414 ++ * when an async request is served, the entity is charged the number
7415 ++ * of sectors of the request, multiplied by the factor below
7416 ++ */
7417 ++static const int bfq_async_charge_factor = 10;
7418 ++
7419 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
7420 ++static const int bfq_timeout_sync = HZ / 8;
7421 ++static int bfq_timeout_async = HZ / 25;
7422 ++
7423 ++struct kmem_cache *bfq_pool;
7424 ++
7425 ++/* Below this threshold (in ms), we consider thinktime immediate. */
7426 ++#define BFQ_MIN_TT 2
7427 ++
7428 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
7429 ++#define BFQ_HW_QUEUE_THRESHOLD 4
7430 ++#define BFQ_HW_QUEUE_SAMPLES 32
7431 ++
7432 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
7433 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
7434 ++
7435 ++/* Min samples used for peak rate estimation (for autotuning). */
7436 ++#define BFQ_PEAK_RATE_SAMPLES 32
7437 ++
7438 ++/* Shift used for peak rate fixed precision calculations. */
7439 ++#define BFQ_RATE_SHIFT 16
7440 ++
7441 ++/*
7442 ++ * The duration of the weight raising for interactive applications is
7443 ++ * computed automatically (as default behaviour), using the following
7444 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
7445 ++ * disk, and R and T are two reference parameters. In particular, R is
7446 ++ * the peak rate of a reference disk, and T is about the maximum time
7447 ++ * for starting popular large applications on that disk, under BFQ and
7448 ++ * while reading two files in parallel. Finally, BFQ uses two
7449 ++ * different pairs (R, T) depending on whether the disk is rotational
7450 ++ * or non-rotational.
7451 ++ */
7452 ++#define T_rot (msecs_to_jiffies(5500))
7453 ++#define T_nonrot (msecs_to_jiffies(2000))
7454 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
7455 ++#define R_rot 17415
7456 ++#define R_nonrot 34791
7457 ++
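/*
 * Hedged sketch of the formula above (illustrative only; the function name
 * and parameters below are made up and are not part of the patch): a device
 * whose measured peak rate equals the reference rate is assigned the full
 * reference duration (about 5500 ms rotational, 2000 ms non-rotational),
 * while a device twice as fast is assigned half of it.
 */
static inline u64 example_wr_duration(u32 peak_rate, int rotational)
{
	u64 dur = rotational ? (u64)R_rot * T_rot : (u64)R_nonrot * T_nonrot;

	do_div(dur, peak_rate);	/* duration = (R * T) / r, in jiffies */
	return dur;
}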
7458 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
7459 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
7460 ++
7461 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
7462 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
7463 ++
7464 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
7465 ++
7466 ++#include "bfq-ioc.c"
7467 ++#include "bfq-sched.c"
7468 ++#include "bfq-cgroup.c"
7469 ++
7470 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
7471 ++ IOPRIO_CLASS_IDLE)
7472 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
7473 ++ IOPRIO_CLASS_RT)
7474 ++
7475 ++#define bfq_sample_valid(samples) ((samples) > 80)
7476 ++
7477 ++/*
7478 ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit
7479 ++ * set (in which case it could also be a direct WRITE).
7480 ++ */
7481 ++static inline int bfq_bio_sync(struct bio *bio)
7482 ++{
7483 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
7484 ++ return 1;
7485 ++
7486 ++ return 0;
7487 ++}
7488 ++
7489 ++/*
7490 ++ * Scheduler run of queue, if there are requests pending and no one in the
7491 ++ * driver that will restart queueing.
7492 ++ */
7493 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
7494 ++{
7495 ++ if (bfqd->queued != 0) {
7496 ++ bfq_log(bfqd, "schedule dispatch");
7497 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
7498 ++ }
7499 ++}
7500 ++
7501 ++/*
7502 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
7503 ++ * We choose the request that is closest to the head right now. Distance
7504 ++ * behind the head is penalized and only allowed to a certain extent.
7505 ++ */
7506 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
7507 ++ struct request *rq1,
7508 ++ struct request *rq2,
7509 ++ sector_t last)
7510 ++{
7511 ++ sector_t s1, s2, d1 = 0, d2 = 0;
7512 ++ unsigned long back_max;
7513 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
7514 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
7515 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
7516 ++
7517 ++ if (rq1 == NULL || rq1 == rq2)
7518 ++ return rq2;
7519 ++ if (rq2 == NULL)
7520 ++ return rq1;
7521 ++
7522 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
7523 ++ return rq1;
7524 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
7525 ++ return rq2;
7526 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
7527 ++ return rq1;
7528 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
7529 ++ return rq2;
7530 ++
7531 ++ s1 = blk_rq_pos(rq1);
7532 ++ s2 = blk_rq_pos(rq2);
7533 ++
7534 ++ /*
7535 ++ * By definition, 1KiB is 2 sectors.
7536 ++ */
7537 ++ back_max = bfqd->bfq_back_max * 2;
7538 ++
7539 ++ /*
7540 ++ * Strict one way elevator _except_ in the case where we allow
7541 ++ * short backward seeks which are biased as twice the cost of a
7542 ++ * similar forward seek.
7543 ++ */
7544 ++ if (s1 >= last)
7545 ++ d1 = s1 - last;
7546 ++ else if (s1 + back_max >= last)
7547 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
7548 ++ else
7549 ++ wrap |= BFQ_RQ1_WRAP;
7550 ++
7551 ++ if (s2 >= last)
7552 ++ d2 = s2 - last;
7553 ++ else if (s2 + back_max >= last)
7554 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
7555 ++ else
7556 ++ wrap |= BFQ_RQ2_WRAP;
7557 ++
7558 ++ /* Found required data */
7559 ++
7560 ++ /*
7561 ++ * By doing switch() on the bit mask "wrap" we avoid having to
7562 ++ * check two variables for all permutations: --> faster!
7563 ++ */
7564 ++ switch (wrap) {
7565 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
7566 ++ if (d1 < d2)
7567 ++ return rq1;
7568 ++ else if (d2 < d1)
7569 ++ return rq2;
7570 ++ else {
7571 ++ if (s1 >= s2)
7572 ++ return rq1;
7573 ++ else
7574 ++ return rq2;
7575 ++ }
7576 ++
7577 ++ case BFQ_RQ2_WRAP:
7578 ++ return rq1;
7579 ++ case BFQ_RQ1_WRAP:
7580 ++ return rq2;
7581 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
7582 ++ default:
7583 ++ /*
7584 ++ * Since both rqs are wrapped,
7585 ++ * start with the one that's further behind head
7586 ++ * (--> only *one* back seek required),
7587 ++ * since back seek takes more time than forward.
7588 ++ */
7589 ++ if (s1 <= s2)
7590 ++ return rq1;
7591 ++ else
7592 ++ return rq2;
7593 ++ }
7594 ++}
7595 ++
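/*
 * Worked example for the backward-seek handling above (numbers follow from
 * the defaults defined earlier in this file): with bfq_back_max = 16384 KiB,
 * back_max = 32768 sectors, so a request 100 sectors behind the head is
 * still eligible but at a penalized distance of 100 * bfq_back_penalty = 200
 * sectors, while a request more than 32768 sectors behind is treated as
 * wrapped and only chosen when the alternative wraps too.
 */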
7596 ++static struct bfq_queue *
7597 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
7598 ++ sector_t sector, struct rb_node **ret_parent,
7599 ++ struct rb_node ***rb_link)
7600 ++{
7601 ++ struct rb_node **p, *parent;
7602 ++ struct bfq_queue *bfqq = NULL;
7603 ++
7604 ++ parent = NULL;
7605 ++ p = &root->rb_node;
7606 ++ while (*p) {
7607 ++ struct rb_node **n;
7608 ++
7609 ++ parent = *p;
7610 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7611 ++
7612 ++ /*
7613 ++ * Sort strictly based on sector. Smallest to the left,
7614 ++ * largest to the right.
7615 ++ */
7616 ++ if (sector > blk_rq_pos(bfqq->next_rq))
7617 ++ n = &(*p)->rb_right;
7618 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
7619 ++ n = &(*p)->rb_left;
7620 ++ else
7621 ++ break;
7622 ++ p = n;
7623 ++ bfqq = NULL;
7624 ++ }
7625 ++
7626 ++ *ret_parent = parent;
7627 ++ if (rb_link)
7628 ++ *rb_link = p;
7629 ++
7630 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
7631 ++ (long long unsigned)sector,
7632 ++ bfqq != NULL ? bfqq->pid : 0);
7633 ++
7634 ++ return bfqq;
7635 ++}
7636 ++
7637 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7638 ++{
7639 ++ struct rb_node **p, *parent;
7640 ++ struct bfq_queue *__bfqq;
7641 ++
7642 ++ if (bfqq->pos_root != NULL) {
7643 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7644 ++ bfqq->pos_root = NULL;
7645 ++ }
7646 ++
7647 ++ if (bfq_class_idle(bfqq))
7648 ++ return;
7649 ++ if (!bfqq->next_rq)
7650 ++ return;
7651 ++
7652 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
7653 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
7654 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
7655 ++ if (__bfqq == NULL) {
7656 ++ rb_link_node(&bfqq->pos_node, parent, p);
7657 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
7658 ++ } else
7659 ++ bfqq->pos_root = NULL;
7660 ++}
7661 ++
7662 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
7663 ++ struct bfq_queue *bfqq,
7664 ++ struct request *last)
7665 ++{
7666 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
7667 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
7668 ++ struct request *next = NULL, *prev = NULL;
7669 ++
7670 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
7671 ++
7672 ++ if (rbprev != NULL)
7673 ++ prev = rb_entry_rq(rbprev);
7674 ++
7675 ++ if (rbnext != NULL)
7676 ++ next = rb_entry_rq(rbnext);
7677 ++ else {
7678 ++ rbnext = rb_first(&bfqq->sort_list);
7679 ++ if (rbnext && rbnext != &last->rb_node)
7680 ++ next = rb_entry_rq(rbnext);
7681 ++ }
7682 ++
7683 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
7684 ++}
7685 ++
7686 ++static void bfq_del_rq_rb(struct request *rq)
7687 ++{
7688 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7689 ++ struct bfq_data *bfqd = bfqq->bfqd;
7690 ++ const int sync = rq_is_sync(rq);
7691 ++
7692 ++ BUG_ON(bfqq->queued[sync] == 0);
7693 ++ bfqq->queued[sync]--;
7694 ++ bfqd->queued--;
7695 ++
7696 ++ elv_rb_del(&bfqq->sort_list, rq);
7697 ++
7698 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
7699 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
7700 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
7701 ++ /*
7702 ++ * Remove queue from request-position tree as it is empty.
7703 ++ */
7704 ++ if (bfqq->pos_root != NULL) {
7705 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7706 ++ bfqq->pos_root = NULL;
7707 ++ }
7708 ++ }
7709 ++}
7710 ++
7711 ++/* see the definition of bfq_async_charge_factor for details */
7712 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
7713 ++ struct bfq_queue *bfqq)
7714 ++{
7715 ++ return blk_rq_sectors(rq) *
7716 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
7717 ++ bfq_async_charge_factor));
7718 ++}
7719 ++
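/*
 * Worked example for the charging rule above (derived directly from the
 * expression in bfq_serv_to_charge()): a sync request is charged exactly its
 * size in sectors, while an async request from a queue that is not being
 * weight-raised (raising_coeff == 1) is charged (1 + bfq_async_charge_factor)
 * = 11 times its size, which is how the async-to-sync throughput distribution
 * described near the top of this file is enforced.
 */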
7720 ++/**
7721 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
7722 ++ * @bfqd: the device data the queue belongs to.
7723 ++ * @bfqq: the queue to update.
7724 ++ *
7725 ++ * If the first request of a queue changes we make sure that the queue
7726 ++ * has enough budget to serve at least its first request (if the
7727 ++ * request has grown). We do this because if the queue does not have enough
7728 ++ * budget for its first request, it has to go through two dispatch
7729 ++ * rounds to actually get it dispatched.
7730 ++ */
7731 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
7732 ++ struct bfq_queue *bfqq)
7733 ++{
7734 ++ struct bfq_entity *entity = &bfqq->entity;
7735 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
7736 ++ struct request *next_rq = bfqq->next_rq;
7737 ++ unsigned long new_budget;
7738 ++
7739 ++ if (next_rq == NULL)
7740 ++ return;
7741 ++
7742 ++ if (bfqq == bfqd->in_service_queue)
7743 ++ /*
7744 ++ * In order not to break guarantees, budgets cannot be
7745 ++ * changed after an entity has been selected.
7746 ++ */
7747 ++ return;
7748 ++
7749 ++ BUG_ON(entity->tree != &st->active);
7750 ++ BUG_ON(entity == entity->sched_data->in_service_entity);
7751 ++
7752 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
7753 ++ bfq_serv_to_charge(next_rq, bfqq));
7754 ++ entity->budget = new_budget;
7755 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
7756 ++ bfq_activate_bfqq(bfqd, bfqq);
7757 ++}
7758 ++
7759 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
7760 ++{
7761 ++ u64 dur;
7762 ++
7763 ++ if (bfqd->bfq_raising_max_time > 0)
7764 ++ return bfqd->bfq_raising_max_time;
7765 ++
7766 ++ dur = bfqd->RT_prod;
7767 ++ do_div(dur, bfqd->peak_rate);
7768 ++
7769 ++ return dur;
7770 ++}
7771 ++
7772 ++static void bfq_add_rq_rb(struct request *rq)
7773 ++{
7774 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7775 ++ struct bfq_entity *entity = &bfqq->entity;
7776 ++ struct bfq_data *bfqd = bfqq->bfqd;
7777 ++ struct request *next_rq, *prev;
7778 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
7779 ++ int idle_for_long_time = 0;
7780 ++
7781 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
7782 ++ bfqq->queued[rq_is_sync(rq)]++;
7783 ++ bfqd->queued++;
7784 ++
7785 ++ elv_rb_add(&bfqq->sort_list, rq);
7786 ++
7787 ++ /*
7788 ++ * Check if this request is a better next-serve candidate.
7789 ++ */
7790 ++ prev = bfqq->next_rq;
7791 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
7792 ++ BUG_ON(next_rq == NULL);
7793 ++ bfqq->next_rq = next_rq;
7794 ++
7795 ++ /*
7796 ++ * Adjust priority tree position, if next_rq changes.
7797 ++ */
7798 ++ if (prev != bfqq->next_rq)
7799 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
7800 ++
7801 ++ if (!bfq_bfqq_busy(bfqq)) {
7802 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
7803 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
7804 ++ idle_for_long_time = time_is_before_jiffies(
7805 ++ bfqq->budget_timeout +
7806 ++ bfqd->bfq_raising_min_idle_time);
7807 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
7808 ++ bfq_serv_to_charge(next_rq, bfqq));
7809 ++
7810 ++ if (!bfqd->low_latency)
7811 ++ goto add_bfqq_busy;
7812 ++
7813 ++ /*
7814 ++ * If the queue is not being boosted and has been idle
7815 ++ * for enough time, start a weight-raising period
7816 ++ */
7817 ++ if (old_raising_coeff == 1 &&
7818 ++ (idle_for_long_time || soft_rt)) {
7819 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7820 ++ if (idle_for_long_time)
7821 ++ bfqq->raising_cur_max_time =
7822 ++ bfq_wrais_duration(bfqd);
7823 ++ else
7824 ++ bfqq->raising_cur_max_time =
7825 ++ bfqd->bfq_raising_rt_max_time;
7826 ++ bfq_log_bfqq(bfqd, bfqq,
7827 ++ "wrais starting at %lu, "
7828 ++ "rais_max_time %u",
7829 ++ jiffies,
7830 ++ jiffies_to_msecs(bfqq->
7831 ++ raising_cur_max_time));
7832 ++ } else if (old_raising_coeff > 1) {
7833 ++ if (idle_for_long_time)
7834 ++ bfqq->raising_cur_max_time =
7835 ++ bfq_wrais_duration(bfqd);
7836 ++ else if (bfqq->raising_cur_max_time ==
7837 ++ bfqd->bfq_raising_rt_max_time &&
7838 ++ !soft_rt) {
7839 ++ bfqq->raising_coeff = 1;
7840 ++ bfq_log_bfqq(bfqd, bfqq,
7841 ++ "wrais ending at %lu, "
7842 ++ "rais_max_time %u",
7843 ++ jiffies,
7844 ++ jiffies_to_msecs(bfqq->
7845 ++ raising_cur_max_time));
7846 ++ } else if (time_before(
7847 ++ bfqq->last_rais_start_finish +
7848 ++ bfqq->raising_cur_max_time,
7849 ++ jiffies +
7850 ++ bfqd->bfq_raising_rt_max_time) &&
7851 ++ soft_rt) {
7852 ++ /*
7853 ++ *
7854 ++ * The remaining weight-raising time is lower
7855 ++ * than bfqd->bfq_raising_rt_max_time, which
7856 ++ * means that the application is enjoying
7857 ++ * weight raising either because deemed soft rt
7858 ++ * in the near past, or because deemed
7859 ++				 * interactive long ago. In both cases,
7860 ++ * resetting now the current remaining weight-
7861 ++ * raising time for the application to the
7862 ++ * weight-raising duration for soft rt
7863 ++ * applications would not cause any latency
7864 ++ * increase for the application (as the new
7865 ++ * duration would be higher than the remaining
7866 ++ * time).
7867 ++ *
7868 ++ * In addition, the application is now meeting
7869 ++ * the requirements for being deemed soft rt.
7870 ++ * In the end we can correctly and safely
7871 ++ * (re)charge the weight-raising duration for
7872 ++ * the application with the weight-raising
7873 ++ * duration for soft rt applications.
7874 ++ *
7875 ++ * In particular, doing this recharge now, i.e.,
7876 ++ * before the weight-raising period for the
7877 ++ * application finishes, reduces the probability
7878 ++ * of the following negative scenario:
7879 ++ * 1) the weight of a soft rt application is
7880 ++ * raised at startup (as for any newly
7881 ++ * created application),
7882 ++ * 2) since the application is not interactive,
7883 ++ * at a certain time weight-raising is
7884 ++ * stopped for the application,
7885 ++ * 3) at that time the application happens to
7886 ++ * still have pending requests, and hence
7887 ++ * is destined to not have a chance to be
7888 ++ * deemed soft rt before these requests are
7889 ++ * completed (see the comments to the
7890 ++ * function bfq_bfqq_softrt_next_start()
7891 ++ * for details on soft rt detection),
7892 ++ * 4) these pending requests experience a high
7893 ++ * latency because the application is not
7894 ++ * weight-raised while they are pending.
7895 ++ */
7896 ++ bfqq->last_rais_start_finish = jiffies;
7897 ++ bfqq->raising_cur_max_time =
7898 ++ bfqd->bfq_raising_rt_max_time;
7899 ++ }
7900 ++ }
7901 ++ if (old_raising_coeff != bfqq->raising_coeff)
7902 ++ entity->ioprio_changed = 1;
7903 ++add_bfqq_busy:
7904 ++ bfqq->last_idle_bklogged = jiffies;
7905 ++ bfqq->service_from_backlogged = 0;
7906 ++ bfq_clear_bfqq_softrt_update(bfqq);
7907 ++ bfq_add_bfqq_busy(bfqd, bfqq);
7908 ++ } else {
7909 ++ if (bfqd->low_latency && old_raising_coeff == 1 &&
7910 ++ !rq_is_sync(rq) &&
7911 ++ time_is_before_jiffies(
7912 ++ bfqq->last_rais_start_finish +
7913 ++ bfqd->bfq_raising_min_inter_arr_async)) {
7914 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7915 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
7916 ++
7917 ++ bfqd->raised_busy_queues++;
7918 ++ entity->ioprio_changed = 1;
7919 ++ bfq_log_bfqq(bfqd, bfqq,
7920 ++ "non-idle wrais starting at %lu, "
7921 ++ "rais_max_time %u",
7922 ++ jiffies,
7923 ++ jiffies_to_msecs(bfqq->
7924 ++ raising_cur_max_time));
7925 ++ }
7926 ++ bfq_updated_next_req(bfqd, bfqq);
7927 ++ }
7928 ++
7929 ++ if (bfqd->low_latency &&
7930 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
7931 ++ idle_for_long_time))
7932 ++ bfqq->last_rais_start_finish = jiffies;
7933 ++}
7934 ++
7935 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
7936 ++{
7937 ++ elv_rb_del(&bfqq->sort_list, rq);
7938 ++ bfqq->queued[rq_is_sync(rq)]--;
7939 ++ bfqq->bfqd->queued--;
7940 ++ bfq_add_rq_rb(rq);
7941 ++}
7942 ++
7943 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
7944 ++ struct bio *bio)
7945 ++{
7946 ++ struct task_struct *tsk = current;
7947 ++ struct bfq_io_cq *bic;
7948 ++ struct bfq_queue *bfqq;
7949 ++
7950 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
7951 ++ if (bic == NULL)
7952 ++ return NULL;
7953 ++
7954 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7955 ++ if (bfqq != NULL)
7956 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
7957 ++
7958 ++ return NULL;
7959 ++}
7960 ++
7961 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
7962 ++{
7963 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7964 ++
7965 ++ bfqd->rq_in_driver++;
7966 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
7967 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
7968 ++ (long long unsigned)bfqd->last_position);
7969 ++}
7970 ++
7971 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
7972 ++{
7973 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7974 ++
7975 ++ WARN_ON(bfqd->rq_in_driver == 0);
7976 ++ bfqd->rq_in_driver--;
7977 ++}
7978 ++
7979 ++static void bfq_remove_request(struct request *rq)
7980 ++{
7981 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7982 ++ struct bfq_data *bfqd = bfqq->bfqd;
7983 ++
7984 ++ if (bfqq->next_rq == rq) {
7985 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
7986 ++ bfq_updated_next_req(bfqd, bfqq);
7987 ++ }
7988 ++
7989 ++ list_del_init(&rq->queuelist);
7990 ++ bfq_del_rq_rb(rq);
7991 ++
7992 ++ if (rq->cmd_flags & REQ_META) {
7993 ++ WARN_ON(bfqq->meta_pending == 0);
7994 ++ bfqq->meta_pending--;
7995 ++ }
7996 ++}
7997 ++
7998 ++static int bfq_merge(struct request_queue *q, struct request **req,
7999 ++ struct bio *bio)
8000 ++{
8001 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8002 ++ struct request *__rq;
8003 ++
8004 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
8005 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
8006 ++ *req = __rq;
8007 ++ return ELEVATOR_FRONT_MERGE;
8008 ++ }
8009 ++
8010 ++ return ELEVATOR_NO_MERGE;
8011 ++}
8012 ++
8013 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
8014 ++ int type)
8015 ++{
8016 ++ if (type == ELEVATOR_FRONT_MERGE) {
8017 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
8018 ++
8019 ++ bfq_reposition_rq_rb(bfqq, req);
8020 ++ }
8021 ++}
8022 ++
8023 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
8024 ++ struct request *next)
8025 ++{
8026 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8027 ++
8028 ++ /*
8029 ++ * Reposition in fifo if next is older than rq.
8030 ++ */
8031 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
8032 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
8033 ++ list_move(&rq->queuelist, &next->queuelist);
8034 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
8035 ++ }
8036 ++
8037 ++ if (bfqq->next_rq == next)
8038 ++ bfqq->next_rq = rq;
8039 ++
8040 ++ bfq_remove_request(next);
8041 ++}
8042 ++
8043 ++/* Must be called with bfqq != NULL */
8044 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
8045 ++{
8046 ++ BUG_ON(bfqq == NULL);
8047 ++ if (bfq_bfqq_busy(bfqq))
8048 ++ bfqq->bfqd->raised_busy_queues--;
8049 ++ bfqq->raising_coeff = 1;
8050 ++ bfqq->raising_cur_max_time = 0;
8051 ++ /* Trigger a weight change on the next activation of the queue */
8052 ++ bfqq->entity.ioprio_changed = 1;
8053 ++}
8054 ++
8055 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
8056 ++ struct bfq_group *bfqg)
8057 ++{
8058 ++ int i, j;
8059 ++
8060 ++ for (i = 0; i < 2; i++)
8061 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
8062 ++ if (bfqg->async_bfqq[i][j] != NULL)
8063 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
8064 ++ if (bfqg->async_idle_bfqq != NULL)
8065 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
8066 ++}
8067 ++
8068 ++static void bfq_end_raising(struct bfq_data *bfqd)
8069 ++{
8070 ++ struct bfq_queue *bfqq;
8071 ++
8072 ++ spin_lock_irq(bfqd->queue->queue_lock);
8073 ++
8074 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
8075 ++ bfq_bfqq_end_raising(bfqq);
8076 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
8077 ++ bfq_bfqq_end_raising(bfqq);
8078 ++ bfq_end_raising_async(bfqd);
8079 ++
8080 ++ spin_unlock_irq(bfqd->queue->queue_lock);
8081 ++}
8082 ++
8083 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
8084 ++ struct bio *bio)
8085 ++{
8086 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8087 ++ struct bfq_io_cq *bic;
8088 ++ struct bfq_queue *bfqq;
8089 ++
8090 ++ /*
8091 ++ * Disallow merge of a sync bio into an async request.
8092 ++ */
8093 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
8094 ++ return 0;
8095 ++
8096 ++ /*
8097 ++ * Lookup the bfqq that this bio will be queued with. Allow
8098 ++ * merge only if rq is queued there.
8099 ++ * Queue lock is held here.
8100 ++ */
8101 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
8102 ++ if (bic == NULL)
8103 ++ return 0;
8104 ++
8105 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8106 ++ return bfqq == RQ_BFQQ(rq);
8107 ++}
8108 ++
8109 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
8110 ++ struct bfq_queue *bfqq)
8111 ++{
8112 ++ if (bfqq != NULL) {
8113 ++ bfq_mark_bfqq_must_alloc(bfqq);
8114 ++ bfq_mark_bfqq_budget_new(bfqq);
8115 ++ bfq_clear_bfqq_fifo_expire(bfqq);
8116 ++
8117 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
8118 ++
8119 ++ bfq_log_bfqq(bfqd, bfqq,
8120 ++ "set_in_service_queue, cur-budget = %lu",
8121 ++ bfqq->entity.budget);
8122 ++ }
8123 ++
8124 ++ bfqd->in_service_queue = bfqq;
8125 ++}
8126 ++
8127 ++/*
8128 ++ * Get and set a new queue for service.
8129 ++ */
8130 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
8131 ++ struct bfq_queue *bfqq)
8132 ++{
8133 ++ if (!bfqq)
8134 ++ bfqq = bfq_get_next_queue(bfqd);
8135 ++ else
8136 ++ bfq_get_next_queue_forced(bfqd, bfqq);
8137 ++
8138 ++ __bfq_set_in_service_queue(bfqd, bfqq);
8139 ++ return bfqq;
8140 ++}
8141 ++
8142 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
8143 ++ struct request *rq)
8144 ++{
8145 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
8146 ++ return blk_rq_pos(rq) - bfqd->last_position;
8147 ++ else
8148 ++ return bfqd->last_position - blk_rq_pos(rq);
8149 ++}
8150 ++
8151 ++/*
8152 ++ * Return true if bfqq has no request pending and rq is close enough to
8153 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
8154 ++ * bfqq->next_rq
8155 ++ */
8156 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
8157 ++{
8158 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
8159 ++}
8160 ++
8161 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
8162 ++{
8163 ++ struct rb_root *root = &bfqd->rq_pos_tree;
8164 ++ struct rb_node *parent, *node;
8165 ++ struct bfq_queue *__bfqq;
8166 ++ sector_t sector = bfqd->last_position;
8167 ++
8168 ++ if (RB_EMPTY_ROOT(root))
8169 ++ return NULL;
8170 ++
8171 ++ /*
8172 ++ * First, if we find a request starting at the end of the last
8173 ++ * request, choose it.
8174 ++ */
8175 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8176 ++ if (__bfqq != NULL)
8177 ++ return __bfqq;
8178 ++
8179 ++ /*
8180 ++ * If the exact sector wasn't found, the parent of the NULL leaf
8181 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
8182 ++ * position).
8183 ++ */
8184 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8185 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8186 ++ return __bfqq;
8187 ++
8188 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
8189 ++ node = rb_next(&__bfqq->pos_node);
8190 ++ else
8191 ++ node = rb_prev(&__bfqq->pos_node);
8192 ++ if (node == NULL)
8193 ++ return NULL;
8194 ++
8195 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8196 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8197 ++ return __bfqq;
8198 ++
8199 ++ return NULL;
8200 ++}
8201 ++
8202 ++/*
8203 ++ * bfqd - obvious
8204 ++ * cur_bfqq - passed in so that we don't decide that the current queue
8205 ++ * is closely cooperating with itself.
8206 ++ *
8207 ++ * We are assuming that cur_bfqq has dispatched at least one request,
8208 ++ * and that bfqd->last_position reflects a position on the disk associated
8209 ++ * with the I/O issued by cur_bfqq.
8210 ++ */
8211 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
8212 ++ struct bfq_queue *cur_bfqq)
8213 ++{
8214 ++ struct bfq_queue *bfqq;
8215 ++
8216 ++ if (bfq_class_idle(cur_bfqq))
8217 ++ return NULL;
8218 ++ if (!bfq_bfqq_sync(cur_bfqq))
8219 ++ return NULL;
8220 ++ if (BFQQ_SEEKY(cur_bfqq))
8221 ++ return NULL;
8222 ++
8223 ++ /* If device has only one backlogged bfq_queue, don't search. */
8224 ++ if (bfqd->busy_queues == 1)
8225 ++ return NULL;
8226 ++
8227 ++ /*
8228 ++ * We should notice if some of the queues are cooperating, e.g.
8229 ++ * working closely on the same area of the disk. In that case,
8230 ++ * we can group them together and not waste time idling.
8231 ++ */
8232 ++ bfqq = bfqq_close(bfqd);
8233 ++ if (bfqq == NULL || bfqq == cur_bfqq)
8234 ++ return NULL;
8235 ++
8236 ++ /*
8237 ++ * Do not merge queues from different bfq_groups.
8238 ++ */
8239 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
8240 ++ return NULL;
8241 ++
8242 ++ /*
8243 ++ * It only makes sense to merge sync queues.
8244 ++ */
8245 ++ if (!bfq_bfqq_sync(bfqq))
8246 ++ return NULL;
8247 ++ if (BFQQ_SEEKY(bfqq))
8248 ++ return NULL;
8249 ++
8250 ++ /*
8251 ++ * Do not merge queues of different priority classes.
8252 ++ */
8253 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
8254 ++ return NULL;
8255 ++
8256 ++ return bfqq;
8257 ++}
8258 ++
8259 ++/*
8260 ++ * If enough samples have been computed, return the current max budget
8261 ++ * stored in bfqd, which is dynamically updated according to the
8262 ++ * estimated disk peak rate; otherwise return the default max budget
8263 ++ */
8264 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
8265 ++{
8266 ++ if (bfqd->budgets_assigned < 194)
8267 ++ return bfq_default_max_budget;
8268 ++ else
8269 ++ return bfqd->bfq_max_budget;
8270 ++}
8271 ++
8272 ++/*
8273 ++ * Return min budget, which is a fraction of the current or default
8274 ++ * max budget (trying with 1/32)
8275 ++ */
8276 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
8277 ++{
8278 ++ if (bfqd->budgets_assigned < 194)
8279 ++ return bfq_default_max_budget / 32;
8280 ++ else
8281 ++ return bfqd->bfq_max_budget / 32;
8282 ++}
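
The two helpers above can be exercised in isolation. A minimal user-space sketch, assuming the same 194-sample threshold and 1/32 fraction as the patch; the struct, helper names and the default value below are illustrative stand-ins, not the patch's own definitions.

#include <stdio.h>

/* Hypothetical stand-ins for the scheduler state (illustration only). */
static const unsigned long default_max_budget = 16 * 1024; /* assumed value */

struct fake_bfqd {
        int budgets_assigned;          /* how many budgets have been handed out */
        unsigned long bfq_max_budget;  /* autotuned estimate */
};

/* Until ~194 budgets have been assigned, fall back to the default. */
static unsigned long max_budget(const struct fake_bfqd *d)
{
        return d->budgets_assigned < 194 ? default_max_budget
                                         : d->bfq_max_budget;
}

/* The minimum budget is 1/32 of whichever maximum is in effect. */
static unsigned long min_budget(const struct fake_bfqd *d)
{
        return max_budget(d) / 32;
}

int main(void)
{
        struct fake_bfqd d = { .budgets_assigned = 200, .bfq_max_budget = 32768 };

        printf("max=%lu min=%lu\n", max_budget(&d), min_budget(&d));
        return 0;
}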
8283 ++
8284 ++/*
8285 ++ * Decides whether idling should be done for given device and
8286 ++ * given in-service queue.
8287 ++ */
8288 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
8289 ++ struct bfq_queue *in_service_bfqq)
8290 ++{
8291 ++ if (in_service_bfqq == NULL)
8292 ++ return false;
8293 ++ /*
8294 ++ * If the device is non-rotational, and hence has no seek penalty,
8295 ++ * disable idling; but do so only if:
8296 ++ * - device does not support queuing, otherwise we still have
8297 ++ * a problem with sync vs async workloads;
8298 ++ * - the queue is not weight-raised, to preserve guarantees.
8299 ++ */
8300 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
8301 ++ in_service_bfqq->raising_coeff == 1);
8302 ++}
8303 ++
8304 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
8305 ++{
8306 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
8307 ++ struct bfq_io_cq *bic;
8308 ++ unsigned long sl;
8309 ++
8310 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
8311 ++
8312 ++ /* Tasks have exited, don't wait. */
8313 ++ bic = bfqd->in_service_bic;
8314 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
8315 ++ return;
8316 ++
8317 ++ bfq_mark_bfqq_wait_request(bfqq);
8318 ++
8319 ++ /*
8320 ++ * We don't want to idle for seeks, but we do want to allow
8321 ++ * fair distribution of slice time for a process doing back-to-back
8322 ++ * seeks. So allow a little bit of time for it to submit a new rq.
8323 ++ *
8324 ++ * To prevent processes with (partly) seeky workloads from
8325 ++ * being too ill-treated, grant them a small fraction of the
8326 ++ * assigned budget before reducing the waiting time to
8327 ++ * BFQ_MIN_TT. This happened to help reduce latency.
8328 ++ */
8329 ++ sl = bfqd->bfq_slice_idle;
8330 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
8331 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
8332 ++ bfqq->raising_coeff == 1)
8333 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
8334 ++ else if (bfqq->raising_coeff > 1)
8335 ++ sl = sl * 3;
8336 ++ bfqd->last_idling_start = ktime_get();
8337 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
8338 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
8339 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
8340 ++}
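
The slice-length adjustment in bfq_arm_slice_timer() boils down to a small piece of arithmetic. A sketch in milliseconds, assuming BFQ_MIN_TT corresponds to 2 ms; all parameter names below are hypothetical.

#include <stdio.h>

#define BFQ_MIN_TT_MS 2 /* assumed to match BFQ_MIN_TT in the patch */

/*
 * Millisecond version of the idle-slice computation: seeky, non-raised
 * queues that already consumed more than 1/8 of the max budget only get
 * the minimum think time, while weight-raised queues get a slice three
 * times as long.
 */
static unsigned int idle_slice_ms(unsigned int slice_idle_ms,
                                  int seeky, unsigned long service,
                                  unsigned long max_budget,
                                  unsigned int raising_coeff)
{
        unsigned int sl = slice_idle_ms;

        if (seeky && service > max_budget / 8 && raising_coeff == 1)
                sl = sl < BFQ_MIN_TT_MS ? sl : BFQ_MIN_TT_MS;
        else if (raising_coeff > 1)
                sl = sl * 3;
        return sl;
}

int main(void)
{
        printf("%u\n", idle_slice_ms(8, 1, 4096, 16384, 1));  /* -> 2  */
        printf("%u\n", idle_slice_ms(8, 0, 4096, 16384, 10)); /* -> 24 */
        return 0;
}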
8341 ++
8342 ++/*
8343 ++ * Set the maximum time for the in-service queue to consume its
8344 ++ * budget. This prevents seeky processes from lowering the disk
8345 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
8346 ++ */
8347 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
8348 ++{
8349 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
8350 ++ unsigned int timeout_coeff;
8351 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
8352 ++ timeout_coeff = 1;
8353 ++ else
8354 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
8355 ++
8356 ++ bfqd->last_budget_start = ktime_get();
8357 ++
8358 ++ bfq_clear_bfqq_budget_new(bfqq);
8359 ++ bfqq->budget_timeout = jiffies +
8360 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
8361 ++
8362 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
8363 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
8364 ++ timeout_coeff));
8365 ++}
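
A small worked example of the timeout scaling above: outside the soft-rt raising period, the budget timeout is stretched by weight / orig_weight (i.e. by the raising coefficient). Values and names are purely illustrative.

#include <stdio.h>

/*
 * Illustrative recomputation of the budget timeout set in
 * bfq_set_budget_timeout(): sync timeout (in jiffies) multiplied by
 * weight / orig_weight unless the queue is in its soft-rt raising period.
 */
static unsigned long budget_timeout(unsigned long now_jiffies,
                                    unsigned long sync_timeout_jiffies,
                                    unsigned long weight,
                                    unsigned long orig_weight,
                                    int in_soft_rt_raising)
{
        unsigned long coeff = in_soft_rt_raising ? 1 : weight / orig_weight;

        return now_jiffies + sync_timeout_jiffies * coeff;
}

int main(void)
{
        /* e.g. a raising coefficient of 20 -> weight is 20x the original */
        printf("%lu\n", budget_timeout(1000, 125, 2000, 100, 0)); /* 3500 */
        printf("%lu\n", budget_timeout(1000, 125, 2000, 100, 1)); /* 1125 */
        return 0;
}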
8366 ++
8367 ++/*
8368 ++ * Move request from internal lists to the request queue dispatch list.
8369 ++ */
8370 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
8371 ++{
8372 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8373 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8374 ++
8375 ++ bfq_remove_request(rq);
8376 ++ bfqq->dispatched++;
8377 ++ elv_dispatch_sort(q, rq);
8378 ++
8379 ++ if (bfq_bfqq_sync(bfqq))
8380 ++ bfqd->sync_flight++;
8381 ++}
8382 ++
8383 ++/*
8384 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
8385 ++ */
8386 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
8387 ++{
8388 ++ struct request *rq = NULL;
8389 ++
8390 ++ if (bfq_bfqq_fifo_expire(bfqq))
8391 ++ return NULL;
8392 ++
8393 ++ bfq_mark_bfqq_fifo_expire(bfqq);
8394 ++
8395 ++ if (list_empty(&bfqq->fifo))
8396 ++ return NULL;
8397 ++
8398 ++ rq = rq_entry_fifo(bfqq->fifo.next);
8399 ++
8400 ++ if (time_before(jiffies, rq_fifo_time(rq)))
8401 ++ return NULL;
8402 ++
8403 ++ return rq;
8404 ++}
8405 ++
8406 ++/*
8407 ++ * Must be called with the queue_lock held.
8408 ++ */
8409 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
8410 ++{
8411 ++ int process_refs, io_refs;
8412 ++
8413 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
8414 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
8415 ++ BUG_ON(process_refs < 0);
8416 ++ return process_refs;
8417 ++}
8418 ++
8419 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
8420 ++{
8421 ++ int process_refs, new_process_refs;
8422 ++ struct bfq_queue *__bfqq;
8423 ++
8424 ++ /*
8425 ++ * If there are no process references on the new_bfqq, then it is
8426 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
8427 ++ * may have dropped their last reference (not just their last process
8428 ++ * reference).
8429 ++ */
8430 ++ if (!bfqq_process_refs(new_bfqq))
8431 ++ return;
8432 ++
8433 ++ /* Avoid a circular list and skip interim queue merges. */
8434 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
8435 ++ if (__bfqq == bfqq)
8436 ++ return;
8437 ++ new_bfqq = __bfqq;
8438 ++ }
8439 ++
8440 ++ process_refs = bfqq_process_refs(bfqq);
8441 ++ new_process_refs = bfqq_process_refs(new_bfqq);
8442 ++ /*
8443 ++ * If the process for the bfqq has gone away, there is no
8444 ++ * sense in merging the queues.
8445 ++ */
8446 ++ if (process_refs == 0 || new_process_refs == 0)
8447 ++ return;
8448 ++
8449 ++ /*
8450 ++ * Merge in the direction of the lesser amount of work.
8451 ++ */
8452 ++ if (new_process_refs >= process_refs) {
8453 ++ bfqq->new_bfqq = new_bfqq;
8454 ++ atomic_add(process_refs, &new_bfqq->ref);
8455 ++ } else {
8456 ++ new_bfqq->new_bfqq = bfqq;
8457 ++ atomic_add(new_process_refs, &bfqq->ref);
8458 ++ }
8459 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
8460 ++ new_bfqq->pid);
8461 ++}
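
The merge-direction rule above ("merge in the direction of the lesser amount of work") can be sketched with a toy queue type; the struct below is a hypothetical stand-in for the reference counting of struct bfq_queue.

#include <stdio.h>

struct fake_queue {
        int process_refs;              /* refs held by processes only */
        struct fake_queue *new_queue;  /* merge target, as bfqq->new_bfqq */
};

/* Point the queue with fewer process references at the other one. */
static void setup_merge(struct fake_queue *a, struct fake_queue *b)
{
        if (a->process_refs == 0 || b->process_refs == 0)
                return; /* one side has gone away: merging makes no sense */

        if (b->process_refs >= a->process_refs)
                a->new_queue = b;
        else
                b->new_queue = a;
}

int main(void)
{
        struct fake_queue a = { 1, NULL }, b = { 3, NULL };

        setup_merge(&a, &b);
        printf("a merges into b: %s\n", a.new_queue == &b ? "yes" : "no");
        return 0;
}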
8462 ++
8463 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
8464 ++{
8465 ++ struct bfq_entity *entity = &bfqq->entity;
8466 ++ return entity->budget - entity->service;
8467 ++}
8468 ++
8469 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8470 ++{
8471 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8472 ++
8473 ++ __bfq_bfqd_reset_in_service(bfqd);
8474 ++
8475 ++ /*
8476 ++ * If this bfqq is shared between multiple processes, check
8477 ++ * to make sure that those processes are still issuing I/Os
8478 ++ * within the mean seek distance. If not, it may be time to
8479 ++ * break the queues apart again.
8480 ++ */
8481 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
8482 ++ bfq_mark_bfqq_split_coop(bfqq);
8483 ++
8484 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8485 ++ /*
8486 ++ * overloading the budget_timeout field to store the
8487 ++ * time at which the queue is left with no backlog; used by
8488 ++ * the weight-raising mechanism
8489 ++ */
8490 ++ bfqq->budget_timeout = jiffies;
8491 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8492 ++ } else {
8493 ++ bfq_activate_bfqq(bfqd, bfqq);
8494 ++ /*
8495 ++ * Resort priority tree of potential close cooperators.
8496 ++ */
8497 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
8498 ++ }
8499 ++}
8500 ++
8501 ++/**
8502 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
8503 ++ * @bfqd: device data.
8504 ++ * @bfqq: queue to update.
8505 ++ * @reason: reason for expiration.
8506 ++ *
8507 ++ * Handle the feedback on @bfqq budget. See the body for detailed
8508 ++ * comments.
8509 ++ */
8510 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
8511 ++ struct bfq_queue *bfqq,
8512 ++ enum bfqq_expiration reason)
8513 ++{
8514 ++ struct request *next_rq;
8515 ++ unsigned long budget, min_budget;
8516 ++
8517 ++ budget = bfqq->max_budget;
8518 ++ min_budget = bfq_min_budget(bfqd);
8519 ++
8520 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8521 ++
8522 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
8523 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
8524 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
8525 ++ budget, bfq_min_budget(bfqd));
8526 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
8527 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
8528 ++
8529 ++ if (bfq_bfqq_sync(bfqq)) {
8530 ++ switch (reason) {
8531 ++ /*
8532 ++ * Caveat: in all the following cases we trade latency
8533 ++ * for throughput.
8534 ++ */
8535 ++ case BFQ_BFQQ_TOO_IDLE:
8536 ++ /*
8537 ++ * This is the only case where we may reduce
8538 ++ * the budget: if there are no requests of the
8539 ++ * process still waiting for completion, then
8540 ++ * we assume (tentatively) that the timer has
8541 ++ * expired because the batch of requests of
8542 ++ * the process could have been served with a
8543 ++ * smaller budget. Hence, betting that the
8544 ++ * process will behave in the same way when it
8545 ++ * becomes backlogged again, we reduce its
8546 ++ * next budget. As long as we guess right,
8547 ++ * this budget cut reduces the latency
8548 ++ * experienced by the process.
8549 ++ *
8550 ++ * However, if there are still outstanding
8551 ++ * requests, then the process may have not yet
8552 ++ * issued its next request just because it is
8553 ++ * still waiting for the completion of some of
8554 ++ * the still outstanding ones. So in this
8555 ++ * subcase we do not reduce its budget, on the
8556 ++ * contrary we increase it to possibly boost
8557 ++ * the throughput, as discussed in the
8558 ++ * comments to the BUDGET_TIMEOUT case.
8559 ++ */
8560 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
8561 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
8562 ++ else {
8563 ++ if (budget > 5 * min_budget)
8564 ++ budget -= 4 * min_budget;
8565 ++ else
8566 ++ budget = min_budget;
8567 ++ }
8568 ++ break;
8569 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
8570 ++ /*
8571 ++ * We double the budget here because: 1) it
8572 ++ * gives the chance to boost the throughput if
8573 ++ * this is not a seeky process (which may have
8574 ++ * bumped into this timeout because of, e.g.,
8575 ++ * ZBR), 2) together with charge_full_budget
8576 ++ * it helps give seeky processes higher
8577 ++ * timestamps, and hence be served less
8578 ++ * frequently.
8579 ++ */
8580 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
8581 ++ break;
8582 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
8583 ++ /*
8584 ++ * The process still has backlog, and did not
8585 ++ * let either the budget timeout or the disk
8586 ++ * idling timeout expire. Hence it is not
8587 ++ * seeky, has a short thinktime and may be
8588 ++ * happy with a higher budget too. So
8589 ++ * definitely increase the budget of this good
8590 ++ * candidate to boost the disk throughput.
8591 ++ */
8592 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
8593 ++ break;
8594 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
8595 ++ /*
8596 ++ * Leave the budget unchanged.
8597 ++ */
8598 ++ default:
8599 ++ return;
8600 ++ }
8601 ++ } else /* async queue */
8602 ++ /* async queues always get the maximum possible budget
8603 ++ * (their ability to dispatch is limited by
8604 ++ * @bfqd->bfq_max_budget_async_rq).
8605 ++ */
8606 ++ budget = bfqd->bfq_max_budget;
8607 ++
8608 ++ bfqq->max_budget = budget;
8609 ++
8610 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
8611 ++ bfqq->max_budget > bfqd->bfq_max_budget)
8612 ++ bfqq->max_budget = bfqd->bfq_max_budget;
8613 ++
8614 ++ /*
8615 ++ * Make sure that we have enough budget for the next request.
8616 ++ * Since the finish time of the bfqq must be kept in sync with
8617 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
8618 ++ * update.
8619 ++ */
8620 ++ next_rq = bfqq->next_rq;
8621 ++ if (next_rq != NULL)
8622 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
8623 ++ bfq_serv_to_charge(next_rq, bfqq));
8624 ++ else
8625 ++ bfqq->entity.budget = bfqq->max_budget;
8626 ++
8627 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
8628 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
8629 ++ bfqq->entity.budget);
8630 ++}
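
The budget feedback cases above reduce to a few arithmetic rules. An illustrative stand-alone reformulation, mirroring the 2x/4x growth and the 4*min_budget shrink of the sync-queue cases; the enum and function below are sketches, not the patch's definitions.

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

/*
 * Sync-queue budget feedback: shrink the budget when the queue idled
 * with nothing in flight, double it on a budget timeout (or when it
 * idled with requests still outstanding), quadruple it when the budget
 * was exhausted, leave it alone otherwise.
 */
static unsigned long next_budget(unsigned long budget, unsigned long min_b,
                                 unsigned long max_b, int dispatched,
                                 enum reason r)
{
        switch (r) {
        case TOO_IDLE:
                if (dispatched > 0)
                        budget = budget * 2 < max_b ? budget * 2 : max_b;
                else if (budget > 5 * min_b)
                        budget -= 4 * min_b;
                else
                        budget = min_b;
                break;
        case BUDGET_TIMEOUT:
                budget = budget * 2 < max_b ? budget * 2 : max_b;
                break;
        case BUDGET_EXHAUSTED:
                budget = budget * 4 < max_b ? budget * 4 : max_b;
                break;
        case NO_MORE_REQUESTS:
                break;
        }
        return budget;
}

int main(void)
{
        printf("%lu\n", next_budget(4096, 512, 16384, 0, TOO_IDLE));         /* 2048  */
        printf("%lu\n", next_budget(4096, 512, 16384, 0, BUDGET_EXHAUSTED)); /* 16384 */
        return 0;
}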
8631 ++
8632 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
8633 ++{
8634 ++ unsigned long max_budget;
8635 ++
8636 ++ /*
8637 ++ * The max_budget calculated when autotuning is equal to the
8638 ++ * number of sectors transferred in timeout_sync at the
8639 ++ * estimated peak rate.
8640 ++ */
8641 ++ max_budget = (unsigned long)(peak_rate * 1000 *
8642 ++ timeout >> BFQ_RATE_SHIFT);
8643 ++
8644 ++ return max_budget;
8645 ++}
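
A worked example of the autotuned budget: the peak rate is stored in sectors per microsecond in fixed point, the timeout is in milliseconds, and BFQ_RATE_SHIFT is assumed to be 16 here (the value is defined elsewhere in the patch).

#include <stdio.h>

#define RATE_SHIFT 16 /* assumed to match BFQ_RATE_SHIFT in the patch */

/* sectors transferable in timeout_ms at peak_rate (fixed-point sect/us) */
static unsigned long calc_max_budget(unsigned long long peak_rate,
                                     unsigned long long timeout_ms)
{
        return (unsigned long)(peak_rate * 1000 * timeout_ms >> RATE_SHIFT);
}

int main(void)
{
        /*
         * Example: ~100 MB/s is roughly 0.2 sectors/us, i.e. about
         * 0.2 * 2^16 = 13107 in fixed point; with a 125 ms sync timeout
         * the autotuned budget is about 25000 sectors (~12 MB).
         */
        printf("%lu\n", calc_max_budget(13107, 125));
        return 0;
}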
8646 ++
8647 ++/*
8648 ++ * In addition to updating the peak rate, checks whether the process
8649 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
8650 ++ * to the budget timeout, to reduce the amount of service provided to
8651 ++ * seeky processes, and hence reduce their chances of lowering the
8652 ++ * throughput. See the code for more details.
8653 ++ */
8654 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8655 ++ int compensate, enum bfqq_expiration reason)
8656 ++{
8657 ++ u64 bw, usecs, expected, timeout;
8658 ++ ktime_t delta;
8659 ++ int update = 0;
8660 ++
8661 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
8662 ++ return 0;
8663 ++
8664 ++ if (compensate)
8665 ++ delta = bfqd->last_idling_start;
8666 ++ else
8667 ++ delta = ktime_get();
8668 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
8669 ++ usecs = ktime_to_us(delta);
8670 ++
8671 ++ /* Don't trust short/unrealistic values. */
8672 ++ if (usecs < 100 || usecs >= LONG_MAX)
8673 ++ return 0;
8674 ++
8675 ++ /*
8676 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
8677 ++ * value to store the peak rate, in sectors per usec in fixed
8678 ++ * point math. We do so to have enough precision in the estimate
8679 ++ * and to avoid overflows.
8680 ++ */
8681 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
8682 ++ do_div(bw, (unsigned long)usecs);
8683 ++
8684 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
8685 ++
8686 ++ /*
8687 ++ * Use only long (> 20ms) intervals to filter out spikes for
8688 ++ * the peak rate estimation.
8689 ++ */
8690 ++ if (usecs > 20000) {
8691 ++ if (bw > bfqd->peak_rate ||
8692 ++ (!BFQQ_SEEKY(bfqq) &&
8693 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
8694 ++ bfq_log(bfqd, "measured bw =%llu", bw);
8695 ++ /*
8696 ++ * To smooth oscillations use a low-pass filter with
8697 ++ * alpha=7/8, i.e.,
8698 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
8699 ++ */
8700 ++ do_div(bw, 8);
8701 ++ if (bw == 0)
8702 ++ return 0;
8703 ++ bfqd->peak_rate *= 7;
8704 ++ do_div(bfqd->peak_rate, 8);
8705 ++ bfqd->peak_rate += bw;
8706 ++ update = 1;
8707 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
8708 ++ }
8709 ++
8710 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
8711 ++
8712 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
8713 ++ bfqd->peak_rate_samples++;
8714 ++
8715 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
8716 ++ update && bfqd->bfq_user_max_budget == 0) {
8717 ++ bfqd->bfq_max_budget =
8718 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
8719 ++ bfq_log(bfqd, "new max_budget=%lu",
8720 ++ bfqd->bfq_max_budget);
8721 ++ }
8722 ++ }
8723 ++
8724 ++ /*
8725 ++ * If the process has been served for too short a time
8726 ++ * interval to let its possible sequential accesses prevail on
8727 ++ * the initial seek time needed to move the disk head on the
8728 ++ * first sector it requested, then give the process a chance
8729 ++ * and for the moment return false.
8730 ++ */
8731 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
8732 ++ return 0;
8733 ++
8734 ++ /*
8735 ++ * A process is considered ``slow'' (i.e., seeky, so that we
8736 ++ * cannot treat it fairly in the service domain, as it would
8737 ++ * slow down the other processes too much) if, when a slice
8738 ++ * ends for whatever reason, it has received service at a
8739 ++ * rate that would not be high enough to complete the budget
8740 ++ * before the budget timeout expiration.
8741 ++ */
8742 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
8743 ++
8744 ++ /*
8745 ++ * Caveat: processes doing IO in the slower disk zones will
8746 ++ * tend to be slow(er) even if not seeky. And the estimated
8747 ++ * peak rate will actually be an average over the disk
8748 ++ * surface. Hence, to not be too harsh with unlucky processes,
8749 ++ * we keep a budget/3 margin of safety before declaring a
8750 ++ * process slow.
8751 ++ */
8752 ++ return expected > (4 * bfqq->entity.budget) / 3;
8753 ++}
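
The 7/8 low-pass filter used above for the peak-rate estimate is easy to reproduce in isolation; a minimal integer-arithmetic sketch (the fixed-point rate unit is left abstract, the sample values are arbitrary).

#include <stdio.h>

/* new_rate = (7/8) * old_rate + (1/8) * sample, in pure integer math */
static unsigned long long lowpass_7_8(unsigned long long old_rate,
                                      unsigned long long sample)
{
        return old_rate * 7 / 8 + sample / 8;
}

int main(void)
{
        unsigned long long rate = 13107; /* some fixed-point rate */
        int i;

        /* Feed a few identical slower samples; the estimate converges slowly. */
        for (i = 0; i < 4; i++) {
                rate = lowpass_7_8(rate, 6553);
                printf("step %d: rate = %llu\n", i, rate);
        }
        return 0;
}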
8754 ++
8755 ++/*
8756 ++ * To be deemed as soft real-time, an application must meet two requirements.
8757 ++ * The first is that the application must not require an average bandwidth
8758 ++ * higher than the approximate bandwidth required to play back or record a
8759 ++ * compressed high-definition video.
8760 ++ * The next function is invoked on the completion of the last request of a
8761 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
8762 ++ * that, if the next request of the application does not arrive before
8763 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
8764 ++ *
8765 ++ * The second requirement is that the request pattern of the application is
8766 ++ * isochronous, i.e., that, after issuing a request or a batch of requests, the
8767 ++ * application stops for a while, then issues a new batch, and so on. For this
8768 ++ * reason the next function is invoked to compute soft_rt_next_start only for
8769 ++ * applications that meet this requirement, whereas soft_rt_next_start is set
8770 ++ * to infinity for applications that do not.
8771 ++ *
8772 ++ * Unfortunately, even a greedy application may happen to behave in an
8773 ++ * isochronous way if several processes are competing for the CPUs. In fact,
8774 ++ * in this scenario the application stops issuing requests while the CPUs are
8775 ++ * busy serving other processes, then restarts, then stops again for a while,
8776 ++ * and so on. In addition, if the disk achieves a low enough throughput with
8777 ++ * the request pattern issued by the application (e.g., because the request
8778 ++ * pattern is random and/or the device is slow), then the above bandwidth
8779 ++ * requirement may happen to be met too. To prevent such a greedy application
8780 ++ * from being deemed soft real-time, a further rule is used in the computation
8781 ++ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
8782 ++ * time plus the maximum time for which the arrival of a request is waited
8783 ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
8784 ++ * filters out greedy applications, as the latter issue instead their next
8785 ++ * request as soon as possible after the last one has been completed (in
8786 ++ * contrast, when a batch of requests is completed, a soft real-time
8787 ++ * application spends some time processing data).
8788 ++ *
8789 ++ * Actually, the last filter may easily generate false positives if: only
8790 ++ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
8791 ++ * both of the following two cases occur:
8792 ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
8793 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
8794 ++ * HZ=100.
8795 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
8796 ++ * for a while, then suddenly 'jump' by several units to recover the lost
8797 ++ * increments. This seems to happen, e.g., inside virtual machines.
8798 ++ * To address this issue, we do not use as a reference time interval just
8799 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
8800 ++ * particular we add the minimum number of jiffies for which the filter seems
8801 ++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
8802 ++ */
8803 ++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
8804 ++ struct bfq_queue *bfqq)
8805 ++{
8806 ++ return max(bfqq->last_idle_bklogged +
8807 ++ HZ * bfqq->service_from_backlogged /
8808 ++ bfqd->bfq_raising_max_softrt_rate,
8809 ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
8810 ++}
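
A sketch of the soft_rt_next_start computation above: the bandwidth bound pushes the next allowed arrival out by HZ * service / max_rate jiffies, and the greedy-application filter never lets it drop below "now + one idle slice + a few jiffies" (4 in the patch). HZ and all inputs below are assumed values for illustration.

#include <stdio.h>

#define HZ_ASSUMED 250 /* illustrative HZ value, not taken from the patch */

/*
 * Earliest instant at which a new request may arrive without pushing the
 * queue's average bandwidth above max_softrt_rate (sectors/sec), but
 * never earlier than a bit more than one idle slice from now.
 */
static unsigned long softrt_next_start(unsigned long now,
                                       unsigned long last_idle_bklogged,
                                       unsigned long service_from_backlogged,
                                       unsigned long max_softrt_rate,
                                       unsigned long slice_idle)
{
        unsigned long bw_bound = last_idle_bklogged +
                HZ_ASSUMED * service_from_backlogged / max_softrt_rate;
        unsigned long greedy_bound = now + slice_idle + 4;

        return bw_bound > greedy_bound ? bw_bound : greedy_bound;
}

int main(void)
{
        /* 7000 sectors served since last backlogged, 7000 sect/s allowed */
        printf("%lu\n", softrt_next_start(1000, 900, 7000, 7000, 2)); /* 1150 */
        return 0;
}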
8811 ++
8812 ++/*
8813 ++ * Largest-possible time instant such that, for as long as possible, the
8814 ++ * current time will be lower than this time instant according to the macro
8815 ++ * time_is_before_jiffies().
8816 ++ */
8817 ++static inline unsigned long bfq_infinity_from_now(unsigned long now)
8818 ++{
8819 ++ return now + ULONG_MAX / 2;
8820 ++}
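
Why now + ULONG_MAX / 2 acts as "infinity": jiffies comparisons are done with wrapping signed arithmetic, so a value half the range ahead stays in the future for as long as possible. A tiny sketch using a time_before()-style comparison (reimplemented here, not the kernel macro).

#include <stdio.h>
#include <limits.h>

/* Same idea as the kernel's time_before(): signed wraparound comparison. */
static int before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;
}

static unsigned long infinity_from_now(unsigned long now)
{
        return now + ULONG_MAX / 2;
}

int main(void)
{
        unsigned long now = 123456;
        unsigned long inf = infinity_from_now(now);

        /* "now" stays before "inf" for as long as wraparound math allows. */
        printf("%d %d\n", before(now, inf), before(now + 1000000, inf));
        return 0;
}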
8821 ++
8822 ++/**
8823 ++ * bfq_bfqq_expire - expire a queue.
8824 ++ * @bfqd: device owning the queue.
8825 ++ * @bfqq: the queue to expire.
8826 ++ * @compensate: if true, compensate for the time spent idling.
8827 ++ * @reason: the reason causing the expiration.
8828 ++ *
8829 ++ *
8830 ++ * If the process associated to the queue is slow (i.e., seeky), or in
8831 ++ * case of budget timeout, or, finally, if it is async, we
8832 ++ * artificially charge it an entire budget (independently of the
8833 ++ * actual service it received). As a consequence, the queue will get
8834 ++ * higher timestamps than the correct ones upon reactivation, and
8835 ++ * hence it will be rescheduled as if it had received more service
8836 ++ * than what it actually received. In the end, this class of processes
8837 ++ * will receive less service in proportion to how slowly they consume
8838 ++ * their budgets (and hence how seriously they tend to lower the
8839 ++ * throughput).
8840 ++ *
8841 ++ * In contrast, when a queue expires because it has been idling for
8842 ++ * too long or because it exhausted its budget, we do not touch the
8843 ++ * amount of service it has received. Hence, when the queue is
8844 ++ * reactivated and its timestamps are updated, the latter will be in sync
8845 ++ * with the actual service received by the queue until expiration.
8846 ++ *
8847 ++ * Charging a full budget to the first type of queues and the exact
8848 ++ * service to the others has the effect of using the WF2Q+ policy to
8849 ++ * schedule the former on a timeslice basis, without violating the
8850 ++ * service domain guarantees of the latter.
8851 ++ */
8852 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
8853 ++ struct bfq_queue *bfqq,
8854 ++ int compensate,
8855 ++ enum bfqq_expiration reason)
8856 ++{
8857 ++ int slow;
8858 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8859 ++
8860 ++ /* Update disk peak rate for autotuning and check whether the
8861 ++ * process is slow (see bfq_update_peak_rate).
8862 ++ */
8863 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
8864 ++
8865 ++ /*
8866 ++ * As explained above, 'punish' slow (i.e., seeky), timed-out
8867 ++ * and async queues, to favor sequential sync workloads.
8868 ++ *
8869 ++ * Processes doing IO in the slower disk zones will tend to be
8870 ++ * slow(er) even if not seeky. Hence, since the estimated peak
8871 ++ * rate is actually an average over the disk surface, these
8872 ++ * processes may timeout just for bad luck. To avoid punishing
8873 ++ * them we do not charge a full budget to a process that
8874 ++ * succeeded in consuming at least 2/3 of its budget.
8875 ++ */
8876 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
8877 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
8878 ++ bfq_bfqq_charge_full_budget(bfqq);
8879 ++
8880 ++ bfqq->service_from_backlogged += bfqq->entity.service;
8881 ++
8882 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
8883 ++ bfqq->last_rais_start_finish = jiffies;
8884 ++
8885 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
8886 ++ RB_EMPTY_ROOT(&bfqq->sort_list)) {
8887 ++ /*
8888 ++ * If we get here, then the request pattern is
8889 ++ * isochronous (see the comments to the function
8890 ++ * bfq_bfqq_softrt_next_start()). However, if the
8891 ++ * queue still has in-flight requests, then it is
8892 ++ * better to postpone the computation of next_start
8893 ++ * to the next request completion. In fact, if we
8894 ++ * computed it now, then the application might pass
8895 ++ * the greedy-application filter improperly, because
8896 ++ * the arrival of its next request may happen to be
8897 ++ * later than (jiffies + bfqq->bfqd->bfq_slice_idle)
8898 ++ * not because the application is truly soft real-
8899 ++ * time, but just because the application is currently
8900 ++ * waiting for the completion of some request before
8901 ++ * issuing, as quickly as possible, its next request.
8902 ++ */
8903 ++ if (bfqq->dispatched > 0) {
8904 ++ /*
8905 ++ * The application is still waiting for the
8906 ++ * completion of one or more requests:
8907 ++ * prevent it from possibly being incorrectly
8908 ++ * deemed as soft real-time by setting its
8909 ++ * soft_rt_next_start to infinity. In fact,
8910 ++ * without this assignment, the application
8911 ++ * would be incorrectly deemed as soft
8912 ++ * real-time if:
8913 ++ * 1) it issued a new request before the
8914 ++ * completion of all its in-flight
8915 ++ * requests, and
8916 ++ * 2) at that time, its soft_rt_next_start
8917 ++ * happened to be in the past.
8918 ++ */
8919 ++ bfqq->soft_rt_next_start =
8920 ++ bfq_infinity_from_now(jiffies);
8921 ++ bfq_mark_bfqq_softrt_update(bfqq);
8922 ++ } else
8923 ++ bfqq->soft_rt_next_start =
8924 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
8925 ++ }
8926 ++
8927 ++ bfq_log_bfqq(bfqd, bfqq,
8928 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
8929 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
8930 ++
8931 ++ /* Increase, decrease or leave budget unchanged according to reason */
8932 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
8933 ++ __bfq_bfqq_expire(bfqd, bfqq);
8934 ++}
8935 ++
8936 ++/*
8937 ++ * Budget timeout is not implemented through a dedicated timer, but
8938 ++ * just checked on request arrivals and completions, as well as on
8939 ++ * idle timer expirations.
8940 ++ */
8941 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
8942 ++{
8943 ++ if (bfq_bfqq_budget_new(bfqq))
8944 ++ return 0;
8945 ++
8946 ++ if (time_before(jiffies, bfqq->budget_timeout))
8947 ++ return 0;
8948 ++
8949 ++ return 1;
8950 ++}
8951 ++
8952 ++/*
8953 ++ * If we expire a queue that is waiting for the arrival of a new
8954 ++ * request, we may prevent the fictitious timestamp backshifting that
8955 ++ * allows the guarantees of the queue to be preserved (see [1] for
8956 ++ * this tricky aspect). Hence we return true only if this condition
8957 ++ * does not hold, or if the queue is so slow that it deserves only to be
8958 ++ * kicked off, to preserve a high throughput.
8959 ++*/
8960 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
8961 ++{
8962 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
8963 ++ "may_budget_timeout: wr %d left %d timeout %d",
8964 ++ bfq_bfqq_wait_request(bfqq),
8965 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
8966 ++ bfq_bfqq_budget_timeout(bfqq));
8967 ++
8968 ++ return (!bfq_bfqq_wait_request(bfqq) ||
8969 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
8970 ++ &&
8971 ++ bfq_bfqq_budget_timeout(bfqq);
8972 ++}
8973 ++
8974 ++/*
8975 ++ * For weight-raised queues issuing sync requests, idling is always performed,
8976 ++ * as this is instrumental in guaranteeing a high fraction of the throughput
8977 ++ * to these queues, and hence in guaranteeing a lower latency for their
8978 ++ * requests. See [1] for details.
8979 ++ *
8980 ++ * For non-weight-raised queues, idling is instead disabled if the device is
8981 ++ * NCQ-enabled and non-rotational, as this boosts the throughput on such
8982 ++ * devices.
8983 ++ */
8984 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
8985 ++{
8986 ++ struct bfq_data *bfqd = bfqq->bfqd;
8987 ++
8988 ++ return bfq_bfqq_sync(bfqq) && (
8989 ++ bfqq->raising_coeff > 1 ||
8990 ++ (bfq_bfqq_idle_window(bfqq) &&
8991 ++ !(bfqd->hw_tag &&
8992 ++ (blk_queue_nonrot(bfqd->queue) ||
8993 ++ /*
8994 ++ * If there are weight-raised busy queues, then do not idle
8995 ++ * the disk for a sync non-weight-raised queue, and hence
8996 ++ * expire the queue immediately if empty. Combined with the
8997 ++ * timestamping rules of BFQ (see [1] for details), this
8998 ++ * causes sync non-weight-raised queues to get a lower
8999 ++ * fraction of the disk throughput, and hence reduces the rate
9000 ++ * at which the processes associated to these queues ask for
9001 ++ * requests from the request pool.
9002 ++ *
9003 ++ * This is beneficial for weight-raised processes, when the
9004 ++ * system operates in request-pool saturation conditions
9005 ++ * (e.g., in the presence of write hogs). In fact, if
9006 ++ * non-weight-raised processes ask for requests at a lower
9007 ++ * rate, then weight-raised processes have a higher
9008 ++ * probability to get a request from the pool immediately
9009 ++ * (or at least soon) when they need one. Hence they have a
9010 ++ * higher probability to actually get a fraction of the disk
9011 ++ * throughput proportional to their high weight. This is
9012 ++ * especially true with NCQ-enabled drives, which enqueue
9013 ++ * several requests in advance and further reorder
9014 ++ * internally-queued requests.
9015 ++ *
9016 ++ * Mistreating non-weight-raised queues in the above-described
9017 ++ * way, when there are busy weight-raised queues, seems to
9018 ++ * mitigate starvation problems in the presence of heavy write
9019 ++ * workloads and NCQ, and hence to guarantee a higher
9020 ++ * application and system responsiveness in these hostile
9021 ++ * scenarios.
9022 ++ */
9023 ++ bfqd->raised_busy_queues > 0)
9024 ++ )
9025 ++ )
9026 ++ );
9027 ++}
9028 ++
9029 ++/*
9030 ++ * If the in-service queue is empty, but it is sync and either of the following
9031 ++ * conditions holds, then: 1) the queue must remain in service and cannot be
9032 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
9033 ++ * of a new request for the queue. The conditions are:
9034 ++ * - the device is rotational and not performing NCQ, and the queue has its
9035 ++ * idle window set (in this case, waiting for a new request for the queue
9036 ++ * is likely to boost the disk throughput);
9037 ++ * - the queue is weight-raised (waiting for the request is necessary to
9038 ++ * provide the queue with fairness and latency guarantees, see [1] for
9039 ++ * details).
9040 ++ */
9041 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
9042 ++{
9043 ++ struct bfq_data *bfqd = bfqq->bfqd;
9044 ++
9045 ++ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
9046 ++ bfq_bfqq_must_not_expire(bfqq) &&
9047 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
9048 ++}
9049 ++
9050 ++/*
9051 ++ * Select a queue for service. If we have a current queue in service,
9052 ++ * check whether to continue servicing it, or retrieve and set a new one.
9053 ++ */
9054 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
9055 ++{
9056 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
9057 ++ struct request *next_rq;
9058 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9059 ++
9060 ++ bfqq = bfqd->in_service_queue;
9061 ++ if (bfqq == NULL)
9062 ++ goto new_queue;
9063 ++
9064 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
9065 ++
9066 ++ /*
9067 ++ * If another queue has a request waiting within our mean seek
9068 ++ * distance, let it run. The expire code will check for close
9069 ++ * cooperators and put the close queue at the front of the
9070 ++ * service tree. If possible, merge the expiring queue with the
9071 ++ * new bfqq.
9072 ++ */
9073 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
9074 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
9075 ++ bfq_setup_merge(bfqq, new_bfqq);
9076 ++
9077 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
9078 ++ !timer_pending(&bfqd->idle_slice_timer) &&
9079 ++ !bfq_bfqq_must_idle(bfqq))
9080 ++ goto expire;
9081 ++
9082 ++ next_rq = bfqq->next_rq;
9083 ++ /*
9084 ++ * If bfqq has requests queued and it has enough budget left to
9085 ++ * serve them, keep the queue, otherwise expire it.
9086 ++ */
9087 ++ if (next_rq != NULL) {
9088 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
9089 ++ bfq_bfqq_budget_left(bfqq)) {
9090 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
9091 ++ goto expire;
9092 ++ } else {
9093 ++ /*
9094 ++ * The idle timer may be pending because we may not
9095 ++ * disable disk idling even when a new request arrives
9096 ++ */
9097 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
9098 ++ /*
9099 ++ * If we get here: 1) at least one new request
9100 ++ * has arrived but we have not disabled the
9101 ++ * timer because the request was too small,
9102 ++ * 2) then the block layer has unplugged the
9103 ++ * device, causing the dispatch to be invoked.
9104 ++ *
9105 ++ * Since the device is unplugged, now the
9106 ++ * requests are probably large enough to
9107 ++ * provide a reasonable throughput.
9108 ++ * So we disable idling.
9109 ++ */
9110 ++ bfq_clear_bfqq_wait_request(bfqq);
9111 ++ del_timer(&bfqd->idle_slice_timer);
9112 ++ }
9113 ++ if (new_bfqq == NULL)
9114 ++ goto keep_queue;
9115 ++ else
9116 ++ goto expire;
9117 ++ }
9118 ++ }
9119 ++
9120 ++ /*
9121 ++ * No requests pending. If the in-service queue has no cooperator and
9122 ++ * still has requests in flight (possibly waiting for a completion)
9123 ++ * or is idling for a new request, then keep it.
9124 ++ */
9125 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
9126 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
9127 ++ bfqq = NULL;
9128 ++ goto keep_queue;
9129 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
9130 ++ /*
9131 ++ * Expiring the queue because there is a close cooperator,
9132 ++ * cancel timer.
9133 ++ */
9134 ++ bfq_clear_bfqq_wait_request(bfqq);
9135 ++ del_timer(&bfqd->idle_slice_timer);
9136 ++ }
9137 ++
9138 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
9139 ++expire:
9140 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
9141 ++new_queue:
9142 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
9143 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
9144 ++ bfqq != NULL ? bfqq->pid : 0);
9145 ++keep_queue:
9146 ++ return bfqq;
9147 ++}
9148 ++
9149 ++static void bfq_update_raising_data(struct bfq_data *bfqd,
9150 ++ struct bfq_queue *bfqq)
9151 ++{
9152 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
9153 ++ struct bfq_entity *entity = &bfqq->entity;
9154 ++
9155 ++ bfq_log_bfqq(bfqd, bfqq,
9156 ++ "raising period dur %u/%u msec, "
9157 ++ "old raising coeff %u, w %d(%d)",
9158 ++ jiffies_to_msecs(jiffies -
9159 ++ bfqq->last_rais_start_finish),
9160 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
9161 ++ bfqq->raising_coeff,
9162 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
9163 ++
9164 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
9165 ++ entity->orig_weight * bfqq->raising_coeff);
9166 ++ if (entity->ioprio_changed)
9167 ++ bfq_log_bfqq(bfqd, bfqq,
9168 ++ "WARN: pending prio change");
9169 ++ /*
9170 ++ * If too much time has elapsed from the beginning
9171 ++ * of this weight-raising, stop it.
9172 ++ */
9173 ++ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
9174 ++ bfqq->raising_cur_max_time)) {
9175 ++ bfqq->last_rais_start_finish = jiffies;
9176 ++ bfq_log_bfqq(bfqd, bfqq,
9177 ++ "wrais ending at %lu, "
9178 ++ "rais_max_time %u",
9179 ++ bfqq->last_rais_start_finish,
9180 ++ jiffies_to_msecs(bfqq->
9181 ++ raising_cur_max_time));
9182 ++ bfq_bfqq_end_raising(bfqq);
9183 ++ __bfq_entity_update_weight_prio(
9184 ++ bfq_entity_service_tree(entity),
9185 ++ entity);
9186 ++ }
9187 ++ }
9188 ++}
9189 ++
9190 ++/*
9191 ++ * Dispatch one request from bfqq, moving it to the request queue
9192 ++ * dispatch list.
9193 ++ */
9194 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
9195 ++ struct bfq_queue *bfqq)
9196 ++{
9197 ++ int dispatched = 0;
9198 ++ struct request *rq;
9199 ++ unsigned long service_to_charge;
9200 ++
9201 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
9202 ++
9203 ++ /* Follow expired path, else get first next available. */
9204 ++ rq = bfq_check_fifo(bfqq);
9205 ++ if (rq == NULL)
9206 ++ rq = bfqq->next_rq;
9207 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
9208 ++
9209 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
9210 ++ /*
9211 ++ * This may happen if the next rq is chosen
9212 ++ * in fifo order instead of sector order.
9213 ++ * The budget is properly dimensioned
9214 ++ * to be always sufficient to serve the next request
9215 ++ * only if it is chosen in sector order. The reason is
9216 ++ * that it would be quite inefficient and of little use
9217 ++ * to always make sure that the budget is large enough
9218 ++ * to serve even the possible next rq in fifo order.
9219 ++ * In fact, requests are seldom served in fifo order.
9220 ++ *
9221 ++ * Expire the queue for budget exhaustion, and
9222 ++ * make sure that the next act_budget is enough
9223 ++ * to serve the next request, even if it comes
9224 ++ * from the fifo expired path.
9225 ++ */
9226 ++ bfqq->next_rq = rq;
9227 ++ /*
9228 ++ * Since this dispatch failed, make sure that
9229 ++ * a new one will be performed
9230 ++ */
9231 ++ if (!bfqd->rq_in_driver)
9232 ++ bfq_schedule_dispatch(bfqd);
9233 ++ goto expire;
9234 ++ }
9235 ++
9236 ++ /* Finally, insert request into driver dispatch list. */
9237 ++ bfq_bfqq_served(bfqq, service_to_charge);
9238 ++ bfq_dispatch_insert(bfqd->queue, rq);
9239 ++
9240 ++ bfq_update_raising_data(bfqd, bfqq);
9241 ++
9242 ++ bfq_log_bfqq(bfqd, bfqq,
9243 ++ "dispatched %u sec req (%llu), budg left %lu",
9244 ++ blk_rq_sectors(rq),
9245 ++ (long long unsigned)blk_rq_pos(rq),
9246 ++ bfq_bfqq_budget_left(bfqq));
9247 ++
9248 ++ dispatched++;
9249 ++
9250 ++ if (bfqd->in_service_bic == NULL) {
9251 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9252 ++ bfqd->in_service_bic = RQ_BIC(rq);
9253 ++ }
9254 ++
9255 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9256 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9257 ++ bfq_class_idle(bfqq)))
9258 ++ goto expire;
9259 ++
9260 ++ return dispatched;
9261 ++
9262 ++expire:
9263 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9264 ++ return dispatched;
9265 ++}
9266 ++
9267 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9268 ++{
9269 ++ int dispatched = 0;
9270 ++
9271 ++ while (bfqq->next_rq != NULL) {
9272 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9273 ++ dispatched++;
9274 ++ }
9275 ++
9276 ++ BUG_ON(!list_empty(&bfqq->fifo));
9277 ++ return dispatched;
9278 ++}
9279 ++
9280 ++/*
9281 ++ * Drain our current requests. Used for barriers and when switching
9282 ++ * io schedulers on-the-fly.
9283 ++ */
9284 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
9285 ++{
9286 ++ struct bfq_queue *bfqq, *n;
9287 ++ struct bfq_service_tree *st;
9288 ++ int dispatched = 0;
9289 ++
9290 ++ bfqq = bfqd->in_service_queue;
9291 ++ if (bfqq != NULL)
9292 ++ __bfq_bfqq_expire(bfqd, bfqq);
9293 ++
9294 ++ /*
9295 ++ * Loop through classes, and be careful to leave the scheduler
9296 ++ * in a consistent state, as feedback mechanisms and vtime
9297 ++ * updates cannot be disabled during the process.
9298 ++ */
9299 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9300 ++ st = bfq_entity_service_tree(&bfqq->entity);
9301 ++
9302 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9303 ++ bfqq->max_budget = bfq_max_budget(bfqd);
9304 ++
9305 ++ bfq_forget_idle(st);
9306 ++ }
9307 ++
9308 ++ BUG_ON(bfqd->busy_queues != 0);
9309 ++
9310 ++ return dispatched;
9311 ++}
9312 ++
9313 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
9314 ++{
9315 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9316 ++ struct bfq_queue *bfqq;
9317 ++ int max_dispatch;
9318 ++
9319 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9320 ++ if (bfqd->busy_queues == 0)
9321 ++ return 0;
9322 ++
9323 ++ if (unlikely(force))
9324 ++ return bfq_forced_dispatch(bfqd);
9325 ++
9326 ++ bfqq = bfq_select_queue(bfqd);
9327 ++ if (bfqq == NULL)
9328 ++ return 0;
9329 ++
9330 ++ max_dispatch = bfqd->bfq_quantum;
9331 ++ if (bfq_class_idle(bfqq))
9332 ++ max_dispatch = 1;
9333 ++
9334 ++ if (!bfq_bfqq_sync(bfqq))
9335 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
9336 ++
9337 ++ if (bfqq->dispatched >= max_dispatch) {
9338 ++ if (bfqd->busy_queues > 1)
9339 ++ return 0;
9340 ++ if (bfqq->dispatched >= 4 * max_dispatch)
9341 ++ return 0;
9342 ++ }
9343 ++
9344 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
9345 ++ return 0;
9346 ++
9347 ++ bfq_clear_bfqq_wait_request(bfqq);
9348 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9349 ++
9350 ++ if (!bfq_dispatch_request(bfqd, bfqq))
9351 ++ return 0;
9352 ++
9353 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
9354 ++ bfqq->pid, max_dispatch);
9355 ++
9356 ++ return 1;
9357 ++}
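
The per-call dispatch cap applied above can be restated as a small predicate: the quantum for sync queues, one request for idle-class queues, max_budget_async_rq for async queues, with up to 4x over-dispatch allowed when only one queue is busy. The function below is an illustrative reformulation, not the patch's code.

#include <stdio.h>

/*
 * Returns 1 when one more request may be dispatched from the selected
 * queue this round, 0 otherwise.
 */
static int may_dispatch(int busy_queues, int is_sync, int is_idle_class,
                        int already_dispatched, int quantum,
                        int max_async_rq)
{
        int max_dispatch = quantum;

        if (is_idle_class)
                max_dispatch = 1;
        if (!is_sync)
                max_dispatch = max_async_rq;

        if (already_dispatched >= max_dispatch) {
                if (busy_queues > 1)
                        return 0;
                if (already_dispatched >= 4 * max_dispatch)
                        return 0;
        }
        return 1;
}

int main(void)
{
        /* a single busy sync queue may over-dispatch up to 4x the quantum */
        printf("%d\n", may_dispatch(1, 1, 0, 6, 4, 4)); /* 1 */
        printf("%d\n", may_dispatch(2, 1, 0, 6, 4, 4)); /* 0 */
        return 0;
}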
9358 ++
9359 ++/*
9360 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
9361 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
9362 ++ *
9363 ++ * Queue lock must be held here.
9364 ++ */
9365 ++static void bfq_put_queue(struct bfq_queue *bfqq)
9366 ++{
9367 ++ struct bfq_data *bfqd = bfqq->bfqd;
9368 ++
9369 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
9370 ++
9371 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
9372 ++ atomic_read(&bfqq->ref));
9373 ++ if (!atomic_dec_and_test(&bfqq->ref))
9374 ++ return;
9375 ++
9376 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
9377 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
9378 ++ BUG_ON(bfqq->entity.tree != NULL);
9379 ++ BUG_ON(bfq_bfqq_busy(bfqq));
9380 ++ BUG_ON(bfqd->in_service_queue == bfqq);
9381 ++
9382 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
9383 ++
9384 ++ kmem_cache_free(bfq_pool, bfqq);
9385 ++}
9386 ++
9387 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
9388 ++{
9389 ++ struct bfq_queue *__bfqq, *next;
9390 ++
9391 ++ /*
9392 ++ * If this queue was scheduled to merge with another queue, be
9393 ++ * sure to drop the reference taken on that queue (and others in
9394 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
9395 ++ */
9396 ++ __bfqq = bfqq->new_bfqq;
9397 ++ while (__bfqq) {
9398 ++ if (__bfqq == bfqq) {
9399 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
9400 ++ break;
9401 ++ }
9402 ++ next = __bfqq->new_bfqq;
9403 ++ bfq_put_queue(__bfqq);
9404 ++ __bfqq = next;
9405 ++ }
9406 ++}
9407 ++
9408 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9409 ++{
9410 ++ if (bfqq == bfqd->in_service_queue) {
9411 ++ __bfq_bfqq_expire(bfqd, bfqq);
9412 ++ bfq_schedule_dispatch(bfqd);
9413 ++ }
9414 ++
9415 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
9416 ++ atomic_read(&bfqq->ref));
9417 ++
9418 ++ bfq_put_cooperator(bfqq);
9419 ++
9420 ++ bfq_put_queue(bfqq);
9421 ++}
9422 ++
9423 ++static void bfq_init_icq(struct io_cq *icq)
9424 ++{
9425 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
9426 ++
9427 ++ bic->ttime.last_end_request = jiffies;
9428 ++}
9429 ++
9430 ++static void bfq_exit_icq(struct io_cq *icq)
9431 ++{
9432 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
9433 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
9434 ++
9435 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
9436 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
9437 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
9438 ++ }
9439 ++
9440 ++ if (bic->bfqq[BLK_RW_SYNC]) {
9441 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
9442 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
9443 ++ }
9444 ++}
9445 ++
9446 ++/*
9447 ++ * Update the entity prio values; note that the new values will not
9448 ++ * be used until the next (re)activation.
9449 ++ */
9450 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
9451 ++{
9452 ++ struct task_struct *tsk = current;
9453 ++ int ioprio_class;
9454 ++
9455 ++ if (!bfq_bfqq_prio_changed(bfqq))
9456 ++ return;
9457 ++
9458 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9459 ++ switch (ioprio_class) {
9460 ++ default:
9461 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
9462 ++ "bfq: bad prio %x\n", ioprio_class);
9463 ++ case IOPRIO_CLASS_NONE:
9464 ++ /*
9465 ++ * No prio set, inherit CPU scheduling settings.
9466 ++ */
9467 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
9468 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
9469 ++ break;
9470 ++ case IOPRIO_CLASS_RT:
9471 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9472 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
9473 ++ break;
9474 ++ case IOPRIO_CLASS_BE:
9475 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9476 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
9477 ++ break;
9478 ++ case IOPRIO_CLASS_IDLE:
9479 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
9480 ++ bfqq->entity.new_ioprio = 7;
9481 ++ bfq_clear_bfqq_idle_window(bfqq);
9482 ++ break;
9483 ++ }
9484 ++
9485 ++ bfqq->entity.ioprio_changed = 1;
9486 ++
9487 ++ /*
9488 ++ * Keep track of original prio settings in case we have to temporarily
9489 ++ * elevate the priority of this queue.
9490 ++ */
9491 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
9492 ++ bfq_clear_bfqq_prio_changed(bfqq);
9493 ++}
9494 ++
9495 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
9496 ++{
9497 ++ struct bfq_data *bfqd;
9498 ++ struct bfq_queue *bfqq, *new_bfqq;
9499 ++ struct bfq_group *bfqg;
9500 ++ unsigned long uninitialized_var(flags);
9501 ++ int ioprio = bic->icq.ioc->ioprio;
9502 ++
9503 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
9504 ++ &flags);
9505 ++ /*
9506 ++	 * This condition may trigger on a newly created bic; be sure to drop
9507 ++ * the lock before returning.
9508 ++ */
9509 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
9510 ++ goto out;
9511 ++
9512 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
9513 ++ if (bfqq != NULL) {
9514 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
9515 ++ sched_data);
9516 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
9517 ++ GFP_ATOMIC);
9518 ++ if (new_bfqq != NULL) {
9519 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
9520 ++ bfq_log_bfqq(bfqd, bfqq,
9521 ++ "changed_ioprio: bfqq %p %d",
9522 ++ bfqq, atomic_read(&bfqq->ref));
9523 ++ bfq_put_queue(bfqq);
9524 ++ }
9525 ++ }
9526 ++
9527 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
9528 ++ if (bfqq != NULL)
9529 ++ bfq_mark_bfqq_prio_changed(bfqq);
9530 ++
9531 ++ bic->ioprio = ioprio;
9532 ++
9533 ++out:
9534 ++ bfq_put_bfqd_unlock(bfqd, &flags);
9535 ++}
9536 ++
9537 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9538 ++ pid_t pid, int is_sync)
9539 ++{
9540 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
9541 ++ INIT_LIST_HEAD(&bfqq->fifo);
9542 ++
9543 ++ atomic_set(&bfqq->ref, 0);
9544 ++ bfqq->bfqd = bfqd;
9545 ++
9546 ++ bfq_mark_bfqq_prio_changed(bfqq);
9547 ++
9548 ++ if (is_sync) {
9549 ++ if (!bfq_class_idle(bfqq))
9550 ++ bfq_mark_bfqq_idle_window(bfqq);
9551 ++ bfq_mark_bfqq_sync(bfqq);
9552 ++ }
9553 ++
9554 ++	/* Tentative initial value to trade off between throughput and latency */
9555 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
9556 ++ bfqq->pid = pid;
9557 ++
9558 ++ bfqq->raising_coeff = 1;
9559 ++ bfqq->last_rais_start_finish = 0;
9560 ++ /*
9561 ++ * Set to the value for which bfqq will not be deemed as
9562 ++ * soft rt when it becomes backlogged.
9563 ++ */
9564 ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
9565 ++}
9566 ++
9567 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
9568 ++ struct bfq_group *bfqg,
9569 ++ int is_sync,
9570 ++ struct bfq_io_cq *bic,
9571 ++ gfp_t gfp_mask)
9572 ++{
9573 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
9574 ++
9575 ++retry:
9576 ++ /* bic always exists here */
9577 ++ bfqq = bic_to_bfqq(bic, is_sync);
9578 ++
9579 ++ /*
9580 ++	 * Always try a new alloc if we fell back to the OOM bfqq
9581 ++ * originally, since it should just be a temporary situation.
9582 ++ */
9583 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9584 ++ bfqq = NULL;
9585 ++ if (new_bfqq != NULL) {
9586 ++ bfqq = new_bfqq;
9587 ++ new_bfqq = NULL;
9588 ++ } else if (gfp_mask & __GFP_WAIT) {
9589 ++ spin_unlock_irq(bfqd->queue->queue_lock);
9590 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
9591 ++ gfp_mask | __GFP_ZERO,
9592 ++ bfqd->queue->node);
9593 ++ spin_lock_irq(bfqd->queue->queue_lock);
9594 ++ if (new_bfqq != NULL)
9595 ++ goto retry;
9596 ++ } else {
9597 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
9598 ++ gfp_mask | __GFP_ZERO,
9599 ++ bfqd->queue->node);
9600 ++ }
9601 ++
9602 ++ if (bfqq != NULL) {
9603 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
9604 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
9605 ++ } else {
9606 ++ bfqq = &bfqd->oom_bfqq;
9607 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
9608 ++ }
9609 ++
9610 ++ bfq_init_prio_data(bfqq, bic);
9611 ++ bfq_init_entity(&bfqq->entity, bfqg);
9612 ++ }
9613 ++
9614 ++ if (new_bfqq != NULL)
9615 ++ kmem_cache_free(bfq_pool, new_bfqq);
9616 ++
9617 ++ return bfqq;
9618 ++}
9619 ++
9620 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
9621 ++ struct bfq_group *bfqg,
9622 ++ int ioprio_class, int ioprio)
9623 ++{
9624 ++ switch (ioprio_class) {
9625 ++ case IOPRIO_CLASS_RT:
9626 ++ return &bfqg->async_bfqq[0][ioprio];
9627 ++ case IOPRIO_CLASS_NONE:
9628 ++ ioprio = IOPRIO_NORM;
9629 ++ /* fall through */
9630 ++ case IOPRIO_CLASS_BE:
9631 ++ return &bfqg->async_bfqq[1][ioprio];
9632 ++ case IOPRIO_CLASS_IDLE:
9633 ++ return &bfqg->async_idle_bfqq;
9634 ++ default:
9635 ++ BUG();
9636 ++ }
9637 ++}
9638 ++
9639 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
9640 ++ struct bfq_group *bfqg, int is_sync,
9641 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
9642 ++{
9643 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9644 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9645 ++ struct bfq_queue **async_bfqq = NULL;
9646 ++ struct bfq_queue *bfqq = NULL;
9647 ++
9648 ++ if (!is_sync) {
9649 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
9650 ++ ioprio);
9651 ++ bfqq = *async_bfqq;
9652 ++ }
9653 ++
9654 ++ if (bfqq == NULL)
9655 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9656 ++
9657 ++ /*
9658 ++	 * Pin the queue now that it's allocated; scheduler exit will prune it.
9659 ++ */
9660 ++ if (!is_sync && *async_bfqq == NULL) {
9661 ++ atomic_inc(&bfqq->ref);
9662 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
9663 ++ bfqq, atomic_read(&bfqq->ref));
9664 ++ *async_bfqq = bfqq;
9665 ++ }
9666 ++
9667 ++ atomic_inc(&bfqq->ref);
9668 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
9669 ++ atomic_read(&bfqq->ref));
9670 ++ return bfqq;
9671 ++}
9672 ++
9673 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
9674 ++ struct bfq_io_cq *bic)
9675 ++{
9676 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
9677 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
9678 ++
9679 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
9680 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
9681 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
9682 ++ bic->ttime.ttime_samples;
9683 ++}
9684 ++
9685 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
9686 ++ struct bfq_queue *bfqq,
9687 ++ struct request *rq)
9688 ++{
9689 ++ sector_t sdist;
9690 ++ u64 total;
9691 ++
9692 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
9693 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
9694 ++ else
9695 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
9696 ++
9697 ++ /*
9698 ++ * Don't allow the seek distance to get too large from the
9699 ++ * odd fragment, pagein, etc.
9700 ++ */
9701 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
9702 ++ sdist = 0;
9703 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
9704 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
9705 ++ else
9706 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
9707 ++
9708 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
9709 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
9710 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
9711 ++ do_div(total, bfqq->seek_samples);
9712 ++ bfqq->seek_mean = (sector_t)total;
9713 ++
9714 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
9715 ++ (u64)bfqq->seek_mean);
9716 ++}
9717 ++
9718 ++/*
9719 ++ * Disable idle window if the process thinks too long or seeks so much that
9720 ++ * it doesn't matter.
9721 ++ */
9722 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
9723 ++ struct bfq_queue *bfqq,
9724 ++ struct bfq_io_cq *bic)
9725 ++{
9726 ++ int enable_idle;
9727 ++
9728 ++ /* Don't idle for async or idle io prio class. */
9729 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
9730 ++ return;
9731 ++
9732 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
9733 ++
9734 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
9735 ++ bfqd->bfq_slice_idle == 0 ||
9736 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
9737 ++ bfqq->raising_coeff == 1))
9738 ++ enable_idle = 0;
9739 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
9740 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
9741 ++ bfqq->raising_coeff == 1)
9742 ++ enable_idle = 0;
9743 ++ else
9744 ++ enable_idle = 1;
9745 ++ }
9746 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
9747 ++ enable_idle);
9748 ++
9749 ++ if (enable_idle)
9750 ++ bfq_mark_bfqq_idle_window(bfqq);
9751 ++ else
9752 ++ bfq_clear_bfqq_idle_window(bfqq);
9753 ++}
9754 ++
9755 ++/*
9756 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
9757 ++ * something we should do about it.
9758 ++ */
9759 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9760 ++ struct request *rq)
9761 ++{
9762 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
9763 ++
9764 ++ if (rq->cmd_flags & REQ_META)
9765 ++ bfqq->meta_pending++;
9766 ++
9767 ++ bfq_update_io_thinktime(bfqd, bic);
9768 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
9769 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
9770 ++ !BFQQ_SEEKY(bfqq))
9771 ++ bfq_update_idle_window(bfqd, bfqq, bic);
9772 ++
9773 ++ bfq_log_bfqq(bfqd, bfqq,
9774 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
9775 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
9776 ++ (long long unsigned)bfqq->seek_mean);
9777 ++
9778 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
9779 ++
9780 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
9781 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
9782 ++ blk_rq_sectors(rq) < 32;
9783 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
9784 ++
9785 ++ /*
9786 ++ * There is just this request queued: if the request
9787 ++ * is small and the queue is not to be expired, then
9788 ++ * just exit.
9789 ++ *
9790 ++ * In this way, if the disk is being idled to wait for
9791 ++ * a new request from the in-service queue, we avoid
9792 ++ * unplugging the device and committing the disk to serve
9793 ++ * just a small request. On the contrary, we wait for
9794 ++ * the block layer to decide when to unplug the device:
9795 ++ * hopefully, new requests will be merged to this one
9796 ++ * quickly, then the device will be unplugged and
9797 ++ * larger requests will be dispatched.
9798 ++ */
9799 ++ if (small_req && !budget_timeout)
9800 ++ return;
9801 ++
9802 ++ /*
9803 ++ * A large enough request arrived, or the queue is to
9804 ++ * be expired: in both cases disk idling is to be
9805 ++ * stopped, so clear wait_request flag and reset
9806 ++ * timer.
9807 ++ */
9808 ++ bfq_clear_bfqq_wait_request(bfqq);
9809 ++ del_timer(&bfqd->idle_slice_timer);
9810 ++
9811 ++ /*
9812 ++ * The queue is not empty, because a new request just
9813 ++ * arrived. Hence we can safely expire the queue, in
9814 ++ * case of budget timeout, without risking that the
9815 ++ * timestamps of the queue are not updated correctly.
9816 ++ * See [1] for more details.
9817 ++ */
9818 ++ if (budget_timeout)
9819 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9820 ++
9821 ++ /*
9822 ++ * Let the request rip immediately, or let a new queue be
9823 ++ * selected if bfqq has just been expired.
9824 ++ */
9825 ++ __blk_run_queue(bfqd->queue);
9826 ++ }
9827 ++}
9828 ++
9829 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
9830 ++{
9831 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9832 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9833 ++
9834 ++ assert_spin_locked(bfqd->queue->queue_lock);
9835 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
9836 ++
9837 ++ bfq_add_rq_rb(rq);
9838 ++
9839 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
9840 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
9841 ++
9842 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
9843 ++}
9844 ++
9845 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
9846 ++{
9847 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
9848 ++ bfqd->rq_in_driver);
9849 ++
9850 ++ if (bfqd->hw_tag == 1)
9851 ++ return;
9852 ++
9853 ++ /*
9854 ++ * This sample is valid if the number of outstanding requests
9855 ++ * is large enough to allow a queueing behavior. Note that the
9856 ++ * sum is not exact, as it's not taking into account deactivated
9857 ++ * requests.
9858 ++ */
9859 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
9860 ++ return;
9861 ++
9862 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
9863 ++ return;
9864 ++
9865 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
9866 ++ bfqd->max_rq_in_driver = 0;
9867 ++ bfqd->hw_tag_samples = 0;
9868 ++}
9869 ++
9870 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
9871 ++{
9872 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9873 ++ struct bfq_data *bfqd = bfqq->bfqd;
9874 ++ const int sync = rq_is_sync(rq);
9875 ++
9876 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
9877 ++ blk_rq_sectors(rq), sync);
9878 ++
9879 ++ bfq_update_hw_tag(bfqd);
9880 ++
9881 ++ WARN_ON(!bfqd->rq_in_driver);
9882 ++ WARN_ON(!bfqq->dispatched);
9883 ++ bfqd->rq_in_driver--;
9884 ++ bfqq->dispatched--;
9885 ++
9886 ++ if (bfq_bfqq_sync(bfqq))
9887 ++ bfqd->sync_flight--;
9888 ++
9889 ++ if (sync)
9890 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
9891 ++
9892 ++ /*
9893 ++ * The computation of softrt_next_start was scheduled for the next
9894 ++ * request completion: it is now time to compute it.
9895 ++ */
9896 ++ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
9897 ++ bfqq->soft_rt_next_start =
9898 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
9899 ++
9900 ++ /*
9901 ++ * If this is the in-service queue, check if it needs to be expired,
9902 ++ * or if we want to idle in case it has no pending requests.
9903 ++ */
9904 ++ if (bfqd->in_service_queue == bfqq) {
9905 ++ if (bfq_bfqq_budget_new(bfqq))
9906 ++ bfq_set_budget_timeout(bfqd);
9907 ++
9908 ++ if (bfq_bfqq_must_idle(bfqq)) {
9909 ++ bfq_arm_slice_timer(bfqd);
9910 ++ goto out;
9911 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
9912 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9913 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
9914 ++ (bfqq->dispatched == 0 ||
9915 ++ !bfq_bfqq_must_not_expire(bfqq)))
9916 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
9917 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
9918 ++ }
9919 ++
9920 ++ if (!bfqd->rq_in_driver)
9921 ++ bfq_schedule_dispatch(bfqd);
9922 ++
9923 ++out:
9924 ++ return;
9925 ++}
9926 ++
9927 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
9928 ++{
9929 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
9930 ++ bfq_clear_bfqq_must_alloc(bfqq);
9931 ++ return ELV_MQUEUE_MUST;
9932 ++ }
9933 ++
9934 ++ return ELV_MQUEUE_MAY;
9935 ++}
9936 ++
9937 ++static int bfq_may_queue(struct request_queue *q, int rw)
9938 ++{
9939 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9940 ++ struct task_struct *tsk = current;
9941 ++ struct bfq_io_cq *bic;
9942 ++ struct bfq_queue *bfqq;
9943 ++
9944 ++ /*
9945 ++ * Don't force setup of a queue from here, as a call to may_queue
9946 ++ * does not necessarily imply that a request actually will be queued.
9947 ++ * So just lookup a possibly existing queue, or return 'may queue'
9948 ++ * if that fails.
9949 ++ */
9950 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
9951 ++ if (bic == NULL)
9952 ++ return ELV_MQUEUE_MAY;
9953 ++
9954 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
9955 ++ if (bfqq != NULL) {
9956 ++ bfq_init_prio_data(bfqq, bic);
9957 ++
9958 ++ return __bfq_may_queue(bfqq);
9959 ++ }
9960 ++
9961 ++ return ELV_MQUEUE_MAY;
9962 ++}
9963 ++
9964 ++/*
9965 ++ * Queue lock held here.
9966 ++ */
9967 ++static void bfq_put_request(struct request *rq)
9968 ++{
9969 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9970 ++
9971 ++ if (bfqq != NULL) {
9972 ++ const int rw = rq_data_dir(rq);
9973 ++
9974 ++ BUG_ON(!bfqq->allocated[rw]);
9975 ++ bfqq->allocated[rw]--;
9976 ++
9977 ++ rq->elv.priv[0] = NULL;
9978 ++ rq->elv.priv[1] = NULL;
9979 ++
9980 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
9981 ++ bfqq, atomic_read(&bfqq->ref));
9982 ++ bfq_put_queue(bfqq);
9983 ++ }
9984 ++}
9985 ++
9986 ++static struct bfq_queue *
9987 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
9988 ++ struct bfq_queue *bfqq)
9989 ++{
9990 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
9991 ++ (long unsigned)bfqq->new_bfqq->pid);
9992 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
9993 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
9994 ++ bfq_put_queue(bfqq);
9995 ++ return bic_to_bfqq(bic, 1);
9996 ++}
9997 ++
9998 ++/*
9999 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
10000 ++ * was the last process referring to said bfqq.
10001 ++ */
10002 ++static struct bfq_queue *
10003 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
10004 ++{
10005 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
10006 ++ if (bfqq_process_refs(bfqq) == 1) {
10007 ++ bfqq->pid = current->pid;
10008 ++ bfq_clear_bfqq_coop(bfqq);
10009 ++ bfq_clear_bfqq_split_coop(bfqq);
10010 ++ return bfqq;
10011 ++ }
10012 ++
10013 ++ bic_set_bfqq(bic, NULL, 1);
10014 ++
10015 ++ bfq_put_cooperator(bfqq);
10016 ++
10017 ++ bfq_put_queue(bfqq);
10018 ++ return NULL;
10019 ++}
10020 ++
10021 ++/*
10022 ++ * Allocate bfq data structures associated with this request.
10023 ++ */
10024 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
10025 ++ struct bio *bio, gfp_t gfp_mask)
10026 ++{
10027 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
10028 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
10029 ++ const int rw = rq_data_dir(rq);
10030 ++ const int is_sync = rq_is_sync(rq);
10031 ++ struct bfq_queue *bfqq;
10032 ++ struct bfq_group *bfqg;
10033 ++ unsigned long flags;
10034 ++
10035 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
10036 ++
10037 ++ bfq_changed_ioprio(bic);
10038 ++
10039 ++ spin_lock_irqsave(q->queue_lock, flags);
10040 ++
10041 ++ if (bic == NULL)
10042 ++ goto queue_fail;
10043 ++
10044 ++ bfqg = bfq_bic_update_cgroup(bic);
10045 ++
10046 ++new_queue:
10047 ++ bfqq = bic_to_bfqq(bic, is_sync);
10048 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
10049 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
10050 ++ bic_set_bfqq(bic, bfqq, is_sync);
10051 ++ } else {
10052 ++ /*
10053 ++ * If the queue was seeky for too long, break it apart.
10054 ++ */
10055 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
10056 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
10057 ++ bfqq = bfq_split_bfqq(bic, bfqq);
10058 ++ if (!bfqq)
10059 ++ goto new_queue;
10060 ++ }
10061 ++
10062 ++ /*
10063 ++ * Check to see if this queue is scheduled to merge with
10064 ++ * another closely cooperating queue. The merging of queues
10065 ++ * happens here as it must be done in process context.
10066 ++ * The reference on new_bfqq was taken in merge_bfqqs.
10067 ++ */
10068 ++ if (bfqq->new_bfqq != NULL)
10069 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
10070 ++ }
10071 ++
10072 ++ bfqq->allocated[rw]++;
10073 ++ atomic_inc(&bfqq->ref);
10074 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
10075 ++ atomic_read(&bfqq->ref));
10076 ++
10077 ++ rq->elv.priv[0] = bic;
10078 ++ rq->elv.priv[1] = bfqq;
10079 ++
10080 ++ spin_unlock_irqrestore(q->queue_lock, flags);
10081 ++
10082 ++ return 0;
10083 ++
10084 ++queue_fail:
10085 ++ bfq_schedule_dispatch(bfqd);
10086 ++ spin_unlock_irqrestore(q->queue_lock, flags);
10087 ++
10088 ++ return 1;
10089 ++}
10090 ++
10091 ++static void bfq_kick_queue(struct work_struct *work)
10092 ++{
10093 ++ struct bfq_data *bfqd =
10094 ++ container_of(work, struct bfq_data, unplug_work);
10095 ++ struct request_queue *q = bfqd->queue;
10096 ++
10097 ++ spin_lock_irq(q->queue_lock);
10098 ++ __blk_run_queue(q);
10099 ++ spin_unlock_irq(q->queue_lock);
10100 ++}
10101 ++
10102 ++/*
10103 ++ * Handler of the expiration of the timer running if the in-service queue
10104 ++ * is idling inside its time slice.
10105 ++ */
10106 ++static void bfq_idle_slice_timer(unsigned long data)
10107 ++{
10108 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
10109 ++ struct bfq_queue *bfqq;
10110 ++ unsigned long flags;
10111 ++ enum bfqq_expiration reason;
10112 ++
10113 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
10114 ++
10115 ++ bfqq = bfqd->in_service_queue;
10116 ++ /*
10117 ++ * Theoretical race here: the in-service queue can be NULL or different
10118 ++ * from the queue that was idling if the timer handler spins on
10119 ++ * the queue_lock and a new request arrives for the current
10120 ++ * queue and there is a full dispatch cycle that changes the
10121 ++ * in-service queue. This can hardly happen, but in the worst case
10122 ++ * we just expire a queue too early.
10123 ++ */
10124 ++ if (bfqq != NULL) {
10125 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
10126 ++ if (bfq_bfqq_budget_timeout(bfqq))
10127 ++ /*
10128 ++ * Also here the queue can be safely expired
10129 ++ * for budget timeout without wasting
10130 ++ * guarantees
10131 ++ */
10132 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
10133 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
10134 ++ /*
10135 ++ * The queue may not be empty upon timer expiration,
10136 ++ * because we may not disable the timer when the first
10137 ++ * request of the in-service queue arrives during
10138 ++ * disk idling
10139 ++ */
10140 ++ reason = BFQ_BFQQ_TOO_IDLE;
10141 ++ else
10142 ++ goto schedule_dispatch;
10143 ++
10144 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
10145 ++ }
10146 ++
10147 ++schedule_dispatch:
10148 ++ bfq_schedule_dispatch(bfqd);
10149 ++
10150 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
10151 ++}
10152 ++
10153 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
10154 ++{
10155 ++ del_timer_sync(&bfqd->idle_slice_timer);
10156 ++ cancel_work_sync(&bfqd->unplug_work);
10157 ++}
10158 ++
10159 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
10160 ++ struct bfq_queue **bfqq_ptr)
10161 ++{
10162 ++ struct bfq_group *root_group = bfqd->root_group;
10163 ++ struct bfq_queue *bfqq = *bfqq_ptr;
10164 ++
10165 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
10166 ++ if (bfqq != NULL) {
10167 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
10168 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
10169 ++ bfqq, atomic_read(&bfqq->ref));
10170 ++ bfq_put_queue(bfqq);
10171 ++ *bfqq_ptr = NULL;
10172 ++ }
10173 ++}
10174 ++
10175 ++/*
10176 ++ * Release all the bfqg references to its async queues. If we are
10177 ++ * deallocating the group these queues may still contain requests, so
10178 ++ * we reparent them to the root cgroup (i.e., the only one that will
10179 ++ * exist for sure until all the requests on a device are gone).
10180 ++ */
10181 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
10182 ++{
10183 ++ int i, j;
10184 ++
10185 ++ for (i = 0; i < 2; i++)
10186 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
10187 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
10188 ++
10189 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
10190 ++}
10191 ++
10192 ++static void bfq_exit_queue(struct elevator_queue *e)
10193 ++{
10194 ++ struct bfq_data *bfqd = e->elevator_data;
10195 ++ struct request_queue *q = bfqd->queue;
10196 ++ struct bfq_queue *bfqq, *n;
10197 ++
10198 ++ bfq_shutdown_timer_wq(bfqd);
10199 ++
10200 ++ spin_lock_irq(q->queue_lock);
10201 ++
10202 ++ BUG_ON(bfqd->in_service_queue != NULL);
10203 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
10204 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
10205 ++
10206 ++ bfq_disconnect_groups(bfqd);
10207 ++ spin_unlock_irq(q->queue_lock);
10208 ++
10209 ++ bfq_shutdown_timer_wq(bfqd);
10210 ++
10211 ++ synchronize_rcu();
10212 ++
10213 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10214 ++
10215 ++ bfq_free_root_group(bfqd);
10216 ++ kfree(bfqd);
10217 ++}
10218 ++
10219 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
10220 ++{
10221 ++ struct bfq_group *bfqg;
10222 ++ struct bfq_data *bfqd;
10223 ++ struct elevator_queue *eq;
10224 ++
10225 ++ eq = elevator_alloc(q, e);
10226 ++ if (eq == NULL)
10227 ++ return -ENOMEM;
10228 ++
10229 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
10230 ++ if (bfqd == NULL) {
10231 ++ kobject_put(&eq->kobj);
10232 ++ return -ENOMEM;
10233 ++ }
10234 ++ eq->elevator_data = bfqd;
10235 ++
10236 ++ /*
10237 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
10238 ++ * Grab a permanent reference to it, so that the normal code flow
10239 ++ * will not attempt to free it.
10240 ++ */
10241 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
10242 ++ atomic_inc(&bfqd->oom_bfqq.ref);
10243 ++
10244 ++ bfqd->queue = q;
10245 ++
10246 ++ spin_lock_irq(q->queue_lock);
10247 ++ q->elevator = eq;
10248 ++ spin_unlock_irq(q->queue_lock);
10249 ++
10250 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
10251 ++ if (bfqg == NULL) {
10252 ++ kfree(bfqd);
10253 ++ kobject_put(&eq->kobj);
10254 ++ return -ENOMEM;
10255 ++ }
10256 ++
10257 ++ bfqd->root_group = bfqg;
10258 ++
10259 ++ init_timer(&bfqd->idle_slice_timer);
10260 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10261 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10262 ++
10263 ++ bfqd->rq_pos_tree = RB_ROOT;
10264 ++
10265 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10266 ++
10267 ++ INIT_LIST_HEAD(&bfqd->active_list);
10268 ++ INIT_LIST_HEAD(&bfqd->idle_list);
10269 ++
10270 ++ bfqd->hw_tag = -1;
10271 ++
10272 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
10273 ++
10274 ++ bfqd->bfq_quantum = bfq_quantum;
10275 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10276 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10277 ++ bfqd->bfq_back_max = bfq_back_max;
10278 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
10279 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
10280 ++ bfqd->bfq_class_idle_last_service = 0;
10281 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10282 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10283 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10284 ++
10285 ++ bfqd->low_latency = true;
10286 ++
10287 ++ bfqd->bfq_raising_coeff = 20;
10288 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10289 ++ bfqd->bfq_raising_max_time = 0;
10290 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10291 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10292 ++ bfqd->bfq_raising_max_softrt_rate = 7000; /*
10293 ++ * Approximate rate required
10294 ++ * to playback or record a
10295 ++ * high-definition compressed
10296 ++ * video.
10297 ++ */
10298 ++ bfqd->raised_busy_queues = 0;
10299 ++
10300 ++ /* Initially estimate the device's peak rate as the reference rate */
10301 ++ if (blk_queue_nonrot(bfqd->queue)) {
10302 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
10303 ++ bfqd->peak_rate = R_nonrot;
10304 ++ } else {
10305 ++ bfqd->RT_prod = R_rot * T_rot;
10306 ++ bfqd->peak_rate = R_rot;
10307 ++ }
10308 ++
10309 ++ return 0;
10310 ++}
10311 ++
10312 ++static void bfq_slab_kill(void)
10313 ++{
10314 ++ if (bfq_pool != NULL)
10315 ++ kmem_cache_destroy(bfq_pool);
10316 ++}
10317 ++
10318 ++static int __init bfq_slab_setup(void)
10319 ++{
10320 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10321 ++ if (bfq_pool == NULL)
10322 ++ return -ENOMEM;
10323 ++ return 0;
10324 ++}
10325 ++
10326 ++static ssize_t bfq_var_show(unsigned int var, char *page)
10327 ++{
10328 ++ return sprintf(page, "%d\n", var);
10329 ++}
10330 ++
10331 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10332 ++{
10333 ++ unsigned long new_val;
10334 ++ int ret = kstrtoul(page, 10, &new_val);
10335 ++
10336 ++ if (ret == 0)
10337 ++ *var = new_val;
10338 ++
10339 ++ return count;
10340 ++}
10341 ++
10342 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10343 ++{
10344 ++ struct bfq_data *bfqd = e->elevator_data;
10345 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10346 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10347 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10348 ++}
10349 ++
10350 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10351 ++{
10352 ++ struct bfq_queue *bfqq;
10353 ++ struct bfq_data *bfqd = e->elevator_data;
10354 ++ ssize_t num_char = 0;
10355 ++
10356 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
10357 ++ bfqd->queued);
10358 ++
10359 ++ spin_lock_irq(bfqd->queue->queue_lock);
10360 ++
10361 ++ num_char += sprintf(page + num_char, "Active:\n");
10362 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
10363 ++ num_char += sprintf(page + num_char,
10364 ++ "pid%d: weight %hu, nr_queued %d %d,"
10365 ++ " dur %d/%u\n",
10366 ++ bfqq->pid,
10367 ++ bfqq->entity.weight,
10368 ++ bfqq->queued[0],
10369 ++ bfqq->queued[1],
10370 ++ jiffies_to_msecs(jiffies -
10371 ++ bfqq->last_rais_start_finish),
10372 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
10373 ++ }
10374 ++
10375 ++ num_char += sprintf(page + num_char, "Idle:\n");
10376 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
10377 ++ num_char += sprintf(page + num_char,
10378 ++ "pid%d: weight %hu, dur %d/%u\n",
10379 ++ bfqq->pid,
10380 ++ bfqq->entity.weight,
10381 ++ jiffies_to_msecs(jiffies -
10382 ++ bfqq->last_rais_start_finish),
10383 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
10384 ++ }
10385 ++
10386 ++ spin_unlock_irq(bfqd->queue->queue_lock);
10387 ++
10388 ++ return num_char;
10389 ++}
10390 ++
10391 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
10392 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
10393 ++{ \
10394 ++ struct bfq_data *bfqd = e->elevator_data; \
10395 ++ unsigned int __data = __VAR; \
10396 ++ if (__CONV) \
10397 ++ __data = jiffies_to_msecs(__data); \
10398 ++ return bfq_var_show(__data, (page)); \
10399 ++}
10400 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
10401 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
10402 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
10403 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
10404 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
10405 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
10406 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
10407 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
10408 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
10409 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
10410 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
10411 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
10412 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
10413 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
10414 ++ 1);
10415 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
10416 ++ bfqd->bfq_raising_min_inter_arr_async,
10417 ++ 1);
10418 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
10419 ++ bfqd->bfq_raising_max_softrt_rate, 0);
10420 ++#undef SHOW_FUNCTION
10421 ++
10422 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
10423 ++static ssize_t \
10424 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
10425 ++{ \
10426 ++ struct bfq_data *bfqd = e->elevator_data; \
10427 ++ unsigned long uninitialized_var(__data); \
10428 ++ int ret = bfq_var_store(&__data, (page), count); \
10429 ++ if (__data < (MIN)) \
10430 ++ __data = (MIN); \
10431 ++ else if (__data > (MAX)) \
10432 ++ __data = (MAX); \
10433 ++ if (__CONV) \
10434 ++ *(__PTR) = msecs_to_jiffies(__data); \
10435 ++ else \
10436 ++ *(__PTR) = __data; \
10437 ++ return ret; \
10438 ++}
10439 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
10440 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
10441 ++ INT_MAX, 1);
10442 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
10443 ++ INT_MAX, 1);
10444 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
10445 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
10446 ++ INT_MAX, 0);
10447 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
10448 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
10449 ++ 1, INT_MAX, 0);
10450 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
10451 ++ INT_MAX, 1);
10452 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
10453 ++ INT_MAX, 0);
10454 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
10455 ++ INT_MAX, 1);
10456 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
10457 ++ INT_MAX, 1);
10458 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
10459 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
10460 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
10461 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
10462 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
10463 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
10464 ++#undef STORE_FUNCTION
10465 ++
10466 ++/* do nothing for the moment */
10467 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
10468 ++ const char *page, size_t count)
10469 ++{
10470 ++ return count;
10471 ++}
10472 ++
10473 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
10474 ++{
10475 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
10476 ++
10477 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
10478 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
10479 ++ else
10480 ++ return bfq_default_max_budget;
10481 ++}
10482 ++
10483 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
10484 ++ const char *page, size_t count)
10485 ++{
10486 ++ struct bfq_data *bfqd = e->elevator_data;
10487 ++ unsigned long uninitialized_var(__data);
10488 ++ int ret = bfq_var_store(&__data, (page), count);
10489 ++
10490 ++ if (__data == 0)
10491 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10492 ++ else {
10493 ++ if (__data > INT_MAX)
10494 ++ __data = INT_MAX;
10495 ++ bfqd->bfq_max_budget = __data;
10496 ++ }
10497 ++
10498 ++ bfqd->bfq_user_max_budget = __data;
10499 ++
10500 ++ return ret;
10501 ++}
10502 ++
10503 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
10504 ++ const char *page, size_t count)
10505 ++{
10506 ++ struct bfq_data *bfqd = e->elevator_data;
10507 ++ unsigned long uninitialized_var(__data);
10508 ++ int ret = bfq_var_store(&__data, (page), count);
10509 ++
10510 ++ if (__data < 1)
10511 ++ __data = 1;
10512 ++ else if (__data > INT_MAX)
10513 ++ __data = INT_MAX;
10514 ++
10515 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
10516 ++ if (bfqd->bfq_user_max_budget == 0)
10517 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10518 ++
10519 ++ return ret;
10520 ++}
10521 ++
10522 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
10523 ++ const char *page, size_t count)
10524 ++{
10525 ++ struct bfq_data *bfqd = e->elevator_data;
10526 ++ unsigned long uninitialized_var(__data);
10527 ++ int ret = bfq_var_store(&__data, (page), count);
10528 ++
10529 ++ if (__data > 1)
10530 ++ __data = 1;
10531 ++ if (__data == 0 && bfqd->low_latency != 0)
10532 ++ bfq_end_raising(bfqd);
10533 ++ bfqd->low_latency = __data;
10534 ++
10535 ++ return ret;
10536 ++}
10537 ++
10538 ++#define BFQ_ATTR(name) \
10539 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
10540 ++
10541 ++static struct elv_fs_entry bfq_attrs[] = {
10542 ++ BFQ_ATTR(quantum),
10543 ++ BFQ_ATTR(fifo_expire_sync),
10544 ++ BFQ_ATTR(fifo_expire_async),
10545 ++ BFQ_ATTR(back_seek_max),
10546 ++ BFQ_ATTR(back_seek_penalty),
10547 ++ BFQ_ATTR(slice_idle),
10548 ++ BFQ_ATTR(max_budget),
10549 ++ BFQ_ATTR(max_budget_async_rq),
10550 ++ BFQ_ATTR(timeout_sync),
10551 ++ BFQ_ATTR(timeout_async),
10552 ++ BFQ_ATTR(low_latency),
10553 ++ BFQ_ATTR(raising_coeff),
10554 ++ BFQ_ATTR(raising_max_time),
10555 ++ BFQ_ATTR(raising_rt_max_time),
10556 ++ BFQ_ATTR(raising_min_idle_time),
10557 ++ BFQ_ATTR(raising_min_inter_arr_async),
10558 ++ BFQ_ATTR(raising_max_softrt_rate),
10559 ++ BFQ_ATTR(weights),
10560 ++ __ATTR_NULL
10561 ++};
10562 ++
10563 ++static struct elevator_type iosched_bfq = {
10564 ++ .ops = {
10565 ++ .elevator_merge_fn = bfq_merge,
10566 ++ .elevator_merged_fn = bfq_merged_request,
10567 ++ .elevator_merge_req_fn = bfq_merged_requests,
10568 ++ .elevator_allow_merge_fn = bfq_allow_merge,
10569 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
10570 ++ .elevator_add_req_fn = bfq_insert_request,
10571 ++ .elevator_activate_req_fn = bfq_activate_request,
10572 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
10573 ++ .elevator_completed_req_fn = bfq_completed_request,
10574 ++ .elevator_former_req_fn = elv_rb_former_request,
10575 ++ .elevator_latter_req_fn = elv_rb_latter_request,
10576 ++ .elevator_init_icq_fn = bfq_init_icq,
10577 ++ .elevator_exit_icq_fn = bfq_exit_icq,
10578 ++ .elevator_set_req_fn = bfq_set_request,
10579 ++ .elevator_put_req_fn = bfq_put_request,
10580 ++ .elevator_may_queue_fn = bfq_may_queue,
10581 ++ .elevator_init_fn = bfq_init_queue,
10582 ++ .elevator_exit_fn = bfq_exit_queue,
10583 ++ },
10584 ++ .icq_size = sizeof(struct bfq_io_cq),
10585 ++ .icq_align = __alignof__(struct bfq_io_cq),
10586 ++ .elevator_attrs = bfq_attrs,
10587 ++ .elevator_name = "bfq",
10588 ++ .elevator_owner = THIS_MODULE,
10589 ++};
10590 ++
10591 ++static int __init bfq_init(void)
10592 ++{
10593 ++ /*
10594 ++ * Can be 0 on HZ < 1000 setups.
10595 ++ */
10596 ++ if (bfq_slice_idle == 0)
10597 ++ bfq_slice_idle = 1;
10598 ++
10599 ++ if (bfq_timeout_async == 0)
10600 ++ bfq_timeout_async = 1;
10601 ++
10602 ++ if (bfq_slab_setup())
10603 ++ return -ENOMEM;
10604 ++
10605 ++ elv_register(&iosched_bfq);
10606 ++ printk(KERN_INFO "BFQ I/O-scheduler version: v7r1");
10607 ++
10608 ++ return 0;
10609 ++}
10610 ++
10611 ++static void __exit bfq_exit(void)
10612 ++{
10613 ++ elv_unregister(&iosched_bfq);
10614 ++ bfq_slab_kill();
10615 ++}
10616 ++
10617 ++module_init(bfq_init);
10618 ++module_exit(bfq_exit);
10619 ++
10620 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
10621 ++MODULE_LICENSE("GPL");
10622 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
10623 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
10624 +new file mode 100644
10625 +index 0000000..999b475
10626 +--- /dev/null
10627 ++++ b/block/bfq-sched.c
10628 +@@ -0,0 +1,1078 @@
10629 ++/*
10630 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
10631 ++ *
10632 ++ * Based on ideas and code from CFQ:
10633 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
10634 ++ *
10635 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
10636 ++ * Paolo Valente <paolo.valente@×××××××.it>
10637 ++ *
10638 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
10639 ++ */
10640 ++
10641 ++#ifdef CONFIG_CGROUP_BFQIO
10642 ++#define for_each_entity(entity) \
10643 ++ for (; entity != NULL; entity = entity->parent)
10644 ++
10645 ++#define for_each_entity_safe(entity, parent) \
10646 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
10647 ++
10648 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
10649 ++ int extract,
10650 ++ struct bfq_data *bfqd);
10651 ++
10652 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10653 ++{
10654 ++ struct bfq_entity *bfqg_entity;
10655 ++ struct bfq_group *bfqg;
10656 ++ struct bfq_sched_data *group_sd;
10657 ++
10658 ++ BUG_ON(next_in_service == NULL);
10659 ++
10660 ++ group_sd = next_in_service->sched_data;
10661 ++
10662 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
10663 ++ /*
10664 ++ * bfq_group's my_entity field is not NULL only if the group
10665 ++ * is not the root group. We must not touch the root entity
10666 ++ * as it must never become an in-service entity.
10667 ++ */
10668 ++ bfqg_entity = bfqg->my_entity;
10669 ++ if (bfqg_entity != NULL)
10670 ++ bfqg_entity->budget = next_in_service->budget;
10671 ++}
10672 ++
10673 ++static int bfq_update_next_in_service(struct bfq_sched_data *sd)
10674 ++{
10675 ++ struct bfq_entity *next_in_service;
10676 ++
10677 ++ if (sd->in_service_entity != NULL)
10678 ++ /* will update/requeue at the end of service */
10679 ++ return 0;
10680 ++
10681 ++ /*
10682 ++ * NOTE: this can be improved in many ways, such as returning
10683 ++ * 1 (and thus propagating upwards the update) only when the
10684 ++ * budget changes, or caching the bfqq that will be scheduled
10685 ++	 * next from this subtree. For now we worry more about
10686 ++ * correctness than about performance...
10687 ++ */
10688 ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
10689 ++ sd->next_in_service = next_in_service;
10690 ++
10691 ++ if (next_in_service != NULL)
10692 ++ bfq_update_budget(next_in_service);
10693 ++
10694 ++ return 1;
10695 ++}
10696 ++
10697 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10698 ++ struct bfq_entity *entity)
10699 ++{
10700 ++ BUG_ON(sd->next_in_service != entity);
10701 ++}
10702 ++#else
10703 ++#define for_each_entity(entity) \
10704 ++ for (; entity != NULL; entity = NULL)
10705 ++
10706 ++#define for_each_entity_safe(entity, parent) \
10707 ++ for (parent = NULL; entity != NULL; entity = parent)
10708 ++
10709 ++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
10710 ++{
10711 ++ return 0;
10712 ++}
10713 ++
10714 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10715 ++ struct bfq_entity *entity)
10716 ++{
10717 ++}
10718 ++
10719 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10720 ++{
10721 ++}
10722 ++#endif
10723 ++
10724 ++/*
10725 ++ * Shift for timestamp calculations. This actually limits the maximum
10726 ++ * service allowed in one timestamp delta (small shift values increase it),
10727 ++ * the maximum total weight that can be used for the queues in the system
10728 ++ * (big shift values increase it), and the period of virtual time wraparounds.
10729 ++ */
10730 ++#define WFQ_SERVICE_SHIFT 22
10731 ++
10732 ++/**
10733 ++ * bfq_gt - compare two timestamps.
10734 ++ * @a: first ts.
10735 ++ * @b: second ts.
10736 ++ *
10737 ++ * Return @a > @b, dealing with wrapping correctly.
10738 ++ */
10739 ++static inline int bfq_gt(u64 a, u64 b)
10740 ++{
10741 ++ return (s64)(a - b) > 0;
10742 ++}
10743 ++
10744 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
10745 ++{
10746 ++ struct bfq_queue *bfqq = NULL;
10747 ++
10748 ++ BUG_ON(entity == NULL);
10749 ++
10750 ++ if (entity->my_sched_data == NULL)
10751 ++ bfqq = container_of(entity, struct bfq_queue, entity);
10752 ++
10753 ++ return bfqq;
10754 ++}
10755 ++
10756 ++
10757 ++/**
10758 ++ * bfq_delta - map service into the virtual time domain.
10759 ++ * @service: amount of service.
10760 ++ * @weight: scale factor (weight of an entity or weight sum).
10761 ++ */
10762 ++static inline u64 bfq_delta(unsigned long service,
10763 ++ unsigned long weight)
10764 ++{
10765 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
10766 ++
10767 ++ do_div(d, weight);
10768 ++ return d;
10769 ++}
10770 ++
10771 ++/**
10772 ++ * bfq_calc_finish - assign the finish time to an entity.
10773 ++ * @entity: the entity to act upon.
10774 ++ * @service: the service to be charged to the entity.
10775 ++ */
10776 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
10777 ++ unsigned long service)
10778 ++{
10779 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10780 ++
10781 ++ BUG_ON(entity->weight == 0);
10782 ++
10783 ++ entity->finish = entity->start +
10784 ++ bfq_delta(service, entity->weight);
10785 ++
10786 ++ if (bfqq != NULL) {
10787 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
10788 ++ "calc_finish: serv %lu, w %d",
10789 ++ service, entity->weight);
10790 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
10791 ++ "calc_finish: start %llu, finish %llu, delta %llu",
10792 ++ entity->start, entity->finish,
10793 ++ bfq_delta(service, entity->weight));
10794 ++ }
10795 ++}
10796 ++
10797 ++/**
10798 ++ * bfq_entity_of - get an entity from a node.
10799 ++ * @node: the node field of the entity.
10800 ++ *
10801 ++ * Convert a node pointer to the corresponding entity. This is used only
10802 ++ * to simplify the logic of some functions and not as the generic
10803 ++ * conversion mechanism because, e.g., in the tree walking functions,
10804 ++ * the check for a %NULL value would be redundant.
10805 ++ */
10806 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
10807 ++{
10808 ++ struct bfq_entity *entity = NULL;
10809 ++
10810 ++ if (node != NULL)
10811 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
10812 ++
10813 ++ return entity;
10814 ++}
10815 ++
10816 ++/**
10817 ++ * bfq_extract - remove an entity from a tree.
10818 ++ * @root: the tree root.
10819 ++ * @entity: the entity to remove.
10820 ++ */
10821 ++static inline void bfq_extract(struct rb_root *root,
10822 ++ struct bfq_entity *entity)
10823 ++{
10824 ++ BUG_ON(entity->tree != root);
10825 ++
10826 ++ entity->tree = NULL;
10827 ++ rb_erase(&entity->rb_node, root);
10828 ++}
10829 ++
10830 ++/**
10831 ++ * bfq_idle_extract - extract an entity from the idle tree.
10832 ++ * @st: the service tree of the owning @entity.
10833 ++ * @entity: the entity being removed.
10834 ++ */
10835 ++static void bfq_idle_extract(struct bfq_service_tree *st,
10836 ++ struct bfq_entity *entity)
10837 ++{
10838 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10839 ++ struct rb_node *next;
10840 ++
10841 ++ BUG_ON(entity->tree != &st->idle);
10842 ++
10843 ++ if (entity == st->first_idle) {
10844 ++ next = rb_next(&entity->rb_node);
10845 ++ st->first_idle = bfq_entity_of(next);
10846 ++ }
10847 ++
10848 ++ if (entity == st->last_idle) {
10849 ++ next = rb_prev(&entity->rb_node);
10850 ++ st->last_idle = bfq_entity_of(next);
10851 ++ }
10852 ++
10853 ++ bfq_extract(&st->idle, entity);
10854 ++
10855 ++ if (bfqq != NULL)
10856 ++ list_del(&bfqq->bfqq_list);
10857 ++}
10858 ++
10859 ++/**
10860 ++ * bfq_insert - generic tree insertion.
10861 ++ * @root: tree root.
10862 ++ * @entity: entity to insert.
10863 ++ *
10864 ++ * This is used for the idle and the active tree, since they are both
10865 ++ * ordered by finish time.
10866 ++ */
10867 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
10868 ++{
10869 ++ struct bfq_entity *entry;
10870 ++ struct rb_node **node = &root->rb_node;
10871 ++ struct rb_node *parent = NULL;
10872 ++
10873 ++ BUG_ON(entity->tree != NULL);
10874 ++
10875 ++ while (*node != NULL) {
10876 ++ parent = *node;
10877 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
10878 ++
10879 ++ if (bfq_gt(entry->finish, entity->finish))
10880 ++ node = &parent->rb_left;
10881 ++ else
10882 ++ node = &parent->rb_right;
10883 ++ }
10884 ++
10885 ++ rb_link_node(&entity->rb_node, parent, node);
10886 ++ rb_insert_color(&entity->rb_node, root);
10887 ++
10888 ++ entity->tree = root;
10889 ++}
10890 ++
10891 ++/**
10892 ++ * bfq_update_min - update the min_start field of an entity.
10893 ++ * @entity: the entity to update.
10894 ++ * @node: one of its children.
10895 ++ *
10896 ++ * This function is called when @entity may store an invalid value for
10897 ++ * min_start due to updates to the active tree. The function assumes
10898 ++ * that the subtree rooted at @node (which may be its left or its right
10899 ++ * child) has a valid min_start value.
10900 ++ */
10901 ++static inline void bfq_update_min(struct bfq_entity *entity,
10902 ++ struct rb_node *node)
10903 ++{
10904 ++ struct bfq_entity *child;
10905 ++
10906 ++ if (node != NULL) {
10907 ++ child = rb_entry(node, struct bfq_entity, rb_node);
10908 ++ if (bfq_gt(entity->min_start, child->min_start))
10909 ++ entity->min_start = child->min_start;
10910 ++ }
10911 ++}
10912 ++
10913 ++/**
10914 ++ * bfq_update_active_node - recalculate min_start.
10915 ++ * @node: the node to update.
10916 ++ *
10917 ++ * @node may have changed position or one of its children may have moved;
10918 ++ * this function updates its min_start value. The left and right subtrees
10919 ++ * are assumed to hold a correct min_start value.
10920 ++ */
10921 ++static inline void bfq_update_active_node(struct rb_node *node)
10922 ++{
10923 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
10924 ++
10925 ++ entity->min_start = entity->start;
10926 ++ bfq_update_min(entity, node->rb_right);
10927 ++ bfq_update_min(entity, node->rb_left);
10928 ++}
10929 ++
10930 ++/**
10931 ++ * bfq_update_active_tree - update min_start for the whole active tree.
10932 ++ * @node: the starting node.
10933 ++ *
10934 ++ * @node must be the deepest modified node after an update. This function
10935 ++ * updates its min_start using the values held by its children, assuming
10936 ++ * that they did not change, and then updates all the nodes that may have
10937 ++ * changed in the path to the root. The only nodes that may have changed
10938 ++ * are the ones in the path or their siblings.
10939 ++ */
10940 ++static void bfq_update_active_tree(struct rb_node *node)
10941 ++{
10942 ++ struct rb_node *parent;
10943 ++
10944 ++up:
10945 ++ bfq_update_active_node(node);
10946 ++
10947 ++ parent = rb_parent(node);
10948 ++ if (parent == NULL)
10949 ++ return;
10950 ++
10951 ++ if (node == parent->rb_left && parent->rb_right != NULL)
10952 ++ bfq_update_active_node(parent->rb_right);
10953 ++ else if (parent->rb_left != NULL)
10954 ++ bfq_update_active_node(parent->rb_left);
10955 ++
10956 ++ node = parent;
10957 ++ goto up;
10958 ++}
10959 ++
10960 ++/**
10961 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
10962 ++ * @st: the service tree of the entity.
10963 ++ * @entity: the entity being inserted.
10964 ++ *
10965 ++ * The active tree is ordered by finish time, but an extra key is kept
10966 ++ * for each node, containing the minimum value for the start times of
10967 ++ * its children (and the node itself), so it's possible to search for
10968 ++ * the eligible node with the lowest finish time in logarithmic time.
10969 ++ */
10970 ++static void bfq_active_insert(struct bfq_service_tree *st,
10971 ++ struct bfq_entity *entity)
10972 ++{
10973 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10974 ++ struct rb_node *node = &entity->rb_node;
10975 ++
10976 ++ bfq_insert(&st->active, entity);
10977 ++
10978 ++ if (node->rb_left != NULL)
10979 ++ node = node->rb_left;
10980 ++ else if (node->rb_right != NULL)
10981 ++ node = node->rb_right;
10982 ++
10983 ++ bfq_update_active_tree(node);
10984 ++
10985 ++ if (bfqq != NULL)
10986 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
10987 ++}
10988 ++
10989 ++/**
10990 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
10991 ++ * @ioprio: the ioprio value to convert.
10992 ++ */
10993 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
10994 ++{
10995 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
10996 ++ return IOPRIO_BE_NR - ioprio;
10997 ++}
10998 ++
10999 ++/**
11000 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
11001 ++ * @weight: the weight value to convert.
11002 ++ *
11003 ++ * To preserve as much as possible the old only-ioprio user interface,
11004 ++ * 0 is used as an escape ioprio value for weights (numerically) equal
11005 ++ * to or larger than IOPRIO_BE_NR.
11006 ++ */
11007 ++static unsigned short bfq_weight_to_ioprio(int weight)
11008 ++{
11009 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
11010 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
11011 ++}
11012 ++
11013 ++static inline void bfq_get_entity(struct bfq_entity *entity)
11014 ++{
11015 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11016 ++ struct bfq_sched_data *sd;
11017 ++
11018 ++ if (bfqq != NULL) {
11019 ++ sd = entity->sched_data;
11020 ++ atomic_inc(&bfqq->ref);
11021 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
11022 ++ bfqq, atomic_read(&bfqq->ref));
11023 ++ }
11024 ++}
11025 ++
11026 ++/**
11027 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
11028 ++ * @node: the node being removed.
11029 ++ *
11030 ++ * Do the first step of an extraction in an rb tree, looking for the
11031 ++ * node that will replace @node, and returning the deepest node that
11032 ++ * the following modifications to the tree can touch. If @node is the
11033 ++ * last node in the tree return %NULL.
11034 ++ */
11035 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
11036 ++{
11037 ++ struct rb_node *deepest;
11038 ++
11039 ++ if (node->rb_right == NULL && node->rb_left == NULL)
11040 ++ deepest = rb_parent(node);
11041 ++ else if (node->rb_right == NULL)
11042 ++ deepest = node->rb_left;
11043 ++ else if (node->rb_left == NULL)
11044 ++ deepest = node->rb_right;
11045 ++ else {
11046 ++ deepest = rb_next(node);
11047 ++ if (deepest->rb_right != NULL)
11048 ++ deepest = deepest->rb_right;
11049 ++ else if (rb_parent(deepest) != node)
11050 ++ deepest = rb_parent(deepest);
11051 ++ }
11052 ++
11053 ++ return deepest;
11054 ++}
11055 ++
11056 ++/**
11057 ++ * bfq_active_extract - remove an entity from the active tree.
11058 ++ * @st: the service_tree containing the tree.
11059 ++ * @entity: the entity being removed.
11060 ++ */
11061 ++static void bfq_active_extract(struct bfq_service_tree *st,
11062 ++ struct bfq_entity *entity)
11063 ++{
11064 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11065 ++ struct rb_node *node;
11066 ++
11067 ++ node = bfq_find_deepest(&entity->rb_node);
11068 ++ bfq_extract(&st->active, entity);
11069 ++
11070 ++ if (node != NULL)
11071 ++ bfq_update_active_tree(node);
11072 ++
11073 ++ if (bfqq != NULL)
11074 ++ list_del(&bfqq->bfqq_list);
11075 ++}
11076 ++
11077 ++/**
11078 ++ * bfq_idle_insert - insert an entity into the idle tree.
11079 ++ * @st: the service tree containing the tree.
11080 ++ * @entity: the entity to insert.
11081 ++ */
11082 ++static void bfq_idle_insert(struct bfq_service_tree *st,
11083 ++ struct bfq_entity *entity)
11084 ++{
11085 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11086 ++ struct bfq_entity *first_idle = st->first_idle;
11087 ++ struct bfq_entity *last_idle = st->last_idle;
11088 ++
11089 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
11090 ++ st->first_idle = entity;
11091 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
11092 ++ st->last_idle = entity;
11093 ++
11094 ++ bfq_insert(&st->idle, entity);
11095 ++
11096 ++ if (bfqq != NULL)
11097 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
11098 ++}
11099 ++
11100 ++/**
11101 ++ * bfq_forget_entity - remove an entity from the wfq trees.
11102 ++ * @st: the service tree.
11103 ++ * @entity: the entity being removed.
11104 ++ *
11105 ++ * Update the device status and forget everything about @entity, putting
11106 ++ * the device reference to it, if it is a queue. Entities belonging to
11107 ++ * groups are not refcounted.
11108 ++ */
11109 ++static void bfq_forget_entity(struct bfq_service_tree *st,
11110 ++ struct bfq_entity *entity)
11111 ++{
11112 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11113 ++ struct bfq_sched_data *sd;
11114 ++
11115 ++ BUG_ON(!entity->on_st);
11116 ++
11117 ++ entity->on_st = 0;
11118 ++ st->wsum -= entity->weight;
11119 ++ if (bfqq != NULL) {
11120 ++ sd = entity->sched_data;
11121 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
11122 ++ bfqq, atomic_read(&bfqq->ref));
11123 ++ bfq_put_queue(bfqq);
11124 ++ }
11125 ++}
11126 ++
11127 ++/**
11128 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
11129 ++ * @st: service tree for the entity.
11130 ++ * @entity: the entity being released.
11131 ++ */
11132 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
11133 ++ struct bfq_entity *entity)
11134 ++{
11135 ++ bfq_idle_extract(st, entity);
11136 ++ bfq_forget_entity(st, entity);
11137 ++}
11138 ++
11139 ++/**
11140 ++ * bfq_forget_idle - update the idle tree if necessary.
11141 ++ * @st: the service tree to act upon.
11142 ++ *
11143 ++ * To preserve the global O(log N) complexity we only remove one entry here;
11144 ++ * as the idle tree will not grow indefinitely this can be done safely.
11145 ++ */
11146 ++static void bfq_forget_idle(struct bfq_service_tree *st)
11147 ++{
11148 ++ struct bfq_entity *first_idle = st->first_idle;
11149 ++ struct bfq_entity *last_idle = st->last_idle;
11150 ++
11151 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
11152 ++ !bfq_gt(last_idle->finish, st->vtime)) {
11153 ++ /*
11154 ++ * Forget the whole idle tree, increasing the vtime past
11155 ++ * the last finish time of idle entities.
11156 ++ */
11157 ++ st->vtime = last_idle->finish;
11158 ++ }
11159 ++
11160 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
11161 ++ bfq_put_idle_entity(st, first_idle);
11162 ++}
11163 ++
11164 ++static struct bfq_service_tree *
11165 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
11166 ++ struct bfq_entity *entity)
11167 ++{
11168 ++ struct bfq_service_tree *new_st = old_st;
11169 ++
11170 ++ if (entity->ioprio_changed) {
11171 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11172 ++
11173 ++ BUG_ON(old_st->wsum < entity->weight);
11174 ++ old_st->wsum -= entity->weight;
11175 ++
11176 ++ if (entity->new_weight != entity->orig_weight) {
11177 ++ entity->orig_weight = entity->new_weight;
11178 ++ entity->ioprio =
11179 ++ bfq_weight_to_ioprio(entity->orig_weight);
11180 ++ } else if (entity->new_ioprio != entity->ioprio) {
11181 ++ entity->ioprio = entity->new_ioprio;
11182 ++ entity->orig_weight =
11183 ++ bfq_ioprio_to_weight(entity->ioprio);
11184 ++ } else
11185 ++ entity->new_weight = entity->orig_weight =
11186 ++ bfq_ioprio_to_weight(entity->ioprio);
11187 ++
11188 ++ entity->ioprio_class = entity->new_ioprio_class;
11189 ++ entity->ioprio_changed = 0;
11190 ++
11191 ++ /*
11192 ++ * NOTE: here we may be changing the weight too early,
11193 ++ * which will cause unfairness. The correct approach
11194 ++ * would have required additional complexity to defer
11195 ++ * weight changes to the proper time instants (i.e.,
11196 ++ * when entity->finish <= old_st->vtime).
11197 ++ */
11198 ++ new_st = bfq_entity_service_tree(entity);
11199 ++ entity->weight = entity->orig_weight *
11200 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
11201 ++ new_st->wsum += entity->weight;
11202 ++
11203 ++ if (new_st != old_st)
11204 ++ entity->start = new_st->vtime;
11205 ++ }
11206 ++
11207 ++ return new_st;
11208 ++}
11209 ++
11210 ++/**
11211 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
11212 ++ * @bfqq: the queue being served.
11213 ++ * @served: bytes to transfer.
11214 ++ *
11215 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
11216 ++ * are synchronized every time a new bfqq is selected for service. For now,
11217 ++ * we keep it to better check consistency.
11218 ++ */
11219 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
11220 ++{
11221 ++ struct bfq_entity *entity = &bfqq->entity;
11222 ++ struct bfq_service_tree *st;
11223 ++
11224 ++ for_each_entity(entity) {
11225 ++ st = bfq_entity_service_tree(entity);
11226 ++
11227 ++ entity->service += served;
11228 ++ BUG_ON(entity->service > entity->budget);
11229 ++ BUG_ON(st->wsum == 0);
11230 ++
11231 ++ st->vtime += bfq_delta(served, st->wsum);
11232 ++ bfq_forget_idle(st);
11233 ++ }
11234 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
11235 ++}
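
A worked example of the accounting above (a sketch, not part of the patch;
it assumes bfq_delta(served, wsum), presumably defined earlier in the patch,
scales as served/wsum in the scheduler's fixed-point units): with three
backlogged entities of weight 100 each, st->wsum is 300, so serving one
queue for served = 60 units grows that entity->service by the full 60 while
advancing st->vtime by only 60/300 = 0.2 weighted units; a larger total
weight slows the vtime advance proportionally, which is how the weights
translate into service shares.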
11236 ++
11237 ++/**
11238 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
11239 ++ * @bfqq: the queue that needs a service update.
11240 ++ *
11241 ++ * When it's not possible to be fair in the service domain, because
11242 ++ * a queue is not consuming its budget fast enough (the meaning of
11243 ++ * fast depends on the timeout parameter), we charge it a full
11244 ++ * budget. In this way we should obtain a sort of time-domain
11245 ++ * fairness among all the seeky/slow queues.
11246 ++ */
11247 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
11248 ++{
11249 ++ struct bfq_entity *entity = &bfqq->entity;
11250 ++
11251 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
11252 ++
11253 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
11254 ++}
11255 ++
11256 ++/**
11257 ++ * __bfq_activate_entity - activate an entity.
11258 ++ * @entity: the entity being activated.
11259 ++ *
11260 ++ * Called whenever an entity is activated, i.e., it is not active and one
11261 ++ * of its children receives a new request, or has to be reactivated due to
11262 ++ * budget exhaustion. It uses the current budget of the entity (and the
11263 ++ * service received if @entity is active) of the queue to calculate its
11264 ++ * timestamps.
11265 ++ */
11266 ++static void __bfq_activate_entity(struct bfq_entity *entity)
11267 ++{
11268 ++ struct bfq_sched_data *sd = entity->sched_data;
11269 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11270 ++
11271 ++ if (entity == sd->in_service_entity) {
11272 ++ BUG_ON(entity->tree != NULL);
11273 ++ /*
11274 ++ * If we are requeueing the current entity we have
11275 ++ * to take care of not charging to it service it has
11276 ++ * not received.
11277 ++ */
11278 ++ bfq_calc_finish(entity, entity->service);
11279 ++ entity->start = entity->finish;
11280 ++ sd->in_service_entity = NULL;
11281 ++ } else if (entity->tree == &st->active) {
11282 ++ /*
11283 ++ * Requeueing an entity due to a change of some
11284 ++ * next_in_service entity below it. We reuse the
11285 ++ * old start time.
11286 ++ */
11287 ++ bfq_active_extract(st, entity);
11288 ++ } else if (entity->tree == &st->idle) {
11289 ++ /*
11290 ++ * Must be on the idle tree, bfq_idle_extract() will
11291 ++ * check for that.
11292 ++ */
11293 ++ bfq_idle_extract(st, entity);
11294 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
11295 ++ st->vtime : entity->finish;
11296 ++ } else {
11297 ++ /*
11298 ++ * The finish time of the entity may be invalid, and
11299 ++ * it is in the past for sure, otherwise the queue
11300 ++ * would have been on the idle tree.
11301 ++ */
11302 ++ entity->start = st->vtime;
11303 ++ st->wsum += entity->weight;
11304 ++ bfq_get_entity(entity);
11305 ++
11306 ++ BUG_ON(entity->on_st);
11307 ++ entity->on_st = 1;
11308 ++ }
11309 ++
11310 ++ st = __bfq_entity_update_weight_prio(st, entity);
11311 ++ bfq_calc_finish(entity, entity->budget);
11312 ++ bfq_active_insert(st, entity);
11313 ++}
11314 ++
11315 ++/**
11316 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11317 ++ * @entity: the entity to activate.
11318 ++ *
11319 ++ * Activate @entity and all the entities on the path from it to the root.
11320 ++ */
11321 ++static void bfq_activate_entity(struct bfq_entity *entity)
11322 ++{
11323 ++ struct bfq_sched_data *sd;
11324 ++
11325 ++ for_each_entity(entity) {
11326 ++ __bfq_activate_entity(entity);
11327 ++
11328 ++ sd = entity->sched_data;
11329 ++ if (!bfq_update_next_in_service(sd))
11330 ++ /*
11331 ++ * No need to propagate the activation to the
11332 ++ * upper entities, as they will be updated when
11333 ++ * the in-service entity is rescheduled.
11334 ++ */
11335 ++ break;
11336 ++ }
11337 ++}
11338 ++
11339 ++/**
11340 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11341 ++ * @entity: the entity to deactivate.
11342 ++ * @requeue: if false, the entity will not be put into the idle tree.
11343 ++ *
11344 ++ * Deactivate an entity, independently from its previous state. If the
11345 ++ * entity was not on a service tree just return, otherwise if it is on
11346 ++ * any scheduler tree, extract it from that tree, and if necessary
11347 ++ * and if the caller did not specify @requeue, put it on the idle tree.
11348 ++ *
11349 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
11350 ++ * if the entity was under service or if it was the next_in_service for
11351 ++ * its sched_data; return %0 otherwise.
11352 ++ */
11353 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11354 ++{
11355 ++ struct bfq_sched_data *sd = entity->sched_data;
11356 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11357 ++ int was_in_service = entity == sd->in_service_entity;
11358 ++ int ret = 0;
11359 ++
11360 ++ if (!entity->on_st)
11361 ++ return 0;
11362 ++
11363 ++ BUG_ON(was_in_service && entity->tree != NULL);
11364 ++
11365 ++ if (was_in_service) {
11366 ++ bfq_calc_finish(entity, entity->service);
11367 ++ sd->in_service_entity = NULL;
11368 ++ } else if (entity->tree == &st->active)
11369 ++ bfq_active_extract(st, entity);
11370 ++ else if (entity->tree == &st->idle)
11371 ++ bfq_idle_extract(st, entity);
11372 ++ else if (entity->tree != NULL)
11373 ++ BUG();
11374 ++
11375 ++ if (was_in_service || sd->next_in_service == entity)
11376 ++ ret = bfq_update_next_in_service(sd);
11377 ++
11378 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
11379 ++ bfq_forget_entity(st, entity);
11380 ++ else
11381 ++ bfq_idle_insert(st, entity);
11382 ++
11383 ++ BUG_ON(sd->in_service_entity == entity);
11384 ++ BUG_ON(sd->next_in_service == entity);
11385 ++
11386 ++ return ret;
11387 ++}
11388 ++
11389 ++/**
11390 ++ * bfq_deactivate_entity - deactivate an entity.
11391 ++ * @entity: the entity to deactivate.
11392 ++ * @requeue: true if the entity can be put on the idle tree
11393 ++ */
11394 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11395 ++{
11396 ++ struct bfq_sched_data *sd;
11397 ++ struct bfq_entity *parent;
11398 ++
11399 ++ for_each_entity_safe(entity, parent) {
11400 ++ sd = entity->sched_data;
11401 ++
11402 ++ if (!__bfq_deactivate_entity(entity, requeue))
11403 ++ /*
11404 ++ * The parent entity is still backlogged, and
11405 ++ * we don't need to update it as it is still
11406 ++ * under service.
11407 ++ */
11408 ++ break;
11409 ++
11410 ++ if (sd->next_in_service != NULL)
11411 ++ /*
11412 ++ * The parent entity is still backlogged and
11413 ++ * the budgets on the path towards the root
11414 ++ * need to be updated.
11415 ++ */
11416 ++ goto update;
11417 ++
11418 ++ /*
11419 ++ * If we get here, the parent is no longer backlogged and
11420 ++ * we want to propagate the dequeue upwards.
11421 ++ */
11422 ++ requeue = 1;
11423 ++ }
11424 ++
11425 ++ return;
11426 ++
11427 ++update:
11428 ++ entity = parent;
11429 ++ for_each_entity(entity) {
11430 ++ __bfq_activate_entity(entity);
11431 ++
11432 ++ sd = entity->sched_data;
11433 ++ if (!bfq_update_next_in_service(sd))
11434 ++ break;
11435 ++ }
11436 ++}
11437 ++
11438 ++/**
11439 ++ * bfq_update_vtime - update vtime if necessary.
11440 ++ * @st: the service tree to act upon.
11441 ++ *
11442 ++ * If necessary update the service tree vtime to have at least one
11443 ++ * eligible entity, skipping to its start time. Assumes that the
11444 ++ * active tree of the device is not empty.
11445 ++ *
11446 ++ * NOTE: this hierarchical implementation updates vtimes quite often,
11447 ++ * we may end up with reactivated tasks getting timestamps after a
11448 ++ * vtime skip done because we needed a ->first_active entity on some
11449 ++ * intermediate node.
11450 ++ */
11451 ++static void bfq_update_vtime(struct bfq_service_tree *st)
11452 ++{
11453 ++ struct bfq_entity *entry;
11454 ++ struct rb_node *node = st->active.rb_node;
11455 ++
11456 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
11457 ++ if (bfq_gt(entry->min_start, st->vtime)) {
11458 ++ st->vtime = entry->min_start;
11459 ++ bfq_forget_idle(st);
11460 ++ }
11461 ++}
11462 ++
11463 ++/**
11464 ++ * bfq_first_active_entity - find the eligible entity with
11465 ++ * the smallest finish time
11466 ++ * @st: the service tree to select from.
11467 ++ *
11468 ++ * This function searches the first schedulable entity, starting from the
11469 ++ * root of the tree and going on the left every time on this side there is
11470 ++ * a subtree with at least one eligible (start >= vtime) entity. The path
11471 ++ * on the right is followed only if a) the left subtree contains no eligible
11472 ++ * entities and b) no eligible entity has been found yet.
11473 ++ */
11474 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
11475 ++{
11476 ++ struct bfq_entity *entry, *first = NULL;
11477 ++ struct rb_node *node = st->active.rb_node;
11478 ++
11479 ++ while (node != NULL) {
11480 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
11481 ++left:
11482 ++ if (!bfq_gt(entry->start, st->vtime))
11483 ++ first = entry;
11484 ++
11485 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
11486 ++
11487 ++ if (node->rb_left != NULL) {
11488 ++ entry = rb_entry(node->rb_left,
11489 ++ struct bfq_entity, rb_node);
11490 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
11491 ++ node = node->rb_left;
11492 ++ goto left;
11493 ++ }
11494 ++ }
11495 ++ if (first != NULL)
11496 ++ break;
11497 ++ node = node->rb_right;
11498 ++ }
11499 ++
11500 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
11501 ++ return first;
11502 ++}
11503 ++
11504 ++/**
11505 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
11506 ++ * @st: the service tree.
11507 ++ *
11508 ++ * Update the virtual time in @st and return the first eligible entity
11509 ++ * it contains.
11510 ++ */
11511 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
11512 ++ bool force)
11513 ++{
11514 ++ struct bfq_entity *entity, *new_next_in_service = NULL;
11515 ++
11516 ++ if (RB_EMPTY_ROOT(&st->active))
11517 ++ return NULL;
11518 ++
11519 ++ bfq_update_vtime(st);
11520 ++ entity = bfq_first_active_entity(st);
11521 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
11522 ++
11523 ++ /*
11524 ++ * If the chosen entity does not match with the sched_data's
11525 ++ * next_in_service and we are forcedly serving the IDLE priority
11526 ++ * class tree, bubble up budget update.
11527 ++ */
11528 ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
11529 ++ new_next_in_service = entity;
11530 ++ for_each_entity(new_next_in_service)
11531 ++ bfq_update_budget(new_next_in_service);
11532 ++ }
11533 ++
11534 ++ return entity;
11535 ++}
11536 ++
11537 ++/**
11538 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
11539 ++ * @sd: the sched_data.
11540 ++ * @extract: if true the returned entity will be also extracted from @sd.
11541 ++ *
11542 ++ * NOTE: since we cache the next_in_service entity at each level of the
11543 ++ * hierarchy, the complexity of the lookup can be decreased with
11544 ++ * absolutely no effort just returning the cached next_in_service value;
11545 ++ * we prefer to do full lookups to test the consistency of the data
11546 ++ * structures.
11547 ++ */
11548 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11549 ++ int extract,
11550 ++ struct bfq_data *bfqd)
11551 ++{
11552 ++ struct bfq_service_tree *st = sd->service_tree;
11553 ++ struct bfq_entity *entity;
11554 ++ int i = 0;
11555 ++
11556 ++ BUG_ON(sd->in_service_entity != NULL);
11557 ++
11558 ++ if (bfqd != NULL &&
11559 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
11560 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
11561 ++ true);
11562 ++ if (entity != NULL) {
11563 ++ i = BFQ_IOPRIO_CLASSES - 1;
11564 ++ bfqd->bfq_class_idle_last_service = jiffies;
11565 ++ sd->next_in_service = entity;
11566 ++ }
11567 ++ }
11568 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
11569 ++ entity = __bfq_lookup_next_entity(st + i, false);
11570 ++ if (entity != NULL) {
11571 ++ if (extract) {
11572 ++ bfq_check_next_in_service(sd, entity);
11573 ++ bfq_active_extract(st + i, entity);
11574 ++ sd->in_service_entity = entity;
11575 ++ sd->next_in_service = NULL;
11576 ++ }
11577 ++ break;
11578 ++ }
11579 ++ }
11580 ++
11581 ++ return entity;
11582 ++}
11583 ++
11584 ++/*
11585 ++ * Get next queue for service.
11586 ++ */
11587 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
11588 ++{
11589 ++ struct bfq_entity *entity = NULL;
11590 ++ struct bfq_sched_data *sd;
11591 ++ struct bfq_queue *bfqq;
11592 ++
11593 ++ BUG_ON(bfqd->in_service_queue != NULL);
11594 ++
11595 ++ if (bfqd->busy_queues == 0)
11596 ++ return NULL;
11597 ++
11598 ++ sd = &bfqd->root_group->sched_data;
11599 ++ for (; sd != NULL; sd = entity->my_sched_data) {
11600 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
11601 ++ BUG_ON(entity == NULL);
11602 ++ entity->service = 0;
11603 ++ }
11604 ++
11605 ++ bfqq = bfq_entity_to_bfqq(entity);
11606 ++ BUG_ON(bfqq == NULL);
11607 ++
11608 ++ return bfqq;
11609 ++}
11610 ++
11611 ++/*
11612 ++ * Forced extraction of the given queue.
11613 ++ */
11614 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
11615 ++ struct bfq_queue *bfqq)
11616 ++{
11617 ++ struct bfq_entity *entity;
11618 ++ struct bfq_sched_data *sd;
11619 ++
11620 ++ BUG_ON(bfqd->in_service_queue != NULL);
11621 ++
11622 ++ entity = &bfqq->entity;
11623 ++ /*
11624 ++ * Bubble up extraction/update from the leaf to the root.
11625 ++ */
11626 ++ for_each_entity(entity) {
11627 ++ sd = entity->sched_data;
11628 ++ bfq_update_budget(entity);
11629 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
11630 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
11631 ++ sd->in_service_entity = entity;
11632 ++ sd->next_in_service = NULL;
11633 ++ entity->service = 0;
11634 ++ }
11635 ++
11636 ++ return;
11637 ++}
11638 ++
11639 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
11640 ++{
11641 ++ if (bfqd->in_service_bic != NULL) {
11642 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
11643 ++ bfqd->in_service_bic = NULL;
11644 ++ }
11645 ++
11646 ++ bfqd->in_service_queue = NULL;
11647 ++ del_timer(&bfqd->idle_slice_timer);
11648 ++}
11649 ++
11650 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11651 ++ int requeue)
11652 ++{
11653 ++ struct bfq_entity *entity = &bfqq->entity;
11654 ++
11655 ++ if (bfqq == bfqd->in_service_queue)
11656 ++ __bfq_bfqd_reset_in_service(bfqd);
11657 ++
11658 ++ bfq_deactivate_entity(entity, requeue);
11659 ++}
11660 ++
11661 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11662 ++{
11663 ++ struct bfq_entity *entity = &bfqq->entity;
11664 ++
11665 ++ bfq_activate_entity(entity);
11666 ++}
11667 ++
11668 ++/*
11669 ++ * Called when the bfqq no longer has requests pending, remove it from
11670 ++ * the service tree.
11671 ++ */
11672 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11673 ++ int requeue)
11674 ++{
11675 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
11676 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
11677 ++
11678 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
11679 ++
11680 ++ bfq_clear_bfqq_busy(bfqq);
11681 ++
11682 ++ BUG_ON(bfqd->busy_queues == 0);
11683 ++ bfqd->busy_queues--;
11684 ++ if (bfqq->raising_coeff > 1)
11685 ++ bfqd->raised_busy_queues--;
11686 ++
11687 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
11688 ++}
11689 ++
11690 ++/*
11691 ++ * Called when an inactive queue receives a new request.
11692 ++ */
11693 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11694 ++{
11695 ++ BUG_ON(bfq_bfqq_busy(bfqq));
11696 ++ BUG_ON(bfqq == bfqd->in_service_queue);
11697 ++
11698 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
11699 ++
11700 ++ bfq_activate_bfqq(bfqd, bfqq);
11701 ++
11702 ++ bfq_mark_bfqq_busy(bfqq);
11703 ++ bfqd->busy_queues++;
11704 ++ if (bfqq->raising_coeff > 1)
11705 ++ bfqd->raised_busy_queues++;
11706 ++}
11707 +diff --git a/block/bfq.h b/block/bfq.h
11708 +new file mode 100644
11709 +index 0000000..f9b5881
11710 +--- /dev/null
11711 ++++ b/block/bfq.h
11712 +@@ -0,0 +1,614 @@
11713 ++/*
11714 ++ * BFQ-v7r1 for 3.13.0: data structures and common functions prototypes.
11715 ++ *
11716 ++ * Based on ideas and code from CFQ:
11717 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11718 ++ *
11719 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11720 ++ * Paolo Valente <paolo.valente@×××××××.it>
11721 ++ *
11722 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11723 ++ */
11724 ++
11725 ++#ifndef _BFQ_H
11726 ++#define _BFQ_H
11727 ++
11728 ++#include <linux/blktrace_api.h>
11729 ++#include <linux/hrtimer.h>
11730 ++#include <linux/ioprio.h>
11731 ++#include <linux/rbtree.h>
11732 ++
11733 ++#define BFQ_IOPRIO_CLASSES 3
11734 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
11735 ++
11736 ++#define BFQ_MIN_WEIGHT 1
11737 ++#define BFQ_MAX_WEIGHT 1000
11738 ++
11739 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
11740 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
11741 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
11742 ++
11743 ++struct bfq_entity;
11744 ++
11745 ++/**
11746 ++ * struct bfq_service_tree - per ioprio_class service tree.
11747 ++ * @active: tree for active entities (i.e., those backlogged).
11748 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
11749 ++ * @first_idle: idle entity with minimum F_i.
11750 ++ * @last_idle: idle entity with maximum F_i.
11751 ++ * @vtime: scheduler virtual time.
11752 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
11753 ++ *
11754 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
11755 ++ * ioprio_class has its own independent scheduler, and so its own
11756 ++ * bfq_service_tree. All the fields are protected by the queue lock
11757 ++ * of the containing bfqd.
11758 ++ */
11759 ++struct bfq_service_tree {
11760 ++ struct rb_root active;
11761 ++ struct rb_root idle;
11762 ++
11763 ++ struct bfq_entity *first_idle;
11764 ++ struct bfq_entity *last_idle;
11765 ++
11766 ++ u64 vtime;
11767 ++ unsigned long wsum;
11768 ++};
11769 ++
11770 ++/**
11771 ++ * struct bfq_sched_data - multi-class scheduler.
11772 ++ * @in_service_entity: entity under service.
11773 ++ * @next_in_service: head-of-the-line entity in the scheduler.
11774 ++ * @service_tree: array of service trees, one per ioprio_class.
11775 ++ *
11776 ++ * bfq_sched_data is the basic scheduler queue. It supports three
11777 ++ * ioprio_classes, and can be used either as a toplevel queue or as
11778 ++ * an intermediate queue on a hierarchical setup.
11779 ++ * @next_in_service points to the active entity of the sched_data
11780 ++ * service trees that will be scheduled next.
11781 ++ *
11782 ++ * The supported ioprio_classes are the same as in CFQ, in descending
11783 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
11784 ++ * Requests from higher priority queues are served before all the
11785 ++ * requests from lower priority queues; among requests of the same
11786 ++ * queue requests are served according to B-WF2Q+.
11787 ++ * All the fields are protected by the queue lock of the containing bfqd.
11788 ++ */
11789 ++struct bfq_sched_data {
11790 ++ struct bfq_entity *in_service_entity;
11791 ++ struct bfq_entity *next_in_service;
11792 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
11793 ++};
11794 ++
11795 ++/**
11796 ++ * struct bfq_entity - schedulable entity.
11797 ++ * @rb_node: service_tree member.
11798 ++ * @on_st: flag, true if the entity is on a tree (either the active or
11799 ++ * the idle one of its service_tree).
11800 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
11801 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
11802 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
11803 ++ * @min_start: minimum start time of the (active) subtree rooted at
11804 ++ * this entity; used for O(log N) lookups into active trees.
11805 ++ * @service: service received during the last round of service.
11806 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
11807 ++ * @weight: weight of the queue
11808 ++ * @parent: parent entity, for hierarchical scheduling.
11809 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
11810 ++ * associated scheduler queue, %NULL on leaf nodes.
11811 ++ * @sched_data: the scheduler queue this entity belongs to.
11812 ++ * @ioprio: the ioprio in use.
11813 ++ * @new_weight: when a weight change is requested, the new weight value.
11814 ++ * @orig_weight: original weight, used to implement weight boosting
11815 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
11816 ++ * @ioprio_class: the ioprio_class in use.
11817 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
11818 ++ * ioprio_class value.
11819 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
11820 ++ * ioprio_class change.
11821 ++ *
11822 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
11823 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
11824 ++ * entity belongs to the sched_data of the parent group in the cgroup
11825 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
11826 ++ * in @my_sched_data.
11827 ++ *
11828 ++ * Each entity stores independently its priority values; this would
11829 ++ * allow different weights on different devices, but this
11830 ++ * functionality is not yet exported to userspace. Priorities and
11831 ++ * weights are updated lazily, first storing the new values into the
11832 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
11833 ++ * there is a transition in the entity state that allows the priority
11834 ++ * update to take place the effective and the requested priority
11835 ++ * values are synchronized.
11836 ++ *
11837 ++ * Unless cgroups are used, the weight value is calculated from the
11838 ++ * ioprio to export the same interface as CFQ. When dealing with
11839 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
11840 ++ * time consuming their budget and have true sequential behavior, and
11841 ++ * when there are no external factors breaking anticipation) the
11842 ++ * relative weights at each level of the cgroups hierarchy should be
11843 ++ * guaranteed. All the fields are protected by the queue lock of the
11844 ++ * containing bfqd.
11845 ++ */
11846 ++struct bfq_entity {
11847 ++ struct rb_node rb_node;
11848 ++
11849 ++ int on_st;
11850 ++
11851 ++ u64 finish;
11852 ++ u64 start;
11853 ++
11854 ++ struct rb_root *tree;
11855 ++
11856 ++ u64 min_start;
11857 ++
11858 ++ unsigned long service, budget;
11859 ++ unsigned short weight, new_weight;
11860 ++ unsigned short orig_weight;
11861 ++
11862 ++ struct bfq_entity *parent;
11863 ++
11864 ++ struct bfq_sched_data *my_sched_data;
11865 ++ struct bfq_sched_data *sched_data;
11866 ++
11867 ++ unsigned short ioprio, new_ioprio;
11868 ++ unsigned short ioprio_class, new_ioprio_class;
11869 ++
11870 ++ int ioprio_changed;
11871 ++};
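
To make the lazy update protocol described above concrete, here is a
minimal sketch of how a weight change is requested (not part of the patch;
the real update paths live elsewhere in the patch set, and 300 is just an
example value within the BFQ_MIN_WEIGHT..BFQ_MAX_WEIGHT range):

    /* Request a new weight; the service trees are not touched yet. */
    entity->new_weight = 300;
    entity->ioprio_changed = 1;

    /*
     * On the next (re)activation, __bfq_activate_entity() calls
     * __bfq_entity_update_weight_prio(), which copies the new_* values
     * into the effective fields and moves the entity to the service
     * tree matching its (possibly new) ioprio_class.
     */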
11872 ++
11873 ++struct bfq_group;
11874 ++
11875 ++/**
11876 ++ * struct bfq_queue - leaf schedulable entity.
11877 ++ * @ref: reference counter.
11878 ++ * @bfqd: parent bfq_data.
11879 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
11880 ++ * one or more other queues.
11881 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
11882 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
11883 ++ * @sort_list: sorted list of pending requests.
11884 ++ * @next_rq: if fifo isn't expired, next request to serve.
11885 ++ * @queued: nr of requests queued in @sort_list.
11886 ++ * @allocated: currently allocated requests.
11887 ++ * @meta_pending: pending metadata requests.
11888 ++ * @fifo: fifo list of requests in sort_list.
11889 ++ * @entity: entity representing this queue in the scheduler.
11890 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
11891 ++ * @budget_timeout: budget expiration (in jiffies).
11892 ++ * @dispatched: number of requests on the dispatch list or inside driver.
11893 ++ * @org_ioprio: saved ioprio during boosted periods.
11894 ++ * @flags: status flags.
11895 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
11896 ++ * @seek_samples: number of seeks sampled
11897 ++ * @seek_total: sum of the distances of the seeks sampled
11898 ++ * @seek_mean: mean seek distance
11899 ++ * @last_request_pos: position of the last request enqueued
11900 ++ * @pid: pid of the process owning the queue, used for logging purposes.
11901 ++ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
11902 ++ * @raising_cur_max_time: current max raising time for this queue
11903 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
11904 ++ * idle to backlogged
11905 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
11906 ++ * since the last transition from idle to backlogged
11907 ++ *
11908 ++ * A bfq_queue is a leaf request queue; it can be associated with an io_context
11909 ++ * or more (if it is an async one). @cgroup holds a reference to the
11910 ++ * cgroup, to be sure that it does not disappear while a bfqq still
11911 ++ * references it (mostly to avoid races between request issuing and task
11912 ++ * migration followed by cgroup destruction).
11913 ++ * All the fields are protected by the queue lock of the containing bfqd.
11914 ++ */
11915 ++struct bfq_queue {
11916 ++ atomic_t ref;
11917 ++ struct bfq_data *bfqd;
11918 ++
11919 ++ /* fields for cooperating queues handling */
11920 ++ struct bfq_queue *new_bfqq;
11921 ++ struct rb_node pos_node;
11922 ++ struct rb_root *pos_root;
11923 ++
11924 ++ struct rb_root sort_list;
11925 ++ struct request *next_rq;
11926 ++ int queued[2];
11927 ++ int allocated[2];
11928 ++ int meta_pending;
11929 ++ struct list_head fifo;
11930 ++
11931 ++ struct bfq_entity entity;
11932 ++
11933 ++ unsigned long max_budget;
11934 ++ unsigned long budget_timeout;
11935 ++
11936 ++ int dispatched;
11937 ++
11938 ++ unsigned short org_ioprio;
11939 ++
11940 ++ unsigned int flags;
11941 ++
11942 ++ struct list_head bfqq_list;
11943 ++
11944 ++ unsigned int seek_samples;
11945 ++ u64 seek_total;
11946 ++ sector_t seek_mean;
11947 ++ sector_t last_request_pos;
11948 ++
11949 ++ pid_t pid;
11950 ++
11951 ++ /* weight-raising fields */
11952 ++ unsigned long raising_cur_max_time;
11953 ++ unsigned long soft_rt_next_start;
11954 ++ unsigned long last_rais_start_finish;
11955 ++ unsigned int raising_coeff;
11956 ++ unsigned long last_idle_bklogged;
11957 ++ unsigned long service_from_backlogged;
11958 ++};
11959 ++
11960 ++/**
11961 ++ * struct bfq_ttime - per process thinktime stats.
11962 ++ * @ttime_total: total process thinktime
11963 ++ * @ttime_samples: number of thinktime samples
11964 ++ * @ttime_mean: average process thinktime
11965 ++ */
11966 ++struct bfq_ttime {
11967 ++ unsigned long last_end_request;
11968 ++
11969 ++ unsigned long ttime_total;
11970 ++ unsigned long ttime_samples;
11971 ++ unsigned long ttime_mean;
11972 ++};
11973 ++
11974 ++/**
11975 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
11976 ++ * @icq: associated io_cq structure
11977 ++ * @bfqq: array of two process queues, the sync and the async
11978 ++ * @ttime: associated @bfq_ttime struct
11979 ++ */
11980 ++struct bfq_io_cq {
11981 ++ struct io_cq icq; /* must be the first member */
11982 ++ struct bfq_queue *bfqq[2];
11983 ++ struct bfq_ttime ttime;
11984 ++ int ioprio;
11985 ++};
11986 ++
11987 ++/**
11988 ++ * struct bfq_data - per device data structure.
11989 ++ * @queue: request queue for the managed device.
11990 ++ * @root_group: root bfq_group for the device.
11991 ++ * @rq_pos_tree: rbtree sorted by next_request position,
11992 ++ * used when determining if two or more queues
11993 ++ * have interleaving requests (see bfq_close_cooperator).
11994 ++ * @busy_queues: number of bfq_queues containing requests (including the
11995 ++ * queue under service, even if it is idling).
11996 ++ * @raised_busy_queues: number of weight-raised busy bfq_queues.
11997 ++ * @queued: number of queued requests.
11998 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
11999 ++ * @sync_flight: number of sync requests in the driver.
12000 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
12001 ++ * completed requests.
12002 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
12003 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
12004 ++ * @budgets_assigned: number of budgets assigned.
12005 ++ * @idle_slice_timer: timer set when idling for the next sequential request
12006 ++ * from the queue under service.
12007 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
12008 ++ * @in_service_queue: bfq_queue under service.
12009 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
12010 ++ * @last_position: on-disk position of the last served request.
12011 ++ * @last_budget_start: beginning of the last budget.
12012 ++ * @last_idling_start: beginning of the last idle slice.
12013 ++ * @peak_rate: peak transfer rate observed for a budget.
12014 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
12015 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
12016 ++ * @group_list: list of all the bfq_groups active on the device.
12017 ++ * @active_list: list of all the bfq_queues active on the device.
12018 ++ * @idle_list: list of all the bfq_queues idle on the device.
12019 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
12020 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
12021 ++ * requests are served in fifo order.
12022 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
12023 ++ * @bfq_back_max: maximum allowed backward seek.
12024 ++ * @bfq_slice_idle: maximum idling time.
12025 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
12026 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
12027 ++ * async queues.
12028 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
12029 ++ * prevent seeky queues from imposing long latencies on
12030 ++ * well-behaved ones (this also implies that seeky queues cannot
12031 ++ * receive guarantees in the service domain; after a timeout
12032 ++ * they are charged for the whole allocated budget, to try
12033 ++ * to preserve reasonably fair behavior among them, but
12034 ++ * without service-domain guarantees).
12035 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
12036 ++ * queue is multiplied
12037 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
12038 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
12039 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
12040 ++ * may be reactivated for a queue (in jiffies)
12041 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
12042 ++ * after which weight-raising may be
12043 ++ * reactivated for an already busy queue
12044 ++ * (in jiffies)
12045 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
12046 ++ * in sectors per second
12047 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
12048 ++ * duration of the weight raising automatically
12049 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
12050 ++ *
12051 ++ * All the fields are protected by the @queue lock.
12052 ++ */
12053 ++struct bfq_data {
12054 ++ struct request_queue *queue;
12055 ++
12056 ++ struct bfq_group *root_group;
12057 ++
12058 ++ struct rb_root rq_pos_tree;
12059 ++
12060 ++ int busy_queues;
12061 ++ int raised_busy_queues;
12062 ++ int queued;
12063 ++ int rq_in_driver;
12064 ++ int sync_flight;
12065 ++
12066 ++ int max_rq_in_driver;
12067 ++ int hw_tag_samples;
12068 ++ int hw_tag;
12069 ++
12070 ++ int budgets_assigned;
12071 ++
12072 ++ struct timer_list idle_slice_timer;
12073 ++ struct work_struct unplug_work;
12074 ++
12075 ++ struct bfq_queue *in_service_queue;
12076 ++ struct bfq_io_cq *in_service_bic;
12077 ++
12078 ++ sector_t last_position;
12079 ++
12080 ++ ktime_t last_budget_start;
12081 ++ ktime_t last_idling_start;
12082 ++ int peak_rate_samples;
12083 ++ u64 peak_rate;
12084 ++ unsigned long bfq_max_budget;
12085 ++
12086 ++ struct hlist_head group_list;
12087 ++ struct list_head active_list;
12088 ++ struct list_head idle_list;
12089 ++
12090 ++ unsigned int bfq_quantum;
12091 ++ unsigned int bfq_fifo_expire[2];
12092 ++ unsigned int bfq_back_penalty;
12093 ++ unsigned int bfq_back_max;
12094 ++ unsigned int bfq_slice_idle;
12095 ++ u64 bfq_class_idle_last_service;
12096 ++
12097 ++ unsigned int bfq_user_max_budget;
12098 ++ unsigned int bfq_max_budget_async_rq;
12099 ++ unsigned int bfq_timeout[2];
12100 ++
12101 ++ bool low_latency;
12102 ++
12103 ++ /* parameters of the low_latency heuristics */
12104 ++ unsigned int bfq_raising_coeff;
12105 ++ unsigned int bfq_raising_max_time;
12106 ++ unsigned int bfq_raising_rt_max_time;
12107 ++ unsigned int bfq_raising_min_idle_time;
12108 ++ unsigned long bfq_raising_min_inter_arr_async;
12109 ++ unsigned int bfq_raising_max_softrt_rate;
12110 ++ u64 RT_prod;
12111 ++
12112 ++ struct bfq_queue oom_bfqq;
12113 ++};
12114 ++
12115 ++enum bfqq_state_flags {
12116 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
12117 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
12118 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
12119 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
12120 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
12121 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
12122 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
12123 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
12124 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
12125 ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
12126 ++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
12127 ++};
12128 ++
12129 ++#define BFQ_BFQQ_FNS(name) \
12130 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
12131 ++{ \
12132 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
12133 ++} \
12134 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
12135 ++{ \
12136 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
12137 ++} \
12138 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
12139 ++{ \
12140 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
12141 ++}
12142 ++
12143 ++BFQ_BFQQ_FNS(busy);
12144 ++BFQ_BFQQ_FNS(wait_request);
12145 ++BFQ_BFQQ_FNS(must_alloc);
12146 ++BFQ_BFQQ_FNS(fifo_expire);
12147 ++BFQ_BFQQ_FNS(idle_window);
12148 ++BFQ_BFQQ_FNS(prio_changed);
12149 ++BFQ_BFQQ_FNS(sync);
12150 ++BFQ_BFQQ_FNS(budget_new);
12151 ++BFQ_BFQQ_FNS(coop);
12152 ++BFQ_BFQQ_FNS(split_coop);
12153 ++BFQ_BFQQ_FNS(softrt_update);
12154 ++#undef BFQ_BFQQ_FNS
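
For reference, a single instantiation such as BFQ_BFQQ_FNS(busy) expands
to the following three accessors (shown only as an illustration of the
macro above, not as additional patch content):

    static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
    {
            (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
    }
    static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
    {
            (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
    }
    static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
    {
            return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
    }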
12155 ++
12156 ++/* Logging facilities. */
12157 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
12158 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
12159 ++
12160 ++#define bfq_log(bfqd, fmt, args...) \
12161 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
12162 ++
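Both logging macros wrap blk_add_trace_msg(), so their output appears in
the blktrace stream of the device being scheduled; a typical call, as
already used in bfq_add_bfqq_busy() above, is simply:

    bfq_log_bfqq(bfqd, bfqq, "add to busy");
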
12163 ++/* Expiration reasons. */
12164 ++enum bfqq_expiration {
12165 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
12166 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
12167 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
12168 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
12169 ++};
12170 ++
12171 ++#ifdef CONFIG_CGROUP_BFQIO
12172 ++/**
12173 ++ * struct bfq_group - per (device, cgroup) data structure.
12174 ++ * @entity: schedulable entity to insert into the parent group sched_data.
12175 ++ * @sched_data: own sched_data, to contain child entities (they may be
12176 ++ * both bfq_queues and bfq_groups).
12177 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
12178 ++ * list of the containing cgroup's bfqio_cgroup.
12179 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
12180 ++ * of the groups active on the same device; used for cleanup.
12181 ++ * @bfqd: the bfq_data for the device this group acts upon.
12182 ++ * @async_bfqq: array of async queues for all the tasks belonging to
12183 ++ * the group, one queue per ioprio value per ioprio_class,
12184 ++ * except for the idle class that has only one queue.
12185 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
12186 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
12187 ++ * to avoid too many special cases during group creation/migration.
12188 ++ *
12189 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
12190 ++ * there is a set of bfq_groups, each one collecting the lower-level
12191 ++ * entities belonging to the group that are acting on the same device.
12192 ++ *
12193 ++ * Locking works as follows:
12194 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
12195 ++ * via RCU from its readers.
12196 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
12197 ++ * from the readers.
12198 ++ * o All the other fields are protected by the @bfqd queue lock.
12199 ++ */
12200 ++struct bfq_group {
12201 ++ struct bfq_entity entity;
12202 ++ struct bfq_sched_data sched_data;
12203 ++
12204 ++ struct hlist_node group_node;
12205 ++ struct hlist_node bfqd_node;
12206 ++
12207 ++ void *bfqd;
12208 ++
12209 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12210 ++ struct bfq_queue *async_idle_bfqq;
12211 ++
12212 ++ struct bfq_entity *my_entity;
12213 ++};
12214 ++
12215 ++/**
12216 ++ * struct bfqio_cgroup - bfq cgroup data structure.
12217 ++ * @css: subsystem state for bfq in the containing cgroup.
12218 ++ * @online: flag marked when the subsystem is inserted.
12219 ++ * @weight: cgroup weight.
12220 ++ * @ioprio: cgroup ioprio.
12221 ++ * @ioprio_class: cgroup ioprio_class.
12222 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
12223 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
12224 ++ *
12225 ++ * @group_data is accessed using RCU, with @lock protecting the updates,
12226 ++ * @ioprio and @ioprio_class are protected by @lock.
12227 ++ */
12228 ++struct bfqio_cgroup {
12229 ++ struct cgroup_subsys_state css;
12230 ++ bool online;
12231 ++
12232 ++ unsigned short weight, ioprio, ioprio_class;
12233 ++
12234 ++ spinlock_t lock;
12235 ++ struct hlist_head group_data;
12236 ++};
12237 ++#else
12238 ++struct bfq_group {
12239 ++ struct bfq_sched_data sched_data;
12240 ++
12241 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12242 ++ struct bfq_queue *async_idle_bfqq;
12243 ++};
12244 ++#endif
12245 ++
12246 ++static inline struct bfq_service_tree *
12247 ++bfq_entity_service_tree(struct bfq_entity *entity)
12248 ++{
12249 ++ struct bfq_sched_data *sched_data = entity->sched_data;
12250 ++ unsigned int idx = entity->ioprio_class - 1;
12251 ++
12252 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
12253 ++ BUG_ON(sched_data == NULL);
12254 ++
12255 ++ return sched_data->service_tree + idx;
12256 ++}
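
Given the kernel's ioprio class numbering (IOPRIO_CLASS_RT = 1,
IOPRIO_CLASS_BE = 2, IOPRIO_CLASS_IDLE = 3), the "ioprio_class - 1"
computation above maps classes to trees as follows:

    IOPRIO_CLASS_RT   -> sched_data->service_tree[0]
    IOPRIO_CLASS_BE   -> sched_data->service_tree[1]
    IOPRIO_CLASS_IDLE -> sched_data->service_tree[2]

which matches the descending priority order documented for struct
bfq_sched_data and the "st + BFQ_IOPRIO_CLASSES - 1" lookup used for the
idle class in bfq_lookup_next_entity().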
12257 ++
12258 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
12259 ++ int is_sync)
12260 ++{
12261 ++ return bic->bfqq[!!is_sync];
12262 ++}
12263 ++
12264 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
12265 ++ struct bfq_queue *bfqq, int is_sync)
12266 ++{
12267 ++ bic->bfqq[!!is_sync] = bfqq;
12268 ++}
12269 ++
12270 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
12271 ++{
12272 ++ return bic->icq.q->elevator->elevator_data;
12273 ++}
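
A minimal sketch of how these helpers are typically combined (not part of
the patch; rq, bfqg and the GFP flag are illustrative, and bfq_get_queue()
is the function forward-declared at the end of this header):

    int is_sync = rq_is_sync(rq);
    struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);

    if (bfqq == NULL) {
            bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, GFP_ATOMIC);
            bic_set_bfqq(bic, bfqq, is_sync);
    }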
12274 ++
12275 ++/**
12276 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
12277 ++ * @ptr: a pointer to a bfqd.
12278 ++ * @flags: storage for the flags to be saved.
12279 ++ *
12280 ++ * This function allows bfqg->bfqd to be protected by the
12281 ++ * queue lock of the bfqd they reference; the pointer is dereferenced
12282 ++ * under RCU, so the storage for bfqd is assured to be safe as long
12283 ++ * as the RCU read side critical section does not end. After the
12284 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12285 ++ * sure that no other writer accessed it. If we raced with a writer,
12286 ++ * the function returns NULL, with the queue unlocked, otherwise it
12287 ++ * returns the dereferenced pointer, with the queue locked.
12288 ++ */
12289 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12290 ++ unsigned long *flags)
12291 ++{
12292 ++ struct bfq_data *bfqd;
12293 ++
12294 ++ rcu_read_lock();
12295 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12296 ++
12297 ++ if (bfqd != NULL) {
12298 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12299 ++ if (*ptr == bfqd)
12300 ++ goto out;
12301 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12302 ++ }
12303 ++
12304 ++ bfqd = NULL;
12305 ++out:
12306 ++ rcu_read_unlock();
12307 ++ return bfqd;
12308 ++}
12309 ++
12310 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12311 ++ unsigned long *flags)
12312 ++{
12313 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12314 ++}
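
A minimal usage sketch of the pair above (not part of the patch; bfqg
stands for any object that publishes a bfqd pointer via RCU, as struct
bfq_group does):

    unsigned long flags;
    struct bfq_data *bfqd;

    bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
    if (bfqd != NULL) {
            /* Queue lock held and the pointer is known to be current. */
            /* ... work on bfqd ... */
            bfq_put_bfqd_unlock(bfqd, &flags);
    }
    /* A NULL return means a writer won the race; the caller just bails out. */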
12315 ++
12316 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12317 ++static void bfq_put_queue(struct bfq_queue *bfqq);
12318 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12319 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12320 ++ struct bfq_group *bfqg, int is_sync,
12321 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
12322 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12323 ++ struct bfq_group *bfqg);
12324 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12325 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12326 ++#endif
12327 +--
12328 +1.8.5.2
12329 +
12330
12331 Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
12332 ===================================================================
12333 --- genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-02-07 14:46:59 UTC (rev 2665)
12334 +++ genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666)
12335 @@ -1,1034 +0,0 @@
12336 -From 3cd9e2ea29c3ba9e420556e8ecf161d166186b63 Mon Sep 17 00:00:00 2001
12337 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
12338 -Date: Thu, 23 Jan 2014 16:54:44 +0100
12339 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7 for
12340 - 3.13.0
12341 -
12342 -A set of processes may happen to perform interleaved reads, i.e., requests
12343 -whose union would give rise to a sequential read pattern. There are two
12344 -typical cases: in the first case, processes read fixed-size chunks of
12345 -data at a fixed distance from each other, while in the second case processes
12346 -may read variable-size chunks at variable distances. The latter case occurs
12347 -for example with KVM, which splits the I/O generated by the guest into
12348 -multiple chunks, and lets these chunks be served by a pool of cooperating
12349 -processes, iteratively assigning the next chunk of I/O to the first
12350 -available process. CFQ uses actual queue merging for the first type of
12351 -rocesses, whereas it uses preemption to get a sequential read pattern out
12352 -of the read requests performed by the second type of processes. In the end
12353 -it uses two different mechanisms to achieve the same goal: boosting the
12354 -throughput with interleaved I/O.
12355 -
12356 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
12357 -sequential read pattern with both types of processes. The main idea is
12358 -checking newly arrived requests against the next request of the active queue
12359 -both in case of actual request insert and in case of request merge. By doing
12360 -so, both the types of processes can be handled by just merging their queues.
12361 -EQM is then simpler and more compact than the pair of mechanisms used in
12362 -CFQ.
12363 -
12364 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
12365 -properly restoring the weight-raising state of a queue when it gets back to
12366 -a non-merged state.
12367 -
12368 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
12369 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
12370 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
12371 ----
12372 - block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
12373 - block/bfq-sched.c | 28 ---
12374 - block/bfq.h | 16 ++
12375 - 3 files changed, 474 insertions(+), 227 deletions(-)
12376 -
12377 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
12378 -index 7670400..295236e 100644
12379 ---- a/block/bfq-iosched.c
12380 -+++ b/block/bfq-iosched.c
12381 -@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
12382 - return dur;
12383 - }
12384 -
12385 -+static inline void
12386 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
12387 -+{
12388 -+ if (bic->saved_idle_window)
12389 -+ bfq_mark_bfqq_idle_window(bfqq);
12390 -+ else
12391 -+ bfq_clear_bfqq_idle_window(bfqq);
12392 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
12393 -+ /*
12394 -+ * Start a weight raising period with the duration given by
12395 -+ * the raising_time_left snapshot.
12396 -+ */
12397 -+ if (bfq_bfqq_busy(bfqq))
12398 -+ bfqq->bfqd->raised_busy_queues++;
12399 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
12400 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
12401 -+ bfqq->last_rais_start_finish = jiffies;
12402 -+ bfqq->entity.ioprio_changed = 1;
12403 -+ }
12404 -+ /*
12405 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
12406 -+ * getting confused about the queue's need of a weight-raising
12407 -+ * period.
12408 -+ */
12409 -+ bic->raising_time_left = 0;
12410 -+}
12411 -+
12412 -+/*
12413 -+ * Must be called with the queue_lock held.
12414 -+ */
12415 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
12416 -+{
12417 -+ int process_refs, io_refs;
12418 -+
12419 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12420 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12421 -+ BUG_ON(process_refs < 0);
12422 -+ return process_refs;
12423 -+}
12424 -+
12425 - static void bfq_add_rq_rb(struct request *rq)
12426 - {
12427 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
12428 -@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
12429 - if (!bfqd->low_latency)
12430 - goto add_bfqq_busy;
12431 -
12432 -+ if (bfq_bfqq_just_split(bfqq))
12433 -+ goto set_ioprio_changed;
12434 -+
12435 - /*
12436 -- * If the queue is not being boosted and has been idle
12437 -- * for enough time, start a weight-raising period
12438 -+ * If the queue:
12439 -+ * - is not being boosted,
12440 -+ * - has been idle for enough time,
12441 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
12442 -+ * shared "for its nature" or it is not shared and its
12443 -+ * requests have not been redirected to a shared queue)
12444 -+ * start a weight-raising period.
12445 - */
12446 -- if (old_raising_coeff == 1 &&
12447 -- (idle_for_long_time || soft_rt)) {
12448 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
12449 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
12450 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
12451 - if (idle_for_long_time)
12452 - bfqq->raising_cur_max_time =
12453 -@@ -572,6 +620,7 @@ static void bfq_add_rq_rb(struct request *rq)
12454 - bfqd->bfq_raising_rt_max_time;
12455 - }
12456 - }
12457 -+set_ioprio_changed:
12458 - if (old_raising_coeff != bfqq->raising_coeff)
12459 - entity->ioprio_changed = 1;
12460 - add_bfqq_busy:
12461 -@@ -754,90 +803,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
12462 - spin_unlock_irq(bfqd->queue->queue_lock);
12463 - }
12464 -
12465 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12466 -- struct bio *bio)
12467 --{
12468 -- struct bfq_data *bfqd = q->elevator->elevator_data;
12469 -- struct bfq_io_cq *bic;
12470 -- struct bfq_queue *bfqq;
12471 --
12472 -- /*
12473 -- * Disallow merge of a sync bio into an async request.
12474 -- */
12475 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12476 -- return 0;
12477 --
12478 -- /*
12479 -- * Lookup the bfqq that this bio will be queued with. Allow
12480 -- * merge only if rq is queued there.
12481 -- * Queue lock is held here.
12482 -- */
12483 -- bic = bfq_bic_lookup(bfqd, current->io_context);
12484 -- if (bic == NULL)
12485 -- return 0;
12486 --
12487 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12488 -- return bfqq == RQ_BFQQ(rq);
12489 --}
12490 --
12491 --static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
12492 -- struct bfq_queue *bfqq)
12493 --{
12494 -- if (bfqq != NULL) {
12495 -- bfq_mark_bfqq_must_alloc(bfqq);
12496 -- bfq_mark_bfqq_budget_new(bfqq);
12497 -- bfq_clear_bfqq_fifo_expire(bfqq);
12498 --
12499 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12500 --
12501 -- bfq_log_bfqq(bfqd, bfqq,
12502 -- "set_in_service_queue, cur-budget = %lu",
12503 -- bfqq->entity.budget);
12504 -- }
12505 --
12506 -- bfqd->in_service_queue = bfqq;
12507 --}
12508 --
12509 --/*
12510 -- * Get and set a new queue for service.
12511 -- */
12512 --static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
12513 -- struct bfq_queue *bfqq)
12514 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
12515 - {
12516 -- if (!bfqq)
12517 -- bfqq = bfq_get_next_queue(bfqd);
12518 -+ if (request)
12519 -+ return blk_rq_pos(io_struct);
12520 - else
12521 -- bfq_get_next_queue_forced(bfqd, bfqq);
12522 --
12523 -- __bfq_set_in_service_queue(bfqd, bfqq);
12524 -- return bfqq;
12525 -+ return ((struct bio *)io_struct)->bi_sector;
12526 - }
12527 -
12528 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
12529 -- struct request *rq)
12530 -+static inline sector_t bfq_dist_from(sector_t pos1,
12531 -+ sector_t pos2)
12532 - {
12533 -- if (blk_rq_pos(rq) >= bfqd->last_position)
12534 -- return blk_rq_pos(rq) - bfqd->last_position;
12535 -+ if (pos1 >= pos2)
12536 -+ return pos1 - pos2;
12537 - else
12538 -- return bfqd->last_position - blk_rq_pos(rq);
12539 -+ return pos2 - pos1;
12540 - }
12541 -
12542 --/*
12543 -- * Return true if bfqq has no request pending and rq is close enough to
12544 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
12545 -- * bfqq->next_rq
12546 -- */
12547 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
12548 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
12549 -+ sector_t sector)
12550 - {
12551 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
12552 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
12553 -+ BFQQ_SEEK_THR;
12554 - }
12555 -
12556 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12557 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
12558 - {
12559 - struct rb_root *root = &bfqd->rq_pos_tree;
12560 - struct rb_node *parent, *node;
12561 - struct bfq_queue *__bfqq;
12562 -- sector_t sector = bfqd->last_position;
12563 -
12564 - if (RB_EMPTY_ROOT(root))
12565 - return NULL;
12566 -@@ -856,7 +850,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12567 - * position).
12568 - */
12569 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
12570 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12571 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12572 - return __bfqq;
12573 -
12574 - if (blk_rq_pos(__bfqq->next_rq) < sector)
12575 -@@ -867,7 +861,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12576 - return NULL;
12577 -
12578 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
12579 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12580 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12581 - return __bfqq;
12582 -
12583 - return NULL;
12584 -@@ -876,14 +870,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12585 - /*
12586 - * bfqd - obvious
12587 - * cur_bfqq - passed in so that we don't decide that the current queue
12588 -- * is closely cooperating with itself.
12589 -- *
12590 -- * We are assuming that cur_bfqq has dispatched at least one request,
12591 -- * and that bfqd->last_position reflects a position on the disk associated
12592 -- * with the I/O issued by cur_bfqq.
12593 -+ * is closely cooperating with itself
12594 -+ * sector - used as a reference point to search for a close queue
12595 - */
12596 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12597 -- struct bfq_queue *cur_bfqq)
12598 -+ struct bfq_queue *cur_bfqq,
12599 -+ sector_t sector)
12600 - {
12601 - struct bfq_queue *bfqq;
12602 -
12603 -@@ -903,7 +895,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12604 - * working closely on the same area of the disk. In that case,
12605 - * we can group them together and don't waste time idling.
12606 - */
12607 -- bfqq = bfqq_close(bfqd);
12608 -+ bfqq = bfqq_close(bfqd, sector);
12609 - if (bfqq == NULL || bfqq == cur_bfqq)
12610 - return NULL;
12611 -
12612 -@@ -930,6 +922,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12613 - return bfqq;
12614 - }
12615 -
12616 -+static struct bfq_queue *
12617 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12618 -+{
12619 -+ int process_refs, new_process_refs;
12620 -+ struct bfq_queue *__bfqq;
12621 -+
12622 -+ /*
12623 -+ * If there are no process references on the new_bfqq, then it is
12624 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12625 -+ * may have dropped their last reference (not just their last process
12626 -+ * reference).
12627 -+ */
12628 -+ if (!bfqq_process_refs(new_bfqq))
12629 -+ return NULL;
12630 -+
12631 -+ /* Avoid a circular list and skip interim queue merges. */
12632 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
12633 -+ if (__bfqq == bfqq)
12634 -+ return NULL;
12635 -+ new_bfqq = __bfqq;
12636 -+ }
12637 -+
12638 -+ process_refs = bfqq_process_refs(bfqq);
12639 -+ new_process_refs = bfqq_process_refs(new_bfqq);
12640 -+ /*
12641 -+ * If the process for the bfqq has gone away, there is no
12642 -+ * sense in merging the queues.
12643 -+ */
12644 -+ if (process_refs == 0 || new_process_refs == 0)
12645 -+ return NULL;
12646 -+
12647 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12648 -+ new_bfqq->pid);
12649 -+
12650 -+ /*
12651 -+ * Merging is just a redirection: the requests of the process owning
12652 -+ * one of the two queues are redirected to the other queue. The latter
12653 -+ * queue, in its turn, is set as shared if this is the first time that
12654 -+ * the requests of some process are redirected to it.
12655 -+ *
12656 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
12657 -+ * are in the context of the process owning bfqq, hence we have the
12658 -+ * io_cq of this process. So we can immediately configure this io_cq
12659 -+ * to redirect the requests of the process to new_bfqq.
12660 -+ *
12661 -+ * NOTE, even if new_bfqq coincides with the in-service queue, the
12662 -+ * io_cq of new_bfqq is not available, because, if the in-service queue
12663 -+ * is shared, bfqd->in_service_bic may not point to the io_cq of the
12664 -+ * in-service queue.
12665 -+ * Redirecting the requests of the process owning bfqq to the currently
12666 -+ * in-service queue is in any case the best option, as we feed the
12667 -+ * in-service queue with new requests close to the last request served
12668 -+ * and, by doing so, hopefully increase the throughput.
12669 -+ */
12670 -+ bfqq->new_bfqq = new_bfqq;
12671 -+ atomic_add(process_refs, &new_bfqq->ref);
12672 -+ return new_bfqq;
12673 -+}
12674 -+
12675 -+/*
12676 -+ * Attempt to schedule a merge of bfqq with the currently in-service queue or
12677 -+ * with a close queue among the scheduled queues.
12678 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
12679 -+ * structure otherwise.
12680 -+ */
12681 -+static struct bfq_queue *
12682 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12683 -+ void *io_struct, bool request)
12684 -+{
12685 -+ struct bfq_queue *in_service_bfqq, *new_bfqq;
12686 -+
12687 -+ if (bfqq->new_bfqq)
12688 -+ return bfqq->new_bfqq;
12689 -+
12690 -+ if (!io_struct)
12691 -+ return NULL;
12692 -+
12693 -+ in_service_bfqq = bfqd->in_service_queue;
12694 -+
12695 -+ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
12696 -+ !bfqd->in_service_bic)
12697 -+ goto check_scheduled;
12698 -+
12699 -+ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
12700 -+ goto check_scheduled;
12701 -+
12702 -+ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
12703 -+ goto check_scheduled;
12704 -+
12705 -+ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
12706 -+ goto check_scheduled;
12707 -+
12708 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
12709 -+ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
12710 -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
12711 -+ if (new_bfqq != NULL)
12712 -+ return new_bfqq; /* Merge with the in-service queue */
12713 -+ }
12714 -+
12715 -+ /*
12716 -+ * Check whether there is a cooperator among currently scheduled
12717 -+ * queues. The only thing we need is that the bio/request is not
12718 -+ * NULL, as we need it to establish whether a cooperator exists.
12719 -+ */
12720 -+check_scheduled:
12721 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
12722 -+ bfq_io_struct_pos(io_struct, request));
12723 -+ if (new_bfqq)
12724 -+ return bfq_setup_merge(bfqq, new_bfqq);
12725 -+
12726 -+ return NULL;
12727 -+}
12728 -+
12729 -+static inline void
12730 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
12731 -+{
12732 -+ /*
12733 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
12734 -+ * have already been redirected to a shared queue; both idle window
12735 -+ * and weight raising state have already been saved. Do nothing.
12736 -+ */
12737 -+ if (bfqq->bic == NULL)
12738 -+ return;
12739 -+ if (bfqq->bic->raising_time_left)
12740 -+ /*
12741 -+ * This is the queue of a just-started process, and would
12742 -+ * deserve weight raising: we set raising_time_left to the full
12743 -+ * weight-raising duration to trigger weight-raising when and
12744 -+ * if the queue is split and the first request of the queue
12745 -+ * is enqueued.
12746 -+ */
12747 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
12748 -+ else if (bfqq->raising_coeff > 1) {
12749 -+ unsigned long wrais_duration =
12750 -+ jiffies - bfqq->last_rais_start_finish;
12751 -+ /*
12752 -+ * It may happen that a queue's weight raising period lasts
12753 -+ * longer than its raising_cur_max_time, as weight raising is
12754 -+ * handled only when a request is enqueued or dispatched (it
12755 -+ * does not use any timer). If the weight raising period is
12756 -+ * about to end, don't save it.
12757 -+ */
12758 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
12759 -+ bfqq->bic->raising_time_left = 0;
12760 -+ else
12761 -+ bfqq->bic->raising_time_left =
12762 -+ bfqq->raising_cur_max_time - wrais_duration;
12763 -+ /*
12764 -+ * The bfq_queue is becoming shared or the requests of the
12765 -+ * process owning the queue are being redirected to a shared
12766 -+ * queue. Stop the weight raising period of the queue, as in
12767 -+ * both cases it should not be owned by an interactive or soft
12768 -+ * real-time application.
12769 -+ */
12770 -+ bfq_bfqq_end_raising(bfqq);
12771 -+ } else
12772 -+ bfqq->bic->raising_time_left = 0;
12773 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
12774 -+}
12775 -+
12776 -+static inline void
12777 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
12778 -+{
12779 -+ /*
12780 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
12781 -+ * is about to begin using a shared bfq_queue.
12782 -+ */
12783 -+ if (bfqq->bic)
12784 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
12785 -+}
12786 -+
12787 -+static void
12788 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
12789 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12790 -+{
12791 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
12792 -+ (long unsigned)new_bfqq->pid);
12793 -+ /* Save weight raising and idle window of the merged queues */
12794 -+ bfq_bfqq_save_state(bfqq);
12795 -+ bfq_bfqq_save_state(new_bfqq);
12796 -+ /*
12797 -+ * Grab a reference to the bic, to prevent it from being destroyed
12798 -+ * before being possibly touched by a bfq_split_bfqq().
12799 -+ */
12800 -+ bfq_get_bic_reference(bfqq);
12801 -+ bfq_get_bic_reference(new_bfqq);
12802 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
12803 -+ bic_set_bfqq(bic, new_bfqq, 1);
12804 -+ bfq_mark_bfqq_coop(new_bfqq);
12805 -+ /*
12806 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
12807 -+ * new_bfqq->bic to NULL. bfqq either:
12808 -+ * - does not belong to any bic any more, and hence bfqq->bic must
12809 -+ * be set to NULL, or
12810 -+ * - is a queue whose owning bics have already been redirected to a
12811 -+ * different queue, hence the queue is destined to not belong to any
12812 -+ * bic soon and bfqq->bic is already NULL (therefore the next
12813 -+ * assignment causes no harm).
12814 -+ */
12815 -+ new_bfqq->bic = NULL;
12816 -+ bfqq->bic = NULL;
12817 -+ bfq_put_queue(bfqq);
12818 -+}
12819 -+
12820 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12821 -+ struct bio *bio)
12822 -+{
12823 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
12824 -+ struct bfq_io_cq *bic;
12825 -+ struct bfq_queue *bfqq, *new_bfqq;
12826 -+
12827 -+ /*
12828 -+ * Disallow merge of a sync bio into an async request.
12829 -+ */
12830 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12831 -+ return 0;
12832 -+
12833 -+ /*
12834 -+ * Lookup the bfqq that this bio will be queued with. Allow
12835 -+ * merge only if rq is queued there.
12836 -+ * Queue lock is held here.
12837 -+ */
12838 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
12839 -+ if (bic == NULL)
12840 -+ return 0;
12841 -+
12842 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12843 -+ /*
12844 -+ * We take advantage of this function to perform an early merge
12845 -+ * of the queues of possible cooperating processes.
12846 -+ */
12847 -+ if (bfqq != NULL) {
12848 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
12849 -+ if (new_bfqq != NULL) {
12850 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
12851 -+ /*
12852 -+ * If we get here, the bio will be queued in the shared queue,
12853 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
12854 -+ * rq can be merged.
12855 -+ */
12856 -+ bfqq = new_bfqq;
12857 -+ }
12858 -+ }
12859 -+
12860 -+ return bfqq == RQ_BFQQ(rq);
12861 -+}
12862 -+
12863 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
12864 -+ struct bfq_queue *bfqq)
12865 -+{
12866 -+ if (bfqq != NULL) {
12867 -+ bfq_mark_bfqq_must_alloc(bfqq);
12868 -+ bfq_mark_bfqq_budget_new(bfqq);
12869 -+ bfq_clear_bfqq_fifo_expire(bfqq);
12870 -+
12871 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12872 -+
12873 -+ bfq_log_bfqq(bfqd, bfqq,
12874 -+ "set_in_service_queue, cur-budget = %lu",
12875 -+ bfqq->entity.budget);
12876 -+ }
12877 -+
12878 -+ bfqd->in_service_queue = bfqq;
12879 -+}
12880 -+
12881 -+/*
12882 -+ * Get and set a new queue for service.
12883 -+ */
12884 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
12885 -+{
12886 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
12887 -+
12888 -+ __bfq_set_in_service_queue(bfqd, bfqq);
12889 -+ return bfqq;
12890 -+}
12891 -+
12892 - /*
12893 - * If enough samples have been computed, return the current max budget
12894 - * stored in bfqd, which is dynamically updated according to the
12895 -@@ -1077,63 +1345,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
12896 - return rq;
12897 - }
12898 -
12899 --/*
12900 -- * Must be called with the queue_lock held.
12901 -- */
12902 --static int bfqq_process_refs(struct bfq_queue *bfqq)
12903 --{
12904 -- int process_refs, io_refs;
12905 --
12906 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12907 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12908 -- BUG_ON(process_refs < 0);
12909 -- return process_refs;
12910 --}
12911 --
12912 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12913 --{
12914 -- int process_refs, new_process_refs;
12915 -- struct bfq_queue *__bfqq;
12916 --
12917 -- /*
12918 -- * If there are no process references on the new_bfqq, then it is
12919 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12920 -- * may have dropped their last reference (not just their last process
12921 -- * reference).
12922 -- */
12923 -- if (!bfqq_process_refs(new_bfqq))
12924 -- return;
12925 --
12926 -- /* Avoid a circular list and skip interim queue merges. */
12927 -- while ((__bfqq = new_bfqq->new_bfqq)) {
12928 -- if (__bfqq == bfqq)
12929 -- return;
12930 -- new_bfqq = __bfqq;
12931 -- }
12932 --
12933 -- process_refs = bfqq_process_refs(bfqq);
12934 -- new_process_refs = bfqq_process_refs(new_bfqq);
12935 -- /*
12936 -- * If the process for the bfqq has gone away, there is no
12937 -- * sense in merging the queues.
12938 -- */
12939 -- if (process_refs == 0 || new_process_refs == 0)
12940 -- return;
12941 --
12942 -- /*
12943 -- * Merge in the direction of the lesser amount of work.
12944 -- */
12945 -- if (new_process_refs >= process_refs) {
12946 -- bfqq->new_bfqq = new_bfqq;
12947 -- atomic_add(process_refs, &new_bfqq->ref);
12948 -- } else {
12949 -- new_bfqq->new_bfqq = bfqq;
12950 -- atomic_add(new_process_refs, &bfqq->ref);
12951 -- }
12952 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12953 -- new_bfqq->pid);
12954 --}
12955 --
12956 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
12957 - {
12958 - struct bfq_entity *entity = &bfqq->entity;
12959 -@@ -1703,7 +1914,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
12960 - */
12961 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12962 - {
12963 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
12964 -+ struct bfq_queue *bfqq;
12965 - struct request *next_rq;
12966 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
12967 -
12968 -@@ -1713,17 +1924,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12969 -
12970 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
12971 -
12972 -- /*
12973 -- * If another queue has a request waiting within our mean seek
12974 -- * distance, let it run. The expire code will check for close
12975 -- * cooperators and put the close queue at the front of the
12976 -- * service tree. If possible, merge the expiring queue with the
12977 -- * new bfqq.
12978 -- */
12979 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
12980 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
12981 -- bfq_setup_merge(bfqq, new_bfqq);
12982 --
12983 - if (bfq_may_expire_for_budg_timeout(bfqq) &&
12984 - !timer_pending(&bfqd->idle_slice_timer) &&
12985 - !bfq_bfqq_must_idle(bfqq))
12986 -@@ -1760,36 +1960,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12987 - bfq_clear_bfqq_wait_request(bfqq);
12988 - del_timer(&bfqd->idle_slice_timer);
12989 - }
12990 -- if (new_bfqq == NULL)
12991 -- goto keep_queue;
12992 -- else
12993 -- goto expire;
12994 -+ goto keep_queue;
12995 - }
12996 - }
12997 -
12998 - /*
12999 -- * No requests pending. If the in-service queue has no cooperator and
13000 -- * still has requests in flight (possibly waiting for a completion)
13001 -- * or is idling for a new request, then keep it.
13002 -+ * No requests pending. If the in-service queue still has requests in
13003 -+ * flight (possibly waiting for a completion) or is idling for a new
13004 -+ * request, then keep it.
13005 - */
13006 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
13007 -- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
13008 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
13009 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
13010 - bfqq = NULL;
13011 - goto keep_queue;
13012 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
13013 -- /*
13014 -- * Expiring the queue because there is a close cooperator,
13015 -- * cancel timer.
13016 -- */
13017 -- bfq_clear_bfqq_wait_request(bfqq);
13018 -- del_timer(&bfqd->idle_slice_timer);
13019 - }
13020 -
13021 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
13022 - expire:
13023 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
13024 - new_queue:
13025 -- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
13026 -+ bfqq = bfq_set_in_service_queue(bfqd);
13027 - bfq_log(bfqd, "select_queue: new queue %d returned",
13028 - bfqq != NULL ? bfqq->pid : 0);
13029 - keep_queue:
13030 -@@ -1799,9 +1989,8 @@ keep_queue:
13031 - static void bfq_update_raising_data(struct bfq_data *bfqd,
13032 - struct bfq_queue *bfqq)
13033 - {
13034 -+ struct bfq_entity *entity = &bfqq->entity;
13035 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
13036 -- struct bfq_entity *entity = &bfqq->entity;
13037 --
13038 - bfq_log_bfqq(bfqd, bfqq,
13039 - "raising period dur %u/%u msec, "
13040 - "old raising coeff %u, w %d(%d)",
13041 -@@ -1818,7 +2007,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13042 - "WARN: pending prio change");
13043 - /*
13044 - * If too much time has elapsed from the beginning
13045 -- * of this weight-raising, stop it.
13046 -+ * of this weight-raising period, stop it.
13047 - */
13048 - if (jiffies - bfqq->last_rais_start_finish >
13049 - bfqq->raising_cur_max_time) {
13050 -@@ -1830,11 +2019,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13051 - jiffies_to_msecs(bfqq->
13052 - raising_cur_max_time));
13053 - bfq_bfqq_end_raising(bfqq);
13054 -- __bfq_entity_update_weight_prio(
13055 -- bfq_entity_service_tree(entity),
13056 -- entity);
13057 - }
13058 - }
13059 -+ /* Update weight both if it must be raised and if it must be lowered */
13060 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
13061 -+ __bfq_entity_update_weight_prio(
13062 -+ bfq_entity_service_tree(entity),
13063 -+ entity);
13064 - }
13065 -
13066 - /*
13067 -@@ -2075,6 +2266,25 @@ static void bfq_init_icq(struct io_cq *icq)
13068 - struct bfq_io_cq *bic = icq_to_bic(icq);
13069 -
13070 - bic->ttime.last_end_request = jiffies;
13071 -+ /*
13072 -+ * A newly created bic indicates that the process has just
13073 -+ * started doing I/O, and is probably mapping into memory its
13074 -+ * executable and libraries: it definitely needs weight raising.
13075 -+ * There is however the possibility that the process performs,
13076 -+ * for a while, I/O close to some other process. EQM intercepts
13077 -+ * this behavior and may merge the queue corresponding to the
13078 -+ * process with some other queue, BEFORE the weight of the queue
13079 -+ * is raised. Merged queues are not weight-raised (they are assumed
13080 -+ * to belong to processes that benefit only from high throughput).
13081 -+ * If the merge is basically the consequence of an accident, then
13082 -+ * the queue will be split soon and will get back its old weight.
13083 -+ * It is then important to write down somewhere that this queue
13084 -+ * does need weight raising, even if it did not make it to get its
13085 -+ * weight raised before being merged. To this purpose, we overload
13086 -+ * the field raising_time_left and assign 1 to it, to mark the queue
13087 -+ * as needing weight raising.
13088 -+ */
13089 -+ bic->raising_time_left = 1;
13090 - }
13091 -
13092 - static void bfq_exit_icq(struct io_cq *icq)
13093 -@@ -2088,6 +2298,13 @@ static void bfq_exit_icq(struct io_cq *icq)
13094 - }
13095 -
13096 - if (bic->bfqq[BLK_RW_SYNC]) {
13097 -+ /*
13098 -+ * If the bic is using a shared queue, put the reference
13099 -+ * taken on the io_context when the bic started using a
13100 -+ * shared bfq_queue.
13101 -+ */
13102 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
13103 -+ put_io_context(icq->ioc);
13104 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
13105 - bic->bfqq[BLK_RW_SYNC] = NULL;
13106 - }
13107 -@@ -2375,6 +2592,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
13108 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
13109 - return;
13110 -
13111 -+ /* Idle window just restored, statistics are meaningless. */
13112 -+ if (bfq_bfqq_just_split(bfqq))
13113 -+ return;
13114 -+
13115 - enable_idle = bfq_bfqq_idle_window(bfqq);
13116 -
13117 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
13118 -@@ -2415,6 +2636,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13119 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
13120 - !BFQQ_SEEKY(bfqq))
13121 - bfq_update_idle_window(bfqd, bfqq, bic);
13122 -+ bfq_clear_bfqq_just_split(bfqq);
13123 -
13124 - bfq_log_bfqq(bfqd, bfqq,
13125 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
13126 -@@ -2475,13 +2697,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13127 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
13128 - {
13129 - struct bfq_data *bfqd = q->elevator->elevator_data;
13130 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
13131 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
13132 -
13133 - assert_spin_locked(bfqd->queue->queue_lock);
13134 -+
13135 -+ /*
13136 -+ * An unplug may trigger a requeue of a request from the device
13137 -+ * driver: make sure we are in process context while trying to
13138 -+ * merge two bfq_queues.
13139 -+ */
13140 -+ if (!in_interrupt()) {
13141 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
13142 -+ if (new_bfqq != NULL) {
13143 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
13144 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
13145 -+ /*
13146 -+ * Release the request's reference to the old bfqq
13147 -+ * and make sure one is taken to the shared queue.
13148 -+ */
13149 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
13150 -+ bfqq->allocated[rq_data_dir(rq)]--;
13151 -+ atomic_inc(&new_bfqq->ref);
13152 -+ bfq_put_queue(bfqq);
13153 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
13154 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
13155 -+ bfqq, new_bfqq);
13156 -+ rq->elv.priv[1] = new_bfqq;
13157 -+ bfqq = new_bfqq;
13158 -+ }
13159 -+ }
13160 -+
13161 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
13162 -
13163 - bfq_add_rq_rb(rq);
13164 -
13165 -+ /*
13166 -+ * Here a newly-created bfq_queue has already started a weight-raising
13167 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
13168 -+ * from assigning it a full weight-raising period. See the detailed
13169 -+ * comments about this field in bfq_init_icq().
13170 -+ */
13171 -+ if (bfqq->bic != NULL)
13172 -+ bfqq->bic->raising_time_left = 0;
13173 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
13174 - list_add_tail(&rq->queuelist, &bfqq->fifo);
13175 -
13176 -@@ -2629,18 +2886,6 @@ static void bfq_put_request(struct request *rq)
13177 - }
13178 - }
13179 -
13180 --static struct bfq_queue *
13181 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13182 -- struct bfq_queue *bfqq)
13183 --{
13184 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13185 -- (long unsigned)bfqq->new_bfqq->pid);
13186 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
13187 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
13188 -- bfq_put_queue(bfqq);
13189 -- return bic_to_bfqq(bic, 1);
13190 --}
13191 --
13192 - /*
13193 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
13194 - * was the last process referring to said bfqq.
13195 -@@ -2649,6 +2894,9 @@ static struct bfq_queue *
13196 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
13197 - {
13198 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
13199 -+
13200 -+ put_io_context(bic->icq.ioc);
13201 -+
13202 - if (bfqq_process_refs(bfqq) == 1) {
13203 - bfqq->pid = current->pid;
13204 - bfq_clear_bfqq_coop(bfqq);
13205 -@@ -2677,6 +2925,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
13206 - struct bfq_queue *bfqq;
13207 - struct bfq_group *bfqg;
13208 - unsigned long flags;
13209 -+ bool split = false;
13210 -
13211 - might_sleep_if(gfp_mask & __GFP_WAIT);
13212 -
13213 -@@ -2695,24 +2944,14 @@ new_queue:
13214 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
13215 - bic_set_bfqq(bic, bfqq, is_sync);
13216 - } else {
13217 -- /*
13218 -- * If the queue was seeky for too long, break it apart.
13219 -- */
13220 -+ /* If the queue was seeky for too long, break it apart. */
13221 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
13222 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
13223 - bfqq = bfq_split_bfqq(bic, bfqq);
13224 -+ split = true;
13225 - if (!bfqq)
13226 - goto new_queue;
13227 - }
13228 --
13229 -- /*
13230 -- * Check to see if this queue is scheduled to merge with
13231 -- * another closely cooperating queue. The merging of queues
13232 -- * happens here as it must be done in process context.
13233 -- * The reference on new_bfqq was taken in merge_bfqqs.
13234 -- */
13235 -- if (bfqq->new_bfqq != NULL)
13236 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
13237 - }
13238 -
13239 - bfqq->allocated[rw]++;
13240 -@@ -2723,6 +2962,26 @@ new_queue:
13241 - rq->elv.priv[0] = bic;
13242 - rq->elv.priv[1] = bfqq;
13243 -
13244 -+ /*
13245 -+ * If a bfq_queue has only one process reference, it is owned
13246 -+ * by only one bfq_io_cq: we can set the bic field of the
13247 -+ * bfq_queue to the address of that structure. Also, if the
13248 -+ * queue has just been split, mark a flag so that the
13249 -+ * information is available to the other scheduler hooks.
13250 -+ */
13251 -+ if (bfqq_process_refs(bfqq) == 1) {
13252 -+ bfqq->bic = bic;
13253 -+ if (split) {
13254 -+ bfq_mark_bfqq_just_split(bfqq);
13255 -+ /*
13256 -+ * If the queue has just been split from a shared queue,
13257 -+ * restore the idle window and the possible weight
13258 -+ * raising period.
13259 -+ */
13260 -+ bfq_bfqq_resume_state(bfqq, bic);
13261 -+ }
13262 -+ }
13263 -+
13264 - spin_unlock_irqrestore(q->queue_lock, flags);
13265 -
13266 - return 0;
13267 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
13268 -index 30df81c..47e66a8 100644
13269 ---- a/block/bfq-sched.c
13270 -+++ b/block/bfq-sched.c
13271 -@@ -979,34 +979,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
13272 - return bfqq;
13273 - }
13274 -
13275 --/*
13276 -- * Forced extraction of the given queue.
13277 -- */
13278 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
13279 -- struct bfq_queue *bfqq)
13280 --{
13281 -- struct bfq_entity *entity;
13282 -- struct bfq_sched_data *sd;
13283 --
13284 -- BUG_ON(bfqd->in_service_queue != NULL);
13285 --
13286 -- entity = &bfqq->entity;
13287 -- /*
13288 -- * Bubble up extraction/update from the leaf to the root.
13289 -- */
13290 -- for_each_entity(entity) {
13291 -- sd = entity->sched_data;
13292 -- bfq_update_budget(entity);
13293 -- bfq_update_vtime(bfq_entity_service_tree(entity));
13294 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
13295 -- sd->active_entity = entity;
13296 -- sd->next_active = NULL;
13297 -- entity->service = 0;
13298 -- }
13299 --
13300 -- return;
13301 --}
13302 --
13303 - static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
13304 - {
13305 - if (bfqd->in_service_bic != NULL) {
13306 -diff --git a/block/bfq.h b/block/bfq.h
13307 -index 68b28e3..438f560 100644
13308 ---- a/block/bfq.h
13309 -+++ b/block/bfq.h
13310 -@@ -192,6 +192,8 @@ struct bfq_group;
13311 - * idle to backlogged
13312 - * @service_from_backlogged: cumulative service received from the @bfq_queue
13313 - * since the last transition from idle to backlogged
13314 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
13315 -+ * queue is shared
13316 - *
13317 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
13318 - * or more (if it is an async one). @cgroup holds a reference to the
13319 -@@ -235,6 +237,7 @@ struct bfq_queue {
13320 - sector_t last_request_pos;
13321 -
13322 - pid_t pid;
13323 -+ struct bfq_io_cq *bic;
13324 -
13325 - /* weight-raising fields */
13326 - unsigned int raising_cur_max_time;
13327 -@@ -264,12 +267,23 @@ struct bfq_ttime {
13328 - * @icq: associated io_cq structure
13329 - * @bfqq: array of two process queues, the sync and the async
13330 - * @ttime: associated @bfq_ttime struct
13331 -+ * @raising_time_left: snapshot of the time left before weight raising ends
13332 -+ * for the sync queue associated to this process; this
13333 -+ * snapshot is taken to remember this value while the weight
13334 -+ * raising is suspended because the queue is merged with a
13335 -+ * shared queue, and is used to set @raising_cur_max_time
13336 -+ * when the queue is split from the shared queue and its
13337 -+ * weight is raised again
13338 -+ * @saved_idle_window: same purpose as the previous field for the idle window
13339 - */
13340 - struct bfq_io_cq {
13341 - struct io_cq icq; /* must be the first member */
13342 - struct bfq_queue *bfqq[2];
13343 - struct bfq_ttime ttime;
13344 - int ioprio;
13345 -+
13346 -+ unsigned int raising_time_left;
13347 -+ unsigned int saved_idle_window;
13348 - };
13349 -
13350 - /**
13351 -@@ -411,6 +425,7 @@ enum bfqq_state_flags {
13352 - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
13353 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
13354 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
13355 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
13356 - BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
13357 - };
13358 -
13359 -@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
13360 - BFQ_BFQQ_FNS(budget_new);
13361 - BFQ_BFQQ_FNS(coop);
13362 - BFQ_BFQQ_FNS(split_coop);
13363 -+BFQ_BFQQ_FNS(just_split);
13364 - BFQ_BFQQ_FNS(softrt_update);
13365 - #undef BFQ_BFQQ_FNS
13366 -
13367 ---
13368 -1.8.5.2
13369 -
13370
13371 Added: genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
13372 ===================================================================
13373 --- genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch (rev 0)
13374 +++ genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666)
13375 @@ -0,0 +1,1034 @@
13376 +From 39b1dba58b2562ba0d93a33a4f9af662d3c790c5 Mon Sep 17 00:00:00 2001
13377 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
13378 +Date: Thu, 23 Jan 2014 16:54:44 +0100
13379 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r1 for
13380 + 3.13.0
13381 +
13382 +A set of processes may happen to perform interleaved reads, i.e., requests
13383 +whose union would give rise to a sequential read pattern. There are two
13384 +typical cases: in the first case, processes read fixed-size chunks of
13385 +data at a fixed distance from each other, while in the second case processes
13386 +may read variable-size chunks at variable distances. The latter case occurs
13387 +for example with KVM, which splits the I/O generated by the guest into
13388 +multiple chunks, and lets these chunks be served by a pool of cooperating
13389 +processes, iteratively assigning the next chunk of I/O to the first
13390 +available process. CFQ uses actual queue merging for the first type of
13391 +processes, whereas it uses preemption to get a sequential read pattern out
13392 +of the read requests performed by the second type of processes. In the end
13393 +it uses two different mechanisms to achieve the same goal: boosting the
13394 +throughput with interleaved I/O.
13395 +
13396 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
13397 +sequential read pattern with both types of processes. The main idea is
13398 +checking newly arrived requests against the next request of the active queue
13399 +both in case of actual request insert and in case of request merge. By doing
13400 +so, both the types of processes can be handled by just merging their queues.
13401 +EQM is then simpler and more compact than the pair of mechanisms used in
13402 +CFQ.
13403 +
13404 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
13405 +properly restoring the weight-raising state of a queue when it gets back to
13406 +a non-merged state.
13407 +
13408 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
13409 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
13410 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
13411 +---
13412 + block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
13413 + block/bfq-sched.c | 28 ---
13414 + block/bfq.h | 16 ++
13415 + 3 files changed, 474 insertions(+), 227 deletions(-)
13416 +
13417 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
13418 +index eb760de..06ee844 100644
13419 +--- a/block/bfq-iosched.c
13420 ++++ b/block/bfq-iosched.c
13421 +@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
13422 + return dur;
13423 + }
13424 +
13425 ++static inline void
13426 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
13427 ++{
13428 ++ if (bic->saved_idle_window)
13429 ++ bfq_mark_bfqq_idle_window(bfqq);
13430 ++ else
13431 ++ bfq_clear_bfqq_idle_window(bfqq);
13432 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
13433 ++ /*
13434 ++ * Start a weight raising period with the duration given by
13435 ++ * the raising_time_left snapshot.
13436 ++ */
13437 ++ if (bfq_bfqq_busy(bfqq))
13438 ++ bfqq->bfqd->raised_busy_queues++;
13439 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
13440 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
13441 ++ bfqq->last_rais_start_finish = jiffies;
13442 ++ bfqq->entity.ioprio_changed = 1;
13443 ++ }
13444 ++ /*
13445 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
13446 ++ * getting confused about the queue's need of a weight-raising
13447 ++ * period.
13448 ++ */
13449 ++ bic->raising_time_left = 0;
13450 ++}
13451 ++
13452 ++/*
13453 ++ * Must be called with the queue_lock held.
13454 ++ */
13455 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
13456 ++{
13457 ++ int process_refs, io_refs;
13458 ++
13459 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13460 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13461 ++ BUG_ON(process_refs < 0);
13462 ++ return process_refs;
13463 ++}
13464 ++
13465 + static void bfq_add_rq_rb(struct request *rq)
13466 + {
13467 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
13468 +@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
13469 + if (!bfqd->low_latency)
13470 + goto add_bfqq_busy;
13471 +
13472 ++ if (bfq_bfqq_just_split(bfqq))
13473 ++ goto set_ioprio_changed;
13474 ++
13475 + /*
13476 +- * If the queue is not being boosted and has been idle
13477 +- * for enough time, start a weight-raising period
13478 ++ * If the queue:
13479 ++ * - is not being boosted,
13480 ++ * - has been idle for enough time,
13481 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
13482 ++ * shared "for its nature" or it is not shared and its
13483 ++ * requests have not been redirected to a shared queue)
13484 ++ * start a weight-raising period.
13485 + */
13486 +- if (old_raising_coeff == 1 &&
13487 +- (idle_for_long_time || soft_rt)) {
13488 ++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
13489 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
13490 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
13491 + if (idle_for_long_time)
13492 + bfqq->raising_cur_max_time =
13493 +@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq)
13494 + bfqd->bfq_raising_rt_max_time;
13495 + }
13496 + }
13497 ++set_ioprio_changed:
13498 + if (old_raising_coeff != bfqq->raising_coeff)
13499 + entity->ioprio_changed = 1;
13500 + add_bfqq_busy:
13501 +@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
13502 + spin_unlock_irq(bfqd->queue->queue_lock);
13503 + }
13504 +
13505 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13506 +- struct bio *bio)
13507 +-{
13508 +- struct bfq_data *bfqd = q->elevator->elevator_data;
13509 +- struct bfq_io_cq *bic;
13510 +- struct bfq_queue *bfqq;
13511 +-
13512 +- /*
13513 +- * Disallow merge of a sync bio into an async request.
13514 +- */
13515 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13516 +- return 0;
13517 +-
13518 +- /*
13519 +- * Lookup the bfqq that this bio will be queued with. Allow
13520 +- * merge only if rq is queued there.
13521 +- * Queue lock is held here.
13522 +- */
13523 +- bic = bfq_bic_lookup(bfqd, current->io_context);
13524 +- if (bic == NULL)
13525 +- return 0;
13526 +-
13527 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13528 +- return bfqq == RQ_BFQQ(rq);
13529 +-}
13530 +-
13531 +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13532 +- struct bfq_queue *bfqq)
13533 +-{
13534 +- if (bfqq != NULL) {
13535 +- bfq_mark_bfqq_must_alloc(bfqq);
13536 +- bfq_mark_bfqq_budget_new(bfqq);
13537 +- bfq_clear_bfqq_fifo_expire(bfqq);
13538 +-
13539 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13540 +-
13541 +- bfq_log_bfqq(bfqd, bfqq,
13542 +- "set_in_service_queue, cur-budget = %lu",
13543 +- bfqq->entity.budget);
13544 +- }
13545 +-
13546 +- bfqd->in_service_queue = bfqq;
13547 +-}
13548 +-
13549 +-/*
13550 +- * Get and set a new queue for service.
13551 +- */
13552 +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
13553 +- struct bfq_queue *bfqq)
13554 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
13555 + {
13556 +- if (!bfqq)
13557 +- bfqq = bfq_get_next_queue(bfqd);
13558 ++ if (request)
13559 ++ return blk_rq_pos(io_struct);
13560 + else
13561 +- bfq_get_next_queue_forced(bfqd, bfqq);
13562 +-
13563 +- __bfq_set_in_service_queue(bfqd, bfqq);
13564 +- return bfqq;
13565 ++ return ((struct bio *)io_struct)->bi_sector;
13566 + }
13567 +
13568 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
13569 +- struct request *rq)
13570 ++static inline sector_t bfq_dist_from(sector_t pos1,
13571 ++ sector_t pos2)
13572 + {
13573 +- if (blk_rq_pos(rq) >= bfqd->last_position)
13574 +- return blk_rq_pos(rq) - bfqd->last_position;
13575 ++ if (pos1 >= pos2)
13576 ++ return pos1 - pos2;
13577 + else
13578 +- return bfqd->last_position - blk_rq_pos(rq);
13579 ++ return pos2 - pos1;
13580 + }
13581 +
13582 +-/*
13583 +- * Return true if bfqq has no request pending and rq is close enough to
13584 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
13585 +- * bfqq->next_rq
13586 +- */
13587 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
13588 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
13589 ++ sector_t sector)
13590 + {
13591 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
13592 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
13593 ++ BFQQ_SEEK_THR;
13594 + }
13595 +
13596 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13597 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
13598 + {
13599 + struct rb_root *root = &bfqd->rq_pos_tree;
13600 + struct rb_node *parent, *node;
13601 + struct bfq_queue *__bfqq;
13602 +- sector_t sector = bfqd->last_position;
13603 +
13604 + if (RB_EMPTY_ROOT(root))
13605 + return NULL;
13606 +@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13607 + * position).
13608 + */
13609 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
13610 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13611 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13612 + return __bfqq;
13613 +
13614 + if (blk_rq_pos(__bfqq->next_rq) < sector)
13615 +@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13616 + return NULL;
13617 +
13618 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
13619 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13620 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13621 + return __bfqq;
13622 +
13623 + return NULL;
13624 +@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13625 + /*
13626 + * bfqd - obvious
13627 + * cur_bfqq - passed in so that we don't decide that the current queue
13628 +- * is closely cooperating with itself.
13629 +- *
13630 +- * We are assuming that cur_bfqq has dispatched at least one request,
13631 +- * and that bfqd->last_position reflects a position on the disk associated
13632 +- * with the I/O issued by cur_bfqq.
13633 ++ * is closely cooperating with itself
13634 ++ * sector - used as a reference point to search for a close queue
13635 + */
13636 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13637 +- struct bfq_queue *cur_bfqq)
13638 ++ struct bfq_queue *cur_bfqq,
13639 ++ sector_t sector)
13640 + {
13641 + struct bfq_queue *bfqq;
13642 +
13643 +@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13644 + * working closely on the same area of the disk. In that case,
13645 + * we can group them together and don't waste time idling.
13646 + */
13647 +- bfqq = bfqq_close(bfqd);
13648 ++ bfqq = bfqq_close(bfqd, sector);
13649 + if (bfqq == NULL || bfqq == cur_bfqq)
13650 + return NULL;
13651 +
13652 +@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13653 + return bfqq;
13654 + }
13655 +
13656 ++static struct bfq_queue *
13657 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13658 ++{
13659 ++ int process_refs, new_process_refs;
13660 ++ struct bfq_queue *__bfqq;
13661 ++
13662 ++ /*
13663 ++ * If there are no process references on the new_bfqq, then it is
13664 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13665 ++ * may have dropped their last reference (not just their last process
13666 ++ * reference).
13667 ++ */
13668 ++ if (!bfqq_process_refs(new_bfqq))
13669 ++ return NULL;
13670 ++
13671 ++ /* Avoid a circular list and skip interim queue merges. */
13672 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
13673 ++ if (__bfqq == bfqq)
13674 ++ return NULL;
13675 ++ new_bfqq = __bfqq;
13676 ++ }
13677 ++
13678 ++ process_refs = bfqq_process_refs(bfqq);
13679 ++ new_process_refs = bfqq_process_refs(new_bfqq);
13680 ++ /*
13681 ++ * If the process for the bfqq has gone away, there is no
13682 ++ * sense in merging the queues.
13683 ++ */
13684 ++ if (process_refs == 0 || new_process_refs == 0)
13685 ++ return NULL;
13686 ++
13687 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13688 ++ new_bfqq->pid);
13689 ++
13690 ++ /*
13691 ++ * Merging is just a redirection: the requests of the process owning
13692 ++ * one of the two queues are redirected to the other queue. The latter
13693 ++ * queue, in its turn, is set as shared if this is the first time that
13694 ++ * the requests of some process are redirected to it.
13695 ++ *
13696 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
13697 ++ * are in the context of the process owning bfqq, hence we have the
13698 ++ * io_cq of this process. So we can immediately configure this io_cq
13699 ++ * to redirect the requests of the process to new_bfqq.
13700 ++ *
13701 ++ * NOTE, even if new_bfqq coincides with the in-service queue, the
13702 ++ * io_cq of new_bfqq is not available, because, if the in-service queue
13703 ++ * is shared, bfqd->in_service_bic may not point to the io_cq of the
13704 ++ * in-service queue.
13705 ++ * Redirecting the requests of the process owning bfqq to the currently
13706 ++ * in-service queue is in any case the best option, as we feed the
13707 ++ * in-service queue with new requests close to the last request served
13708 ++ * and, by doing so, hopefully increase the throughput.
13709 ++ */
13710 ++ bfqq->new_bfqq = new_bfqq;
13711 ++ atomic_add(process_refs, &new_bfqq->ref);
13712 ++ return new_bfqq;
13713 ++}
13714 ++
13715 ++/*
13716 ++ * Attempt to schedule a merge of bfqq with the currently in-service queue or
13717 ++ * with a close queue among the scheduled queues.
13718 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
13719 ++ * structure otherwise.
13720 ++ */
13721 ++static struct bfq_queue *
13722 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13723 ++ void *io_struct, bool request)
13724 ++{
13725 ++ struct bfq_queue *in_service_bfqq, *new_bfqq;
13726 ++
13727 ++ if (bfqq->new_bfqq)
13728 ++ return bfqq->new_bfqq;
13729 ++
13730 ++ if (!io_struct)
13731 ++ return NULL;
13732 ++
13733 ++ in_service_bfqq = bfqd->in_service_queue;
13734 ++
13735 ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
13736 ++ !bfqd->in_service_bic)
13737 ++ goto check_scheduled;
13738 ++
13739 ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
13740 ++ goto check_scheduled;
13741 ++
13742 ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
13743 ++ goto check_scheduled;
13744 ++
13745 ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
13746 ++ goto check_scheduled;
13747 ++
13748 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
13749 ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
13750 ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
13751 ++ if (new_bfqq != NULL)
13752 ++ return new_bfqq; /* Merge with the in-service queue */
13753 ++ }
13754 ++
13755 ++ /*
13756 ++ * Check whether there is a cooperator among currently scheduled
13757 ++ * queues. The only thing we need is that the bio/request is not
13758 ++ * NULL, as we need it to establish whether a cooperator exists.
13759 ++ */
13760 ++check_scheduled:
13761 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
13762 ++ bfq_io_struct_pos(io_struct, request));
13763 ++ if (new_bfqq)
13764 ++ return bfq_setup_merge(bfqq, new_bfqq);
13765 ++
13766 ++ return NULL;
13767 ++}
13768 ++
13769 ++static inline void
13770 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
13771 ++{
13772 ++ /*
13773 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
13774 ++ * have already been redirected to a shared queue; both idle window
13775 ++ * and weight raising state have already been saved. Do nothing.
13776 ++ */
13777 ++ if (bfqq->bic == NULL)
13778 ++ return;
13779 ++ if (bfqq->bic->raising_time_left)
13780 ++ /*
13781 ++ * This is the queue of a just-started process, and would
13782 ++ * deserve weight raising: we set raising_time_left to the full
13783 ++ * weight-raising duration to trigger weight-raising when and
13784 ++ * if the queue is split and the first request of the queue
13785 ++ * is enqueued.
13786 ++ */
13787 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
13788 ++ else if (bfqq->raising_coeff > 1) {
13789 ++ unsigned long wrais_duration =
13790 ++ jiffies - bfqq->last_rais_start_finish;
13791 ++ /*
13792 ++ * It may happen that a queue's weight raising period lasts
13793 ++ * longer than its raising_cur_max_time, as weight raising is
13794 ++ * handled only when a request is enqueued or dispatched (it
13795 ++ * does not use any timer). If the weight raising period is
13796 ++ * about to end, don't save it.
13797 ++ */
13798 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
13799 ++ bfqq->bic->raising_time_left = 0;
13800 ++ else
13801 ++ bfqq->bic->raising_time_left =
13802 ++ bfqq->raising_cur_max_time - wrais_duration;
13803 ++ /*
13804 ++ * The bfq_queue is becoming shared or the requests of the
13805 ++ * process owning the queue are being redirected to a shared
13806 ++ * queue. Stop the weight raising period of the queue, as in
13807 ++ * both cases it should not be owned by an interactive or soft
13808 ++ * real-time application.
13809 ++ */
13810 ++ bfq_bfqq_end_raising(bfqq);
13811 ++ } else
13812 ++ bfqq->bic->raising_time_left = 0;
13813 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
13814 ++}
13815 ++
13816 ++static inline void
13817 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
13818 ++{
13819 ++ /*
13820 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
13821 ++ * is about to begin using a shared bfq_queue.
13822 ++ */
13823 ++ if (bfqq->bic)
13824 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
13825 ++}
13826 ++
13827 ++static void
13828 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13829 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13830 ++{
13831 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13832 ++ (long unsigned)new_bfqq->pid);
13833 ++ /* Save weight raising and idle window of the merged queues */
13834 ++ bfq_bfqq_save_state(bfqq);
13835 ++ bfq_bfqq_save_state(new_bfqq);
13836 ++ /*
13837 ++ * Grab a reference to the bic, to prevent it from being destroyed
13838 ++ * before being possibly touched by a bfq_split_bfqq().
13839 ++ */
13840 ++ bfq_get_bic_reference(bfqq);
13841 ++ bfq_get_bic_reference(new_bfqq);
13842 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
13843 ++ bic_set_bfqq(bic, new_bfqq, 1);
13844 ++ bfq_mark_bfqq_coop(new_bfqq);
13845 ++ /*
13846 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
13847 ++ * new_bfqq->bic to NULL. bfqq either:
13848 ++ * - does not belong to any bic any more, and hence bfqq->bic must
13849 ++ * be set to NULL, or
13850 ++ * - is a queue whose owning bics have already been redirected to a
13851 ++ * different queue, hence the queue is destined to not belong to any
13852 ++ * bic soon and bfqq->bic is already NULL (therefore the next
13853 ++ * assignment causes no harm).
13854 ++ */
13855 ++ new_bfqq->bic = NULL;
13856 ++ bfqq->bic = NULL;
13857 ++ bfq_put_queue(bfqq);
13858 ++}
13859 ++
13860 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13861 ++ struct bio *bio)
13862 ++{
13863 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
13864 ++ struct bfq_io_cq *bic;
13865 ++ struct bfq_queue *bfqq, *new_bfqq;
13866 ++
13867 ++ /*
13868 ++ * Disallow merge of a sync bio into an async request.
13869 ++ */
13870 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13871 ++ return 0;
13872 ++
13873 ++ /*
13874 ++ * Lookup the bfqq that this bio will be queued with. Allow
13875 ++ * merge only if rq is queued there.
13876 ++ * Queue lock is held here.
13877 ++ */
13878 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
13879 ++ if (bic == NULL)
13880 ++ return 0;
13881 ++
13882 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13883 ++ /*
13884 ++ * We take advantage of this function to perform an early merge
13885 ++ * of the queues of possible cooperating processes.
13886 ++ */
13887 ++ if (bfqq != NULL) {
13888 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
13889 ++ if (new_bfqq != NULL) {
13890 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
13891 ++ /*
13892 ++ * If we get here, the bio will be queued in the shared queue,
13893 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
13894 ++ * rq can be merged.
13895 ++ */
13896 ++ bfqq = new_bfqq;
13897 ++ }
13898 ++ }
13899 ++
13900 ++ return bfqq == RQ_BFQQ(rq);
13901 ++}
13902 ++
13903 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13904 ++ struct bfq_queue *bfqq)
13905 ++{
13906 ++ if (bfqq != NULL) {
13907 ++ bfq_mark_bfqq_must_alloc(bfqq);
13908 ++ bfq_mark_bfqq_budget_new(bfqq);
13909 ++ bfq_clear_bfqq_fifo_expire(bfqq);
13910 ++
13911 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13912 ++
13913 ++ bfq_log_bfqq(bfqd, bfqq,
13914 ++ "set_in_service_queue, cur-budget = %lu",
13915 ++ bfqq->entity.budget);
13916 ++ }
13917 ++
13918 ++ bfqd->in_service_queue = bfqq;
13919 ++}
13920 ++
13921 ++/*
13922 ++ * Get and set a new queue for service.
13923 ++ */
13924 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
13925 ++{
13926 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
13927 ++
13928 ++ __bfq_set_in_service_queue(bfqd, bfqq);
13929 ++ return bfqq;
13930 ++}
13931 ++
13932 + /*
13933 + * If enough samples have been computed, return the current max budget
13934 + * stored in bfqd, which is dynamically updated according to the
13935 +@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
13936 + return rq;
13937 + }
13938 +
13939 +-/*
13940 +- * Must be called with the queue_lock held.
13941 +- */
13942 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
13943 +-{
13944 +- int process_refs, io_refs;
13945 +-
13946 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13947 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13948 +- BUG_ON(process_refs < 0);
13949 +- return process_refs;
13950 +-}
13951 +-
13952 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13953 +-{
13954 +- int process_refs, new_process_refs;
13955 +- struct bfq_queue *__bfqq;
13956 +-
13957 +- /*
13958 +- * If there are no process references on the new_bfqq, then it is
13959 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13960 +- * may have dropped their last reference (not just their last process
13961 +- * reference).
13962 +- */
13963 +- if (!bfqq_process_refs(new_bfqq))
13964 +- return;
13965 +-
13966 +- /* Avoid a circular list and skip interim queue merges. */
13967 +- while ((__bfqq = new_bfqq->new_bfqq)) {
13968 +- if (__bfqq == bfqq)
13969 +- return;
13970 +- new_bfqq = __bfqq;
13971 +- }
13972 +-
13973 +- process_refs = bfqq_process_refs(bfqq);
13974 +- new_process_refs = bfqq_process_refs(new_bfqq);
13975 +- /*
13976 +- * If the process for the bfqq has gone away, there is no
13977 +- * sense in merging the queues.
13978 +- */
13979 +- if (process_refs == 0 || new_process_refs == 0)
13980 +- return;
13981 +-
13982 +- /*
13983 +- * Merge in the direction of the lesser amount of work.
13984 +- */
13985 +- if (new_process_refs >= process_refs) {
13986 +- bfqq->new_bfqq = new_bfqq;
13987 +- atomic_add(process_refs, &new_bfqq->ref);
13988 +- } else {
13989 +- new_bfqq->new_bfqq = bfqq;
13990 +- atomic_add(new_process_refs, &bfqq->ref);
13991 +- }
13992 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13993 +- new_bfqq->pid);
13994 +-}
13995 +-
13996 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
13997 + {
13998 + struct bfq_entity *entity = &bfqq->entity;
13999 +@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
14000 + */
14001 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14002 + {
14003 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
14004 ++ struct bfq_queue *bfqq;
14005 + struct request *next_rq;
14006 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
14007 +
14008 +@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14009 +
14010 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
14011 +
14012 +- /*
14013 +- * If another queue has a request waiting within our mean seek
14014 +- * distance, let it run. The expire code will check for close
14015 +- * cooperators and put the close queue at the front of the
14016 +- * service tree. If possible, merge the expiring queue with the
14017 +- * new bfqq.
14018 +- */
14019 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
14020 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
14021 +- bfq_setup_merge(bfqq, new_bfqq);
14022 +-
14023 + if (bfq_may_expire_for_budg_timeout(bfqq) &&
14024 + !timer_pending(&bfqd->idle_slice_timer) &&
14025 + !bfq_bfqq_must_idle(bfqq))
14026 +@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14027 + bfq_clear_bfqq_wait_request(bfqq);
14028 + del_timer(&bfqd->idle_slice_timer);
14029 + }
14030 +- if (new_bfqq == NULL)
14031 +- goto keep_queue;
14032 +- else
14033 +- goto expire;
14034 ++ goto keep_queue;
14035 + }
14036 + }
14037 +
14038 + /*
14039 +- * No requests pending. If the in-service queue has no cooperator and
14040 +- * still has requests in flight (possibly waiting for a completion)
14041 +- * or is idling for a new request, then keep it.
14042 ++ * No requests pending. If the in-service queue still has requests in
14043 ++ * flight (possibly waiting for a completion) or is idling for a new
14044 ++ * request, then keep it.
14045 + */
14046 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
14047 +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
14048 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
14049 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
14050 + bfqq = NULL;
14051 + goto keep_queue;
14052 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
14053 +- /*
14054 +- * Expiring the queue because there is a close cooperator,
14055 +- * cancel timer.
14056 +- */
14057 +- bfq_clear_bfqq_wait_request(bfqq);
14058 +- del_timer(&bfqd->idle_slice_timer);
14059 + }
14060 +
14061 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
14062 + expire:
14063 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
14064 + new_queue:
14065 +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
14066 ++ bfqq = bfq_set_in_service_queue(bfqd);
14067 + bfq_log(bfqd, "select_queue: new queue %d returned",
14068 + bfqq != NULL ? bfqq->pid : 0);
14069 + keep_queue:
14070 +@@ -1825,9 +2015,8 @@ keep_queue:
14071 + static void bfq_update_raising_data(struct bfq_data *bfqd,
14072 + struct bfq_queue *bfqq)
14073 + {
14074 ++ struct bfq_entity *entity = &bfqq->entity;
14075 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
14076 +- struct bfq_entity *entity = &bfqq->entity;
14077 +-
14078 + bfq_log_bfqq(bfqd, bfqq,
14079 + "raising period dur %u/%u msec, "
14080 + "old raising coeff %u, w %d(%d)",
14081 +@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
14082 + "WARN: pending prio change");
14083 + /*
14084 + * If too much time has elapsed from the beginning
14085 +- * of this weight-raising, stop it.
14086 ++ * of this weight-raising period, stop it.
14087 + */
14088 + if (time_is_before_jiffies(bfqq->last_rais_start_finish +
14089 + bfqq->raising_cur_max_time)) {
14090 +@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
14091 + jiffies_to_msecs(bfqq->
14092 + raising_cur_max_time));
14093 + bfq_bfqq_end_raising(bfqq);
14094 +- __bfq_entity_update_weight_prio(
14095 +- bfq_entity_service_tree(entity),
14096 +- entity);
14097 + }
14098 + }
14099 ++ /* Update weight both if it must be raised and if it must be lowered */
14100 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
14101 ++ __bfq_entity_update_weight_prio(
14102 ++ bfq_entity_service_tree(entity),
14103 ++ entity);
14104 + }
14105 +
14106 + /*
14107 +@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq)
14108 + struct bfq_io_cq *bic = icq_to_bic(icq);
14109 +
14110 + bic->ttime.last_end_request = jiffies;
14111 ++ /*
14112 ++ * A newly created bic indicates that the process has just
14113 ++ * started doing I/O, and is probably mapping into memory its
14114 ++ * executable and libraries: it definitely needs weight raising.
14115 ++ * There is however the possibility that the process performs,
14116 ++ * for a while, I/O close to some other process. EQM intercepts
14117 ++ * this behavior and may merge the queue corresponding to the
14118 ++ * process with some other queue, BEFORE the weight of the queue
14119 ++ * is raised. Merged queues are not weight-raised (they are assumed
14120 ++ * to belong to processes that benefit only from high throughput).
14121 ++ * If the merge is basically the consequence of an accident, then
14122 ++ * the queue will be split soon and will get back its old weight.
14123 ++ * It is then important to write down somewhere that this queue
14124 ++ * does need weight raising, even if it did not manage to get its
14125 ++ * weight raised before being merged. For this purpose, we overload
14126 ++ * the field raising_time_left and assign 1 to it, to mark the queue
14127 ++ * as needing weight raising.
14128 ++ */
14129 ++ bic->raising_time_left = 1;
14130 + }
14131 +
14132 + static void bfq_exit_icq(struct io_cq *icq)
14133 +@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq)
14134 + }
14135 +
14136 + if (bic->bfqq[BLK_RW_SYNC]) {
14137 ++ /*
14138 ++ * If the bic is using a shared queue, put the reference
14139 ++ * taken on the io_context when the bic started using a
14140 ++ * shared bfq_queue.
14141 ++ */
14142 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
14143 ++ put_io_context(icq->ioc);
14144 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
14145 + bic->bfqq[BLK_RW_SYNC] = NULL;
14146 + }
14147 +@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
14148 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
14149 + return;
14150 +
14151 ++ /* Idle window just restored, statistics are meaningless. */
14152 ++ if (bfq_bfqq_just_split(bfqq))
14153 ++ return;
14154 ++
14155 + enable_idle = bfq_bfqq_idle_window(bfqq);
14156 +
14157 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
14158 +@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14159 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
14160 + !BFQQ_SEEKY(bfqq))
14161 + bfq_update_idle_window(bfqd, bfqq, bic);
14162 ++ bfq_clear_bfqq_just_split(bfqq);
14163 +
14164 + bfq_log_bfqq(bfqd, bfqq,
14165 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
14166 +@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14167 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
14168 + {
14169 + struct bfq_data *bfqd = q->elevator->elevator_data;
14170 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
14171 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
14172 +
14173 + assert_spin_locked(bfqd->queue->queue_lock);
14174 ++
14175 ++ /*
14176 ++ * An unplug may trigger a requeue of a request from the device
14177 ++ * driver: make sure we are in process context while trying to
14178 ++ * merge two bfq_queues.
14179 ++ */
14180 ++ if (!in_interrupt()) {
14181 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
14182 ++ if (new_bfqq != NULL) {
14183 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
14184 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
14185 ++ /*
14186 ++ * Release the request's reference to the old bfqq
14187 ++ * and make sure one is taken to the shared queue.
14188 ++ */
14189 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
14190 ++ bfqq->allocated[rq_data_dir(rq)]--;
14191 ++ atomic_inc(&new_bfqq->ref);
14192 ++ bfq_put_queue(bfqq);
14193 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
14194 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
14195 ++ bfqq, new_bfqq);
14196 ++ rq->elv.priv[1] = new_bfqq;
14197 ++ bfqq = new_bfqq;
14198 ++ }
14199 ++ }
14200 ++
14201 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
14202 +
14203 + bfq_add_rq_rb(rq);
14204 +
14205 ++ /*
14206 ++ * Here a newly-created bfq_queue has already started a weight-raising
14207 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
14208 ++ * from assigning it a full weight-raising period. See the detailed
14209 ++ * comments about this field in bfq_init_icq().
14210 ++ */
14211 ++ if (bfqq->bic != NULL)
14212 ++ bfqq->bic->raising_time_left = 0;
14213 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
14214 + list_add_tail(&rq->queuelist, &bfqq->fifo);
14215 +
14216 +@@ -2659,18 +2916,6 @@ static void bfq_put_request(struct request *rq)
14217 + }
14218 + }
14219 +
14220 +-static struct bfq_queue *
14221 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
14222 +- struct bfq_queue *bfqq)
14223 +-{
14224 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
14225 +- (long unsigned)bfqq->new_bfqq->pid);
14226 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
14227 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
14228 +- bfq_put_queue(bfqq);
14229 +- return bic_to_bfqq(bic, 1);
14230 +-}
14231 +-
14232 + /*
14233 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
14234 + * was the last process referring to said bfqq.
14235 +@@ -2679,6 +2924,9 @@ static struct bfq_queue *
14236 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
14237 + {
14238 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
14239 ++
14240 ++ put_io_context(bic->icq.ioc);
14241 ++
14242 + if (bfqq_process_refs(bfqq) == 1) {
14243 + bfqq->pid = current->pid;
14244 + bfq_clear_bfqq_coop(bfqq);
14245 +@@ -2707,6 +2955,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
14246 + struct bfq_queue *bfqq;
14247 + struct bfq_group *bfqg;
14248 + unsigned long flags;
14249 ++ bool split = false;
14250 +
14251 + might_sleep_if(gfp_mask & __GFP_WAIT);
14252 +
14253 +@@ -2725,24 +2974,14 @@ new_queue:
14254 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
14255 + bic_set_bfqq(bic, bfqq, is_sync);
14256 + } else {
14257 +- /*
14258 +- * If the queue was seeky for too long, break it apart.
14259 +- */
14260 ++ /* If the queue was seeky for too long, break it apart. */
14261 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
14262 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
14263 + bfqq = bfq_split_bfqq(bic, bfqq);
14264 ++ split = true;
14265 + if (!bfqq)
14266 + goto new_queue;
14267 + }
14268 +-
14269 +- /*
14270 +- * Check to see if this queue is scheduled to merge with
14271 +- * another closely cooperating queue. The merging of queues
14272 +- * happens here as it must be done in process context.
14273 +- * The reference on new_bfqq was taken in merge_bfqqs.
14274 +- */
14275 +- if (bfqq->new_bfqq != NULL)
14276 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
14277 + }
14278 +
14279 + bfqq->allocated[rw]++;
14280 +@@ -2753,6 +2992,26 @@ new_queue:
14281 + rq->elv.priv[0] = bic;
14282 + rq->elv.priv[1] = bfqq;
14283 +
14284 ++ /*
14285 ++ * If a bfq_queue has only one process reference, it is owned
14286 ++ * by only one bfq_io_cq: we can set the bic field of the
14287 ++ * bfq_queue to the address of that structure. Also, if the
14288 ++ * queue has just been split, mark a flag so that the
14289 ++ * information is available to the other scheduler hooks.
14290 ++ */
14291 ++ if (bfqq_process_refs(bfqq) == 1) {
14292 ++ bfqq->bic = bic;
14293 ++ if (split) {
14294 ++ bfq_mark_bfqq_just_split(bfqq);
14295 ++ /*
14296 ++ * If the queue has just been split from a shared queue,
14297 ++ * restore the idle window and the possible weight
14298 ++ * raising period.
14299 ++ */
14300 ++ bfq_bfqq_resume_state(bfqq, bic);
14301 ++ }
14302 ++ }
14303 ++
14304 + spin_unlock_irqrestore(q->queue_lock, flags);
14305 +
14306 + return 0;
14307 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
14308 +index 999b475..e54ea33 100644
14309 +--- a/block/bfq-sched.c
14310 ++++ b/block/bfq-sched.c
14311 +@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
14312 + return bfqq;
14313 + }
14314 +
14315 +-/*
14316 +- * Forced extraction of the given queue.
14317 +- */
14318 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
14319 +- struct bfq_queue *bfqq)
14320 +-{
14321 +- struct bfq_entity *entity;
14322 +- struct bfq_sched_data *sd;
14323 +-
14324 +- BUG_ON(bfqd->in_service_queue != NULL);
14325 +-
14326 +- entity = &bfqq->entity;
14327 +- /*
14328 +- * Bubble up extraction/update from the leaf to the root.
14329 +- */
14330 +- for_each_entity(entity) {
14331 +- sd = entity->sched_data;
14332 +- bfq_update_budget(entity);
14333 +- bfq_update_vtime(bfq_entity_service_tree(entity));
14334 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
14335 +- sd->active_entity = entity;
14336 +- sd->next_active = NULL;
14337 +- entity->service = 0;
14338 +- }
14339 +-
14340 +- return;
14341 +-}
14342 +-
14343 + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
14344 + {
14345 + if (bfqd->in_service_bic != NULL) {
14346 +diff --git a/block/bfq.h b/block/bfq.h
14347 +index f9b5881..0bfad40 100644
14348 +--- a/block/bfq.h
14349 ++++ b/block/bfq.h
14350 +@@ -192,6 +192,8 @@ struct bfq_group;
14351 + * idle to backlogged
14352 + * @service_from_backlogged: cumulative service received from the @bfq_queue
14353 + * since the last transition from idle to backlogged
14354 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
14355 ++ * queue is shared
14356 + *
14357 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
14358 + * or more (if it is an async one). @cgroup holds a reference to the
14359 +@@ -235,6 +237,7 @@ struct bfq_queue {
14360 + sector_t last_request_pos;
14361 +
14362 + pid_t pid;
14363 ++ struct bfq_io_cq *bic;
14364 +
14365 + /* weight-raising fields */
14366 + unsigned long raising_cur_max_time;
14367 +@@ -264,12 +267,23 @@ struct bfq_ttime {
14368 + * @icq: associated io_cq structure
14369 + * @bfqq: array of two process queues, the sync and the async
14370 + * @ttime: associated @bfq_ttime struct
14371 ++ * @raising_time_left: snapshot of the time left before weight raising ends
14372 ++ * for the sync queue associated to this process; this
14373 ++ * snapshot is taken to remember this value while the weight
14374 ++ * raising is suspended because the queue is merged with a
14375 ++ * shared queue, and is used to set @raising_cur_max_time
14376 ++ * when the queue is split from the shared queue and its
14377 ++ * weight is raised again
14378 ++ * @saved_idle_window: same purpose as the previous field for the idle window
14379 + */
14380 + struct bfq_io_cq {
14381 + struct io_cq icq; /* must be the first member */
14382 + struct bfq_queue *bfqq[2];
14383 + struct bfq_ttime ttime;
14384 + int ioprio;
14385 ++
14386 ++ unsigned int raising_time_left;
14387 ++ unsigned int saved_idle_window;
14388 + };
14389 +
14390 + /**
14391 +@@ -411,6 +425,7 @@ enum bfqq_state_flags {
14392 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
14393 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
14394 + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
14395 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
14396 + BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
14397 + };
14398 +
14399 +@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
14400 + BFQ_BFQQ_FNS(budget_new);
14401 + BFQ_BFQQ_FNS(coop);
14402 + BFQ_BFQQ_FNS(split_coop);
14403 ++BFQ_BFQQ_FNS(just_split);
14404 + BFQ_BFQQ_FNS(softrt_update);
14405 + #undef BFQ_BFQQ_FNS
14406 +
14407 +--
14408 +1.8.5.2
14409 +
14410
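The EQM rework above hinges on saving a queue's weight-raising state in its bfq_io_cq when queues are merged and restoring it when the shared queue is later split: see the long comment on raising_time_left in bfq_init_icq() and the call to bfq_bfqq_resume_state() in bfq_set_request(). The helper below is only a simplified, hypothetical sketch of what such a resume path can look like; the field names follow the patch, but the body is not the code the patch actually adds.

/* Hypothetical, simplified sketch of a split-and-resume helper; not the patch code. */
static void sketch_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	/* Restore the idle window remembered when the queues were merged. */
	if (bic->saved_idle_window)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);

	/*
	 * A non-zero raising_time_left means the queue still deserves
	 * weight raising (the patch uses the special value 1, set in
	 * bfq_init_icq(), to mark a queue that still needs a full
	 * weight-raising period).
	 */
	if (bic->raising_time_left) {
		bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
		bfqq->raising_cur_max_time = bic->raising_time_left;
		bfqq->last_rais_start_finish = jiffies;
		bic->raising_time_left = 0;
	}
}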
14411 Modified: genpatches-2.6/trunk/3.14/0000_README
14412 ===================================================================
14413 --- genpatches-2.6/trunk/3.14/0000_README 2014-02-07 14:46:59 UTC (rev 2665)
14414 +++ genpatches-2.6/trunk/3.14/0000_README 2014-02-07 15:42:35 UTC (rev 2666)
14415 @@ -83,17 +83,17 @@
14416 From: Tom Wijsman <TomWij@g.o>
14417 Desc: Add Gentoo Linux support config settings and defaults.
14418
14419 -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7-3.13.patch
14420 +Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r1-3.13.patch
14421 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
14422 -Desc: BFQ v7 patch 1 for 3.13: Build, cgroups and kconfig bits
14423 +Desc: BFQ v7r1 patch 1 for 3.13: Build, cgroups and kconfig bits
14424
14425 -Patch: 5000_BFQ-2-block-introduce-the-v7-I-O-sched-for-3.13.patch1
14426 +Patch: 5000_BFQ-2-block-introduce-the-v7r1-I-O-sched-for-3.13.patch1
14427 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
14428 -Desc: BFQ v7 patch 2 for 3.13: BFQ Scheduler
14429 +Desc: BFQ v7r1 patch 2 for 3.13: BFQ Scheduler
14430
14431 -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7-for-3.13.0.patch
14432 +Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r1-for-3.13.0.patch
14433 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
14434 -Desc: BFQ v7 patch 3 for 3.13: Early Queue Merge (EQM)
14435 +Desc: BFQ v7r1 patch 3 for 3.13: Early Queue Merge (EQM)
14436
14437 Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
14438 From: https://github.com/graysky2/kernel_gcc_patch/
14439
14440 Added: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
14441 ===================================================================
14442 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch (rev 0)
14443 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666)
14444 @@ -0,0 +1,104 @@
14445 +From ae1b820a5286601aa9d5426459f8f3de658342b4 Mon Sep 17 00:00:00 2001
14446 +From: Paolo Valente <paolo.valente@×××××××.it>
14447 +Date: Tue, 3 Sep 2013 16:50:42 +0200
14448 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r1-3.13
14449 +
14450 +Update Kconfig.iosched and do the related Makefile changes to include
14451 +kernel configuration options for BFQ. Also add the bfqio controller
14452 +to the cgroups subsystem.
14453 +
14454 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
14455 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
14456 +---
14457 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
14458 + block/Makefile | 1 +
14459 + include/linux/cgroup_subsys.h | 4 ++++
14460 + 3 files changed, 37 insertions(+)
14461 +
14462 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
14463 +index 421bef9..8f552ba 100644
14464 +--- a/block/Kconfig.iosched
14465 ++++ b/block/Kconfig.iosched
14466 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
14467 + ---help---
14468 + Enable group IO scheduling in CFQ.
14469 +
14470 ++config IOSCHED_BFQ
14471 ++ tristate "BFQ I/O scheduler"
14472 ++ default n
14473 ++ ---help---
14474 ++ The BFQ I/O scheduler tries to distribute bandwidth among
14475 ++ all processes according to their weights.
14476 ++ It aims at distributing the bandwidth as desired, independently of
14477 ++ the disk parameters and with any workload. It also tries to
14478 ++ guarantee low latency to interactive and soft real-time
14479 ++ applications. If compiled built-in (saying Y here), BFQ can
14480 ++ be configured to support hierarchical scheduling.
14481 ++
14482 ++config CGROUP_BFQIO
14483 ++ bool "BFQ hierarchical scheduling support"
14484 ++ depends on CGROUPS && IOSCHED_BFQ=y
14485 ++ default n
14486 ++ ---help---
14487 ++ Enable hierarchical scheduling in BFQ, using the cgroups
14488 ++ filesystem interface. The name of the subsystem will be
14489 ++ bfqio.
14490 ++
14491 + choice
14492 + prompt "Default I/O scheduler"
14493 + default DEFAULT_CFQ
14494 +@@ -52,6 +73,16 @@ choice
14495 + config DEFAULT_CFQ
14496 + bool "CFQ" if IOSCHED_CFQ=y
14497 +
14498 ++ config DEFAULT_BFQ
14499 ++ bool "BFQ" if IOSCHED_BFQ=y
14500 ++ help
14501 ++ Selects BFQ as the default I/O scheduler which will be
14502 ++ used by default for all block devices.
14503 ++ The BFQ I/O scheduler aims at distributing the bandwidth
14504 ++ as desired, independently of the disk parameters and with
14505 ++ any workload. It also tries to guarantee low latency to
14506 ++ interactive and soft real-time applications.
14507 ++
14508 + config DEFAULT_NOOP
14509 + bool "No-op"
14510 +
14511 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
14512 + string
14513 + default "deadline" if DEFAULT_DEADLINE
14514 + default "cfq" if DEFAULT_CFQ
14515 ++ default "bfq" if DEFAULT_BFQ
14516 + default "noop" if DEFAULT_NOOP
14517 +
14518 + endmenu
14519 +diff --git a/block/Makefile b/block/Makefile
14520 +index 20645e8..cbd83fb 100644
14521 +--- a/block/Makefile
14522 ++++ b/block/Makefile
14523 +@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
14524 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
14525 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14526 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
14527 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
14528 +
14529 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
14530 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
14531 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
14532 +index b613ffd..43c5dc9 100644
14533 +--- a/include/linux/cgroup_subsys.h
14534 ++++ b/include/linux/cgroup_subsys.h
14535 +@@ -39,6 +39,10 @@ SUBSYS(net_cls)
14536 + SUBSYS(blkio)
14537 + #endif
14538 +
14539 ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
14540 ++SUBSYS(bfqio)
14541 ++#endif
14542 ++
14543 + #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
14544 + SUBSYS(perf)
14545 + #endif
14546 +--
14547 +1.8.5.2
14548 +
14549
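The cgroup_subsys.h hunk above is what makes the new controller visible to the cgroup core: with CONFIG_CGROUP_BFQIO enabled, SUBSYS(bfqio) is expanded by the cgroup headers into, among other things, a bfqio_subsys_id enumerator, which the bfq-cgroup.c code added by the next patch relies on (for example in task_css(current, bfqio_subsys_id) and in the .subsys_id field of bfqio_subsys). Roughly, and only as an illustration of the mechanism rather than the exact 3.13 header text, the expansion looks like this:

/* Illustration of the SUBSYS() expansion; the real cgroup.h differs in detail. */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>	/* SUBSYS(blkio), SUBSYS(bfqio), ... */
	CGROUP_SUBSYS_COUNT,
};
#undef SUBSYS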
14550 Added: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
14551 ===================================================================
14552 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 (rev 0)
14553 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666)
14554 @@ -0,0 +1,6040 @@
14555 +From be5107dc591f7ae692ca7cceecbba72e4c174c37 Mon Sep 17 00:00:00 2001
14556 +From: Paolo Valente <paolo.valente@×××××××.it>
14557 +Date: Thu, 9 May 2013 19:10:02 +0200
14558 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r1 I/O sched for 3.13
14559 +
14560 +Add the BFQ-v7r1 I/O scheduler to 3.13.
14561 +The general structure is borrowed from CFQ, as much of the code for
14562 +handling I/O contexts. Over time, several useful features have been
14563 +ported from CFQ as well (details in the changelog in README.BFQ). A
14564 +(bfq_)queue is associated to each task doing I/O on a device, and each
14565 +time a scheduling decision has to be made a queue is selected and served
14566 +until it expires.
14567 +
14568 + - Slices are given in the service domain: tasks are assigned
14569 + budgets, measured in number of sectors. Once granted the disk, a task
14570 + must however consume its assigned budget within a configurable
14571 + maximum time (by default, the maximum possible value of the
14572 + budgets is automatically computed to comply with this timeout).
14573 + This allows the desired latency vs "throughput boosting" tradeoff
14574 + to be set.
14575 +
14576 + - Budgets are scheduled according to a variant of WF2Q+, implemented
14577 + using an augmented rb-tree to take eligibility into account while
14578 + preserving an O(log N) overall complexity.
14579 +
14580 + - A low-latency tunable is provided; if enabled, both interactive
14581 + and soft real-time applications are guaranteed a very low latency.
14582 +
14583 + - Latency guarantees are preserved also in the presence of NCQ.
14584 +
14585 + - Also with flash-based devices, a high throughput is achieved
14586 + while still preserving latency guarantees.
14587 +
14588 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
14589 + cooperating-queue-merging and the preemption mechanisms present
14590 + in CFQ. EQM is in fact a unified mechanism that tries to get a
14591 + sequential read pattern, and hence a high throughput, with any
14592 + set of processes performing interleaved I/O over a contiguous
14593 + sequence of sectors.
14594 +
14595 + - BFQ supports full hierarchical scheduling, exporting a cgroups
14596 + interface. Since each node has a full scheduler, each group can
14597 + be assigned its own weight.
14598 +
14599 + - If the cgroups interface is not used, only I/O priorities can be
14600 + assigned to processes, with ioprio values mapped to weights
14601 + with the relation weight = IOPRIO_BE_NR - ioprio.
14602 +
14603 + - ioprio classes are served in strict priority order, i.e., lower
14604 + priority queues are not served as long as there are higher
14605 + priority queues. Among queues in the same class the bandwidth is
14606 + distributed in proportion to the weight of each queue. A very
14607 + thin extra bandwidth is however guaranteed to the Idle class, to
14608 + prevent it from starving.
14609 +
14610 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
14611 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
14612 +---
14613 + block/bfq-cgroup.c | 911 ++++++++++++++
14614 + block/bfq-ioc.c | 36 +
14615 + block/bfq-iosched.c | 3298 +++++++++++++++++++++++++++++++++++++++++++++++++++
14616 + block/bfq-sched.c | 1078 +++++++++++++++++
14617 + block/bfq.h | 614 ++++++++++
14618 + 5 files changed, 5937 insertions(+)
14619 + create mode 100644 block/bfq-cgroup.c
14620 + create mode 100644 block/bfq-ioc.c
14621 + create mode 100644 block/bfq-iosched.c
14622 + create mode 100644 block/bfq-sched.c
14623 + create mode 100644 block/bfq.h
14624 +
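As the changelog above notes, when the cgroups interface is not used the per-process ioprio values are mapped to weights with weight = IOPRIO_BE_NR - ioprio. With IOPRIO_BE_NR equal to 8, as in kernels of this era, the relation amounts to the tiny helper sketched below (illustrative only; the patch defines its own bfq_ioprio_to_weight() helper):

/* Sketch of the ioprio-to-weight relation from the changelog, assuming
 * IOPRIO_BE_NR == 8: ioprio 0 (highest) -> weight 8, ioprio 7 -> weight 1. */
static inline int sketch_ioprio_to_weight(int ioprio)
{
	return 8 /* IOPRIO_BE_NR */ - ioprio;
}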
14625 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
14626 +new file mode 100644
14627 +index 0000000..79a288a
14628 +--- /dev/null
14629 ++++ b/block/bfq-cgroup.c
14630 +@@ -0,0 +1,911 @@
14631 ++/*
14632 ++ * BFQ: CGROUPS support.
14633 ++ *
14634 ++ * Based on ideas and code from CFQ:
14635 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
14636 ++ *
14637 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
14638 ++ * Paolo Valente <paolo.valente@×××××××.it>
14639 ++ *
14640 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
14641 ++ *
14642 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
14643 ++ */
14644 ++
14645 ++#ifdef CONFIG_CGROUP_BFQIO
14646 ++
14647 ++static DEFINE_MUTEX(bfqio_mutex);
14648 ++
14649 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
14650 ++{
14651 ++ return bgrp ? !bgrp->online : false;
14652 ++}
14653 ++
14654 ++static struct bfqio_cgroup bfqio_root_cgroup = {
14655 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
14656 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
14657 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
14658 ++};
14659 ++
14660 ++static inline void bfq_init_entity(struct bfq_entity *entity,
14661 ++ struct bfq_group *bfqg)
14662 ++{
14663 ++ entity->weight = entity->new_weight;
14664 ++ entity->orig_weight = entity->new_weight;
14665 ++ entity->ioprio = entity->new_ioprio;
14666 ++ entity->ioprio_class = entity->new_ioprio_class;
14667 ++ entity->parent = bfqg->my_entity;
14668 ++ entity->sched_data = &bfqg->sched_data;
14669 ++}
14670 ++
14671 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
14672 ++{
14673 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
14674 ++}
14675 ++
14676 ++/*
14677 ++ * Search the hash table (for now just a list) of bgrp for the bfq_group
14678 ++ * associated with bfqd. Must be called under rcu_read_lock().
14679 ++ */
14680 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
14681 ++ struct bfq_data *bfqd)
14682 ++{
14683 ++ struct bfq_group *bfqg;
14684 ++ void *key;
14685 ++
14686 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
14687 ++ key = rcu_dereference(bfqg->bfqd);
14688 ++ if (key == bfqd)
14689 ++ return bfqg;
14690 ++ }
14691 ++
14692 ++ return NULL;
14693 ++}
14694 ++
14695 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
14696 ++ struct bfq_group *bfqg)
14697 ++{
14698 ++ struct bfq_entity *entity = &bfqg->entity;
14699 ++
14700 ++ /*
14701 ++ * If the weight of the entity has never been set via the sysfs
14702 ++ * interface, then bgrp->weight == 0. In this case we initialize
14703 ++ * the weight from the current ioprio value. Otherwise, the group
14704 ++ * weight, if set, has priority over the ioprio value.
14705 ++ */
14706 ++ if (bgrp->weight == 0) {
14707 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
14708 ++ entity->new_ioprio = bgrp->ioprio;
14709 ++ } else {
14710 ++ entity->new_weight = bgrp->weight;
14711 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
14712 ++ }
14713 ++ entity->orig_weight = entity->weight = entity->new_weight;
14714 ++ entity->ioprio = entity->new_ioprio;
14715 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
14716 ++ entity->my_sched_data = &bfqg->sched_data;
14717 ++}
14718 ++
14719 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
14720 ++ struct bfq_group *parent)
14721 ++{
14722 ++ struct bfq_entity *entity;
14723 ++
14724 ++ BUG_ON(parent == NULL);
14725 ++ BUG_ON(bfqg == NULL);
14726 ++
14727 ++ entity = &bfqg->entity;
14728 ++ entity->parent = parent->my_entity;
14729 ++ entity->sched_data = &parent->sched_data;
14730 ++}
14731 ++
14732 ++/**
14733 ++ * bfq_group_chain_alloc - allocate a chain of groups.
14734 ++ * @bfqd: queue descriptor.
14735 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
14736 ++ *
14737 ++ * Allocate a chain of groups starting from the one belonging to
14738 ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
14739 ++ * to the root already has an allocated group on @bfqd.
14740 ++ */
14741 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
14742 ++ struct cgroup_subsys_state *css)
14743 ++{
14744 ++ struct bfqio_cgroup *bgrp;
14745 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
14746 ++
14747 ++ for (; css != NULL; css = css->parent) {
14748 ++ bgrp = css_to_bfqio(css);
14749 ++
14750 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
14751 ++ if (bfqg != NULL) {
14752 ++ /*
14753 ++ * All the cgroups in the path from there to the
14754 ++ * root must have a bfq_group for bfqd, so we don't
14755 ++ * need any more allocations.
14756 ++ */
14757 ++ break;
14758 ++ }
14759 ++
14760 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
14761 ++ if (bfqg == NULL)
14762 ++ goto cleanup;
14763 ++
14764 ++ bfq_group_init_entity(bgrp, bfqg);
14765 ++ bfqg->my_entity = &bfqg->entity;
14766 ++
14767 ++ if (leaf == NULL) {
14768 ++ leaf = bfqg;
14769 ++ prev = leaf;
14770 ++ } else {
14771 ++ bfq_group_set_parent(prev, bfqg);
14772 ++ /*
14773 ++ * Build a list of allocated nodes using the bfqd
14774 ++ * field, which is still unused and will be initialized
14775 ++ * only after the node is connected.
14776 ++ */
14777 ++ prev->bfqd = bfqg;
14778 ++ prev = bfqg;
14779 ++ }
14780 ++ }
14781 ++
14782 ++ return leaf;
14783 ++
14784 ++cleanup:
14785 ++ while (leaf != NULL) {
14786 ++ prev = leaf;
14787 ++ leaf = leaf->bfqd;
14788 ++ kfree(prev);
14789 ++ }
14790 ++
14791 ++ return NULL;
14792 ++}
14793 ++
14794 ++/**
14795 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
14796 ++ * @bfqd: the queue descriptor.
14797 ++ * @css: the leaf cgroup_subsys_state to start from.
14798 ++ * @leaf: the leaf group (to be associated to @cgroup).
14799 ++ *
14800 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
14801 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
14802 ++ * hierarchy that already as a group associated to @bfqd all the nodes
14803 ++ * hierarchy that already has a group associated to @bfqd, all the nodes
14804 ++ *
14805 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
14806 ++ * per device) while the bfqio_cgroup lock protects the list of groups
14807 ++ * belonging to the same cgroup.
14808 ++ */
14809 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
14810 ++ struct cgroup_subsys_state *css,
14811 ++ struct bfq_group *leaf)
14812 ++{
14813 ++ struct bfqio_cgroup *bgrp;
14814 ++ struct bfq_group *bfqg, *next, *prev = NULL;
14815 ++ unsigned long flags;
14816 ++
14817 ++ assert_spin_locked(bfqd->queue->queue_lock);
14818 ++
14819 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
14820 ++ bgrp = css_to_bfqio(css);
14821 ++ next = leaf->bfqd;
14822 ++
14823 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
14824 ++ BUG_ON(bfqg != NULL);
14825 ++
14826 ++ spin_lock_irqsave(&bgrp->lock, flags);
14827 ++
14828 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
14829 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
14830 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
14831 ++
14832 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
14833 ++
14834 ++ prev = leaf;
14835 ++ leaf = next;
14836 ++ }
14837 ++
14838 ++ BUG_ON(css == NULL && leaf != NULL);
14839 ++ if (css != NULL && prev != NULL) {
14840 ++ bgrp = css_to_bfqio(css);
14841 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
14842 ++ bfq_group_set_parent(prev, bfqg);
14843 ++ }
14844 ++}
14845 ++
14846 ++/**
14847 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
14848 ++ * @bfqd: queue descriptor.
14849 ++ * @cgroup: cgroup being searched for.
14850 ++ *
14851 ++ * Return a group associated to @bfqd in @cgroup, allocating one if
14852 ++ * necessary. When a group is returned all the cgroups in the path
14853 ++ * to the root have a group associated to @bfqd.
14854 ++ *
14855 ++ * If the allocation fails, return the root group: this breaks guarantees
14856 ++ * but is a safe fallback. If this loss becomes a problem it can be
14857 ++ * mitigated using the equivalent weight (given by the product of the
14858 ++ * weights of the groups in the path from @group to the root) in the
14859 ++ * root scheduler.
14860 ++ *
14861 ++ * We allocate all the missing nodes in the path from the leaf cgroup
14862 ++ * to the root and we connect the nodes only after all the allocations
14863 ++ * have been successful.
14864 ++ */
14865 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
14866 ++ struct cgroup_subsys_state *css)
14867 ++{
14868 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14869 ++ struct bfq_group *bfqg;
14870 ++
14871 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
14872 ++ if (bfqg != NULL)
14873 ++ return bfqg;
14874 ++
14875 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
14876 ++ if (bfqg != NULL)
14877 ++ bfq_group_chain_link(bfqd, css, bfqg);
14878 ++ else
14879 ++ bfqg = bfqd->root_group;
14880 ++
14881 ++ return bfqg;
14882 ++}
14883 ++
14884 ++/**
14885 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
14886 ++ * @bfqd: queue descriptor.
14887 ++ * @bfqq: the queue to move.
14888 ++ * @entity: @bfqq's entity.
14889 ++ * @bfqg: the group to move to.
14890 ++ *
14891 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
14892 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
14893 ++ *
14894 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
14895 ++ * not disappear (by now this just means that we are called under
14896 ++ * rcu_read_lock()).
14897 ++ */
14898 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14899 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
14900 ++{
14901 ++ int busy, resume;
14902 ++
14903 ++ busy = bfq_bfqq_busy(bfqq);
14904 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
14905 ++
14906 ++ BUG_ON(resume && !entity->on_st);
14907 ++ BUG_ON(busy && !resume && entity->on_st &&
14908 ++ bfqq != bfqd->in_service_queue);
14909 ++
14910 ++ if (busy) {
14911 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
14912 ++
14913 ++ if (!resume)
14914 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
14915 ++ else
14916 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
14917 ++ } else if (entity->on_st)
14918 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
14919 ++
14920 ++ /*
14921 ++ * Here we use a reference to bfqg. We don't need a refcounter
14922 ++ * as the cgroup reference will not be dropped, so that its
14923 ++ * destroy() callback will not be invoked.
14924 ++ */
14925 ++ entity->parent = bfqg->my_entity;
14926 ++ entity->sched_data = &bfqg->sched_data;
14927 ++
14928 ++ if (busy && resume)
14929 ++ bfq_activate_bfqq(bfqd, bfqq);
14930 ++
14931 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
14932 ++ bfq_schedule_dispatch(bfqd);
14933 ++}
14934 ++
14935 ++/**
14936 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
14937 ++ * @bfqd: the queue descriptor.
14938 ++ * @bic: the bic to move.
14939 ++ * @cgroup: the cgroup to move to.
14940 ++ *
14941 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
14942 ++ * has to make sure that the reference to cgroup is valid across the call.
14943 ++ *
14944 ++ * NOTE: an alternative approach might have been to store the current
14945 ++ * cgroup in bfqq and getting a reference to it, reducing the lookup
14946 ++ * time here, at the price of slightly more complex code.
14947 ++ */
14948 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14949 ++ struct bfq_io_cq *bic,
14950 ++ struct cgroup_subsys_state *css)
14951 ++{
14952 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
14953 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
14954 ++ struct bfq_entity *entity;
14955 ++ struct bfq_group *bfqg;
14956 ++ struct bfqio_cgroup *bgrp;
14957 ++
14958 ++ bgrp = css_to_bfqio(css);
14959 ++
14960 ++ bfqg = bfq_find_alloc_group(bfqd, css);
14961 ++ if (async_bfqq != NULL) {
14962 ++ entity = &async_bfqq->entity;
14963 ++
14964 ++ if (entity->sched_data != &bfqg->sched_data) {
14965 ++ bic_set_bfqq(bic, NULL, 0);
14966 ++ bfq_log_bfqq(bfqd, async_bfqq,
14967 ++ "bic_change_group: %p %d",
14968 ++ async_bfqq, atomic_read(&async_bfqq->ref));
14969 ++ bfq_put_queue(async_bfqq);
14970 ++ }
14971 ++ }
14972 ++
14973 ++ if (sync_bfqq != NULL) {
14974 ++ entity = &sync_bfqq->entity;
14975 ++ if (entity->sched_data != &bfqg->sched_data)
14976 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
14977 ++ }
14978 ++
14979 ++ return bfqg;
14980 ++}
14981 ++
14982 ++/**
14983 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
14984 ++ * @bic: the bic being migrated.
14985 ++ * @cgroup: the destination cgroup.
14986 ++ *
14987 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
14988 ++ * moved into its new parent group.
14989 ++ */
14990 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
14991 ++ struct cgroup_subsys_state *css)
14992 ++{
14993 ++ struct bfq_data *bfqd;
14994 ++ unsigned long uninitialized_var(flags);
14995 ++
14996 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
14997 ++ &flags);
14998 ++ if (bfqd != NULL) {
14999 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
15000 ++ bfq_put_bfqd_unlock(bfqd, &flags);
15001 ++ }
15002 ++}
15003 ++
15004 ++/**
15005 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
15006 ++ * @bic: the @bic to update.
15007 ++ *
15008 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
15009 ++ * We need this in addition to moving bics during the cgroup attach
15010 ++ * phase because the task owning @bic could be at its first disk
15011 ++ * access or we may end up in the root cgroup as the result of a
15012 ++ * memory allocation failure and here we try to move to the right
15013 ++ * group.
15014 ++ *
15015 ++ * Must be called under the queue lock. It is safe to use the returned
15016 ++ * value even after the rcu_read_unlock() as the migration/destruction
15017 ++ * paths act under the queue lock too. IOW it is impossible to race with
15018 ++ * group migration/destruction and end up with an invalid group as:
15019 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
15020 ++ * has started execution, as current holds a reference to it,
15021 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
15022 ++ * migrated to a different cgroup] its attach() callback will have
15023 ++ * taken care of removing all the references to the old cgroup data.
15024 ++ */
15025 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
15026 ++{
15027 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
15028 ++ struct bfq_group *bfqg;
15029 ++ struct cgroup_subsys_state *css;
15030 ++
15031 ++ BUG_ON(bfqd == NULL);
15032 ++
15033 ++ rcu_read_lock();
15034 ++ css = task_css(current, bfqio_subsys_id);
15035 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
15036 ++ rcu_read_unlock();
15037 ++
15038 ++ return bfqg;
15039 ++}
15040 ++
15041 ++/**
15042 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
15043 ++ * @st: the service tree being flushed.
15044 ++ */
15045 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
15046 ++{
15047 ++ struct bfq_entity *entity = st->first_idle;
15048 ++
15049 ++ for (; entity != NULL; entity = st->first_idle)
15050 ++ __bfq_deactivate_entity(entity, 0);
15051 ++}
15052 ++
15053 ++/**
15054 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
15055 ++ * @bfqd: the device data structure with the root group.
15056 ++ * @entity: the entity to move.
15057 ++ */
15058 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
15059 ++ struct bfq_entity *entity)
15060 ++{
15061 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
15062 ++
15063 ++ BUG_ON(bfqq == NULL);
15064 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
15065 ++ return;
15066 ++}
15067 ++
15068 ++/**
15069 ++ * bfq_reparent_active_entities - move to the root group all active entities.
15070 ++ * @bfqd: the device data structure with the root group.
15071 ++ * @bfqg: the group to move from.
15072 ++ * @st: the service tree with the entities.
15073 ++ *
15074 ++ * Needs queue_lock to be taken and reference to be valid over the call.
15075 ++ */
15076 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
15077 ++ struct bfq_group *bfqg,
15078 ++ struct bfq_service_tree *st)
15079 ++{
15080 ++ struct rb_root *active = &st->active;
15081 ++ struct bfq_entity *entity = NULL;
15082 ++
15083 ++ if (!RB_EMPTY_ROOT(&st->active))
15084 ++ entity = bfq_entity_of(rb_first(active));
15085 ++
15086 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
15087 ++ bfq_reparent_leaf_entity(bfqd, entity);
15088 ++
15089 ++ if (bfqg->sched_data.in_service_entity != NULL)
15090 ++ bfq_reparent_leaf_entity(bfqd,
15091 ++ bfqg->sched_data.in_service_entity);
15092 ++
15093 ++ return;
15094 ++}
15095 ++
15096 ++/**
15097 ++ * bfq_destroy_group - destroy @bfqg.
15098 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
15099 ++ * @bfqg: the group being destroyed.
15100 ++ *
15101 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
15102 ++ */
15103 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
15104 ++{
15105 ++ struct bfq_data *bfqd;
15106 ++ struct bfq_service_tree *st;
15107 ++ struct bfq_entity *entity = bfqg->my_entity;
15108 ++ unsigned long uninitialized_var(flags);
15109 ++ int i;
15110 ++
15111 ++ hlist_del(&bfqg->group_node);
15112 ++
15113 ++ /*
15114 ++ * Empty all service_trees belonging to this group before deactivating
15115 ++ * the group itself.
15116 ++ */
15117 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
15118 ++ st = bfqg->sched_data.service_tree + i;
15119 ++
15120 ++ /*
15121 ++ * The idle tree may still contain bfq_queues belonging
15122 ++ * to exited tasks because they never migrated to a different
15123 ++ * cgroup from the one being destroyed now. No one else
15124 ++ * can access them so it's safe to act without any lock.
15125 ++ */
15126 ++ bfq_flush_idle_tree(st);
15127 ++
15128 ++ /*
15129 ++ * It may happen that some queues are still active
15130 ++ * (busy) upon group destruction (if the corresponding
15131 ++ * processes have been forced to terminate). We move
15132 ++ * all the leaf entities corresponding to these queues
15133 ++ * to the root_group.
15134 ++ * Also, it may happen that the group has an entity
15135 ++ * under service, which is disconnected from the active
15136 ++ * tree: it must be moved, too.
15137 ++ * There is no need to put the sync queues, as the
15138 ++ * scheduler has taken no reference.
15139 ++ */
15140 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
15141 ++ if (bfqd != NULL) {
15142 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
15143 ++ bfq_put_bfqd_unlock(bfqd, &flags);
15144 ++ }
15145 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
15146 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
15147 ++ }
15148 ++ BUG_ON(bfqg->sched_data.next_in_service != NULL);
15149 ++ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
15150 ++
15151 ++ /*
15152 ++ * We may race with device destruction, take extra care when
15153 ++ * dereferencing bfqg->bfqd.
15154 ++ */
15155 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
15156 ++ if (bfqd != NULL) {
15157 ++ hlist_del(&bfqg->bfqd_node);
15158 ++ __bfq_deactivate_entity(entity, 0);
15159 ++ bfq_put_async_queues(bfqd, bfqg);
15160 ++ bfq_put_bfqd_unlock(bfqd, &flags);
15161 ++ }
15162 ++ BUG_ON(entity->tree != NULL);
15163 ++
15164 ++ /*
15165 ++ * No need to defer the kfree() to the end of the RCU grace
15166 ++ * period: we are called from the destroy() callback of our
15167 ++ * cgroup, so we can be sure that no one is a) still using
15168 ++ * this cgroup or b) doing lookups in it.
15169 ++ */
15170 ++ kfree(bfqg);
15171 ++}
15172 ++
15173 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
15174 ++{
15175 ++ struct hlist_node *tmp;
15176 ++ struct bfq_group *bfqg;
15177 ++
15178 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
15179 ++ bfq_end_raising_async_queues(bfqd, bfqg);
15180 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
15181 ++}
15182 ++
15183 ++/**
15184 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
15185 ++ * @bfqd: the device descriptor being exited.
15186 ++ *
15187 ++ * When the device exits we just make sure that no lookup can return
15188 ++ * the now unused group structures. They will be deallocated on cgroup
15189 ++ * destruction.
15190 ++ */
15191 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
15192 ++{
15193 ++ struct hlist_node *tmp;
15194 ++ struct bfq_group *bfqg;
15195 ++
15196 ++ bfq_log(bfqd, "disconnect_groups beginning");
15197 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
15198 ++ hlist_del(&bfqg->bfqd_node);
15199 ++
15200 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
15201 ++
15202 ++ /*
15203 ++ * Don't remove from the group hash, just set an
15204 ++ * invalid key. No lookups can race with the
15205 ++ * assignment as bfqd is being destroyed; this
15206 ++ * implies also that new elements cannot be added
15207 ++ * to the list.
15208 ++ */
15209 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
15210 ++
15211 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
15212 ++ bfqg);
15213 ++ bfq_put_async_queues(bfqd, bfqg);
15214 ++ }
15215 ++}
15216 ++
15217 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
15218 ++{
15219 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
15220 ++ struct bfq_group *bfqg = bfqd->root_group;
15221 ++
15222 ++ bfq_put_async_queues(bfqd, bfqg);
15223 ++
15224 ++ spin_lock_irq(&bgrp->lock);
15225 ++ hlist_del_rcu(&bfqg->group_node);
15226 ++ spin_unlock_irq(&bgrp->lock);
15227 ++
15228 ++ /*
15229 ++ * No need to synchronize_rcu() here: since the device is gone
15230 ++ * there cannot be any read-side access to its root_group.
15231 ++ */
15232 ++ kfree(bfqg);
15233 ++}
15234 ++
15235 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
15236 ++{
15237 ++ struct bfq_group *bfqg;
15238 ++ struct bfqio_cgroup *bgrp;
15239 ++ int i;
15240 ++
15241 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
15242 ++ if (bfqg == NULL)
15243 ++ return NULL;
15244 ++
15245 ++ bfqg->entity.parent = NULL;
15246 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
15247 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
15248 ++
15249 ++ bgrp = &bfqio_root_cgroup;
15250 ++ spin_lock_irq(&bgrp->lock);
15251 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
15252 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
15253 ++ spin_unlock_irq(&bgrp->lock);
15254 ++
15255 ++ return bfqg;
15256 ++}
15257 ++
15258 ++#define SHOW_FUNCTION(__VAR) \
15259 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
15260 ++ struct cftype *cftype) \
15261 ++{ \
15262 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
15263 ++ u64 ret = -ENODEV; \
15264 ++ \
15265 ++ mutex_lock(&bfqio_mutex); \
15266 ++ if (bfqio_is_removed(bgrp)) \
15267 ++ goto out_unlock; \
15268 ++ \
15269 ++ spin_lock_irq(&bgrp->lock); \
15270 ++ ret = bgrp->__VAR; \
15271 ++ spin_unlock_irq(&bgrp->lock); \
15272 ++ \
15273 ++out_unlock: \
15274 ++ mutex_unlock(&bfqio_mutex); \
15275 ++ return ret; \
15276 ++}
15277 ++
15278 ++SHOW_FUNCTION(weight);
15279 ++SHOW_FUNCTION(ioprio);
15280 ++SHOW_FUNCTION(ioprio_class);
15281 ++#undef SHOW_FUNCTION
15282 ++
15283 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
15284 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
15285 ++ struct cftype *cftype, \
15286 ++ u64 val) \
15287 ++{ \
15288 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
15289 ++ struct bfq_group *bfqg; \
15290 ++ int ret = -EINVAL; \
15291 ++ \
15292 ++ if (val < (__MIN) || val > (__MAX)) \
15293 ++ return ret; \
15294 ++ \
15295 ++ ret = -ENODEV; \
15296 ++ mutex_lock(&bfqio_mutex); \
15297 ++ if (bfqio_is_removed(bgrp)) \
15298 ++ goto out_unlock; \
15299 ++ ret = 0; \
15300 ++ \
15301 ++ spin_lock_irq(&bgrp->lock); \
15302 ++ bgrp->__VAR = (unsigned short)val; \
15303 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
15304 ++ /* \
15305 ++ * Setting the ioprio_changed flag of the entity \
15306 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
15307 ++ * the value of the weight to its ioprio mapping. \
15308 ++ * Set the flag only if necessary. \
15309 ++ */ \
15310 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
15311 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
15312 ++ smp_wmb(); \
15313 ++ bfqg->entity.ioprio_changed = 1; \
15314 ++ } \
15315 ++ } \
15316 ++ spin_unlock_irq(&bgrp->lock); \
15317 ++ \
15318 ++out_unlock: \
15319 ++ mutex_unlock(&bfqio_mutex); \
15320 ++ return ret; \
15321 ++}
15322 ++
15323 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
15324 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
15325 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
15326 ++#undef STORE_FUNCTION
15327 ++
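To make the macro above easier to follow: STORE_FUNCTION() stamps out one write handler per cgroup attribute, so STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT) expands to roughly the function below (hand-expanded for illustration, with whitespace adjusted and comments added; it is not additional code in the patch):

static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
				     struct cftype *cftype, u64 val)
{
	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
	struct bfq_group *bfqg;
	int ret = -EINVAL;

	/* Reject values outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT]. */
	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
		return ret;

	ret = -ENODEV;
	mutex_lock(&bfqio_mutex);
	if (bfqio_is_removed(bgrp))
		goto out_unlock;
	ret = 0;

	spin_lock_irq(&bgrp->lock);
	bgrp->weight = (unsigned short)val;
	hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) {
		/* Flag the change only when the value actually differs. */
		if ((unsigned short)val != bfqg->entity.new_weight) {
			bfqg->entity.new_weight = (unsigned short)val;
			smp_wmb();
			bfqg->entity.ioprio_changed = 1;
		}
	}
	spin_unlock_irq(&bgrp->lock);

out_unlock:
	mutex_unlock(&bfqio_mutex);
	return ret;
}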
15328 ++static struct cftype bfqio_files[] = {
15329 ++ {
15330 ++ .name = "weight",
15331 ++ .read_u64 = bfqio_cgroup_weight_read,
15332 ++ .write_u64 = bfqio_cgroup_weight_write,
15333 ++ },
15334 ++ {
15335 ++ .name = "ioprio",
15336 ++ .read_u64 = bfqio_cgroup_ioprio_read,
15337 ++ .write_u64 = bfqio_cgroup_ioprio_write,
15338 ++ },
15339 ++ {
15340 ++ .name = "ioprio_class",
15341 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
15342 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
15343 ++ },
15344 ++ { }, /* terminate */
15345 ++};
15346 ++
15347 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
15348 ++ *parent_css)
15349 ++{
15350 ++ struct bfqio_cgroup *bgrp;
15351 ++
15352 ++ if (parent_css != NULL) {
15353 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
15354 ++ if (bgrp == NULL)
15355 ++ return ERR_PTR(-ENOMEM);
15356 ++ } else
15357 ++ bgrp = &bfqio_root_cgroup;
15358 ++
15359 ++ spin_lock_init(&bgrp->lock);
15360 ++ INIT_HLIST_HEAD(&bgrp->group_data);
15361 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
15362 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
15363 ++
15364 ++ return &bgrp->css;
15365 ++}
15366 ++
15367 ++/*
15368 ++ * We cannot support shared io contexts, as we have no means to support
15369 ++ * two tasks with the same ioc in two different groups without major rework
15370 ++ * of the main bic/bfqq data structures. For now we allow a task to change
15371 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
15372 ++ * behavior is that a group containing a task that forked using CLONE_IO
15373 ++ * will not be destroyed until the tasks sharing the ioc die.
15374 ++ */
15375 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
15376 ++ struct cgroup_taskset *tset)
15377 ++{
15378 ++ struct task_struct *task;
15379 ++ struct io_context *ioc;
15380 ++ int ret = 0;
15381 ++
15382 ++ cgroup_taskset_for_each(task, css, tset) {
15383 ++ /*
15384 ++ * task_lock() is needed to avoid races with
15385 ++ * exit_io_context()
15386 ++ */
15387 ++ task_lock(task);
15388 ++ ioc = task->io_context;
15389 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
15390 ++ /*
15391 ++ * ioc == NULL means that the task is either too young
15392 ++ * or exiting: if it has still no ioc the ioc can't be
15393 ++ * shared, if the task is exiting the attach will fail
15394 ++ * anyway, no matter what we return here.
15395 ++ */
15396 ++ ret = -EINVAL;
15397 ++ task_unlock(task);
15398 ++ if (ret)
15399 ++ break;
15400 ++ }
15401 ++
15402 ++ return ret;
15403 ++}
15404 ++
15405 ++static void bfqio_attach(struct cgroup_subsys_state *css,
15406 ++ struct cgroup_taskset *tset)
15407 ++{
15408 ++ struct task_struct *task;
15409 ++ struct io_context *ioc;
15410 ++ struct io_cq *icq;
15411 ++
15412 ++ /*
15413 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
15414 ++ * new group has not yet been tested.
15415 ++ */
15416 ++ cgroup_taskset_for_each(task, css, tset) {
15417 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
15418 ++ if (ioc) {
15419 ++ /*
15420 ++ * Handle cgroup change here.
15421 ++ */
15422 ++ rcu_read_lock();
15423 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
15424 ++ if (!strncmp(
15425 ++ icq->q->elevator->type->elevator_name,
15426 ++ "bfq", ELV_NAME_MAX))
15427 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
15428 ++ css);
15429 ++ rcu_read_unlock();
15430 ++ put_io_context(ioc);
15431 ++ }
15432 ++ }
15433 ++}
15434 ++
15435 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
15436 ++{
15437 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
15438 ++ struct hlist_node *tmp;
15439 ++ struct bfq_group *bfqg;
15440 ++
15441 ++ /*
15442 ++ * Since we are destroying the cgroup, there are no more tasks
15443 ++ * referencing it, and all the RCU grace periods that may have
15444 ++ * referenced it are ended (as the destruction of the parent
15445 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
15446 ++ * anything else and we don't need any synchronization.
15447 ++ */
15448 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
15449 ++ bfq_destroy_group(bgrp, bfqg);
15450 ++
15451 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
15452 ++
15453 ++ kfree(bgrp);
15454 ++}
15455 ++
15456 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
15457 ++{
15458 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
15459 ++
15460 ++ mutex_lock(&bfqio_mutex);
15461 ++ bgrp->online = true;
15462 ++ mutex_unlock(&bfqio_mutex);
15463 ++
15464 ++ return 0;
15465 ++}
15466 ++
15467 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
15468 ++{
15469 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
15470 ++
15471 ++ mutex_lock(&bfqio_mutex);
15472 ++ bgrp->online = false;
15473 ++ mutex_unlock(&bfqio_mutex);
15474 ++}
15475 ++
15476 ++struct cgroup_subsys bfqio_subsys = {
15477 ++ .name = "bfqio",
15478 ++ .css_alloc = bfqio_create,
15479 ++ .css_online = bfqio_css_online,
15480 ++ .css_offline = bfqio_css_offline,
15481 ++ .can_attach = bfqio_can_attach,
15482 ++ .attach = bfqio_attach,
15483 ++ .css_free = bfqio_destroy,
15484 ++ .subsys_id = bfqio_subsys_id,
15485 ++ .base_cftypes = bfqio_files,
15486 ++};
15487 ++#else
15488 ++static inline void bfq_init_entity(struct bfq_entity *entity,
15489 ++ struct bfq_group *bfqg)
15490 ++{
15491 ++ entity->weight = entity->new_weight;
15492 ++ entity->orig_weight = entity->new_weight;
15493 ++ entity->ioprio = entity->new_ioprio;
15494 ++ entity->ioprio_class = entity->new_ioprio_class;
15495 ++ entity->sched_data = &bfqg->sched_data;
15496 ++}
15497 ++
15498 ++static inline struct bfq_group *
15499 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
15500 ++{
15501 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
15502 ++ return bfqd->root_group;
15503 ++}
15504 ++
15505 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
15506 ++ struct bfq_queue *bfqq,
15507 ++ struct bfq_entity *entity,
15508 ++ struct bfq_group *bfqg)
15509 ++{
15510 ++}
15511 ++
15512 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
15513 ++{
15514 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
15515 ++}
15516 ++
15517 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
15518 ++{
15519 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
15520 ++}
15521 ++
15522 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
15523 ++{
15524 ++ kfree(bfqd->root_group);
15525 ++}
15526 ++
15527 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
15528 ++{
15529 ++ struct bfq_group *bfqg;
15530 ++ int i;
15531 ++
15532 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
15533 ++ if (bfqg == NULL)
15534 ++ return NULL;
15535 ++
15536 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
15537 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
15538 ++
15539 ++ return bfqg;
15540 ++}
15541 ++#endif
15542 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
15543 +new file mode 100644
15544 +index 0000000..7f6b000
15545 +--- /dev/null
15546 ++++ b/block/bfq-ioc.c
15547 +@@ -0,0 +1,36 @@
15548 ++/*
15549 ++ * BFQ: I/O context handling.
15550 ++ *
15551 ++ * Based on ideas and code from CFQ:
15552 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
15553 ++ *
15554 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
15555 ++ * Paolo Valente <paolo.valente@×××××××.it>
15556 ++ *
15557 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
15558 ++ */
15559 ++
15560 ++/**
15561 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
15562 ++ * @icq: the iocontext queue.
15563 ++ */
15564 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
15565 ++{
15566 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
15567 ++ return container_of(icq, struct bfq_io_cq, icq);
15568 ++}
15569 ++
15570 ++/**
15571 ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
15572 ++ * @bfqd: the lookup key.
15573 ++ * @ioc: the io_context of the process doing I/O.
15574 ++ *
15575 ++ * Queue lock must be held.
15576 ++ */
15577 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
15578 ++ struct io_context *ioc)
15579 ++{
15580 ++ if (ioc)
15581 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
15582 ++ return NULL;
15583 ++}
15584 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
15585 +new file mode 100644
15586 +index 0000000..eb760de
15587 +--- /dev/null
15588 ++++ b/block/bfq-iosched.c
15589 +@@ -0,0 +1,3298 @@
15590 ++/*
15591 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
15592 ++ *
15593 ++ * Based on ideas and code from CFQ:
15594 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
15595 ++ *
15596 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
15597 ++ * Paolo Valente <paolo.valente@×××××××.it>
15598 ++ *
15599 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
15600 ++ *
15601 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
15602 ++ *
15603 ++ * BFQ is a proportional share disk scheduling algorithm based on the
15604 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
15605 ++ * number of sectors, to tasks instead of time slices. The disk is not granted
15606 ++ * to the in-service task for a given time slice, but until it has exhausted
15607 ++ * its assigned budget. This change from the time to the service domain allows
15608 ++ * BFQ to distribute the disk bandwidth among tasks as desired, without any
15609 ++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
15610 ++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
15611 ++ * their budgets (more precisely BFQ schedules queues associated to tasks).
15612 ++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
15613 ++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
15614 ++ * latencies to interactive and soft real-time applications.
15615 ++ *
15616 ++ * BFQ is described in [1], where also a reference to the initial, more
15617 ++ * theoretical paper on BFQ can be found. The interested reader can find in
15618 ++ * the latter paper full details on the main algorithm as well as formulas of
15619 ++ * the guarantees, plus formal proofs of all the properties. With respect to
15620 ++ * the version of BFQ presented in these papers, this implementation adds a
15621 ++ * few more heuristics, such as the one that guarantees a low latency to soft
15622 ++ * real-time applications, and a hierarchical extension based on H-WF2Q+.
15623 ++ *
15624 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
15625 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
15626 ++ * complexity derives from the one introduced with EEVDF in [3].
15627 ++ *
15628 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
15629 ++ * with the BFQ Disk I/O Scheduler'',
15630 ++ * Proceedings of the 5th Annual International Systems and Storage
15631 ++ * Conference (SYSTOR '12), June 2012.
15632 ++ *
15633 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
15634 ++ *
15635 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
15636 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
15637 ++ * Oct 1997.
15638 ++ *
15639 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
15640 ++ *
15641 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
15642 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
15643 ++ * Resource Allocation,'' technical report.
15644 ++ *
15645 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
15646 ++ */
15647 ++#include <linux/module.h>
15648 ++#include <linux/slab.h>
15649 ++#include <linux/blkdev.h>
15650 ++#include <linux/cgroup.h>
15651 ++#include <linux/elevator.h>
15652 ++#include <linux/jiffies.h>
15653 ++#include <linux/rbtree.h>
15654 ++#include <linux/ioprio.h>
15655 ++#include "bfq.h"
15656 ++#include "blk.h"
15657 ++
15658 ++/* Max number of dispatches in one round of service. */
15659 ++static const int bfq_quantum = 4;
15660 ++
15661 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
15662 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
15663 ++
15664 ++/* Maximum backwards seek, in KiB. */
15665 ++static const int bfq_back_max = 16 * 1024;
15666 ++
15667 ++/* Penalty of a backwards seek, in number of sectors. */
15668 ++static const int bfq_back_penalty = 2;
15669 ++
15670 ++/* Idling period duration, in jiffies. */
15671 ++static int bfq_slice_idle = HZ / 125;
15672 ++
15673 ++/* Default maximum budget values, in sectors and number of requests. */
15674 ++static const int bfq_default_max_budget = 16 * 1024;
15675 ++static const int bfq_max_budget_async_rq = 4;
15676 ++
15677 ++/*
15678 ++ * Async to sync throughput distribution is controlled as follows:
15679 ++ * when an async request is served, the entity is charged the number
15680 ++ * of sectors of the request, multiplied by the factor below.
15681 ++ */
15682 ++static const int bfq_async_charge_factor = 10;
15683 ++
15684 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
15685 ++static const int bfq_timeout_sync = HZ / 8;
15686 ++static int bfq_timeout_async = HZ / 25;
15687 ++
15688 ++struct kmem_cache *bfq_pool;
15689 ++
15690 ++/* Below this threshold (in ms), we consider thinktime immediate. */
15691 ++#define BFQ_MIN_TT 2
15692 ++
15693 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
15694 ++#define BFQ_HW_QUEUE_THRESHOLD 4
15695 ++#define BFQ_HW_QUEUE_SAMPLES 32
15696 ++
15697 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
15698 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
15699 ++
15700 ++/* Min samples used for peak rate estimation (for autotuning). */
15701 ++#define BFQ_PEAK_RATE_SAMPLES 32
15702 ++
15703 ++/* Shift used for peak rate fixed precision calculations. */
15704 ++#define BFQ_RATE_SHIFT 16
15705 ++
15706 ++/*
15707 ++ * The duration of the weight raising for interactive applications is
15708 ++ * computed automatically (as default behaviour), using the following
15709 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
15710 ++ * disk, and R and T are two reference parameters. In particular, R is
15711 ++ * the peak rate of a reference disk, and T is about the maximum time
15712 ++ * for starting popular large applications on that disk, under BFQ and
15713 ++ * while reading two files in parallel. Finally, BFQ uses two
15714 ++ * different pairs (R, T) depending on whether the disk is rotational
15715 ++ * or non-rotational.
15716 ++ */
15717 ++#define T_rot (msecs_to_jiffies(5500))
15718 ++#define T_nonrot (msecs_to_jiffies(2000))
15719 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
15720 ++#define R_rot 17415
15721 ++#define R_nonrot 34791
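++
++/*
++ * As a rough, purely illustrative application of the formula above
++ * (figures assumed, not measured): if the estimated peak rate r of a
++ * rotational drive happens to equal the reference rate R_rot, then
++ * duration = (R_rot / r) * T_rot = T_rot, i.e. about 5.5 seconds of
++ * weight raising; a drive twice as fast as the reference one would
++ * get about half of that, a slower drive proportionally more.
++ */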
15722 ++
15723 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
15724 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
15725 ++
15726 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
15727 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
15728 ++
15729 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
15730 ++
15731 ++#include "bfq-ioc.c"
15732 ++#include "bfq-sched.c"
15733 ++#include "bfq-cgroup.c"
15734 ++
15735 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
15736 ++ IOPRIO_CLASS_IDLE)
15737 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
15738 ++ IOPRIO_CLASS_RT)
15739 ++
15740 ++#define bfq_sample_valid(samples) ((samples) > 80)
15741 ++
15742 ++/*
15743 ++ * We regard a request as SYNC if it is either a read or has the SYNC bit
15744 ++ * set (in which case it could also be a direct WRITE).
15745 ++ */
15746 ++static inline int bfq_bio_sync(struct bio *bio)
15747 ++{
15748 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
15749 ++ return 1;
15750 ++
15751 ++ return 0;
15752 ++}
15753 ++
15754 ++/*
15755 ++ * Schedule a run of the queue if there are requests pending and there is
15756 ++ * nothing in the driver that will restart queueing.
15757 ++ */
15758 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
15759 ++{
15760 ++ if (bfqd->queued != 0) {
15761 ++ bfq_log(bfqd, "schedule dispatch");
15762 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
15763 ++ }
15764 ++}
15765 ++
15766 ++/*
15767 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
15768 ++ * We choose the request that is closest to the head right now. Distance
15769 ++ * behind the head is penalized and only allowed to a certain extent.
15770 ++ */
15771 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
15772 ++ struct request *rq1,
15773 ++ struct request *rq2,
15774 ++ sector_t last)
15775 ++{
15776 ++ sector_t s1, s2, d1 = 0, d2 = 0;
15777 ++ unsigned long back_max;
15778 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
15779 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
15780 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
15781 ++
15782 ++ if (rq1 == NULL || rq1 == rq2)
15783 ++ return rq2;
15784 ++ if (rq2 == NULL)
15785 ++ return rq1;
15786 ++
15787 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
15788 ++ return rq1;
15789 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
15790 ++ return rq2;
15791 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
15792 ++ return rq1;
15793 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
15794 ++ return rq2;
15795 ++
15796 ++ s1 = blk_rq_pos(rq1);
15797 ++ s2 = blk_rq_pos(rq2);
15798 ++
15799 ++ /*
15800 ++ * By definition, 1KiB is 2 sectors.
15801 ++ */
15802 ++ back_max = bfqd->bfq_back_max * 2;
15803 ++
15804 ++ /*
15805 ++ * Strict one way elevator _except_ in the case where we allow
15806 ++ * short backward seeks which are biased as twice the cost of a
15807 ++ * similar forward seek.
15808 ++ */
15809 ++ if (s1 >= last)
15810 ++ d1 = s1 - last;
15811 ++ else if (s1 + back_max >= last)
15812 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
15813 ++ else
15814 ++ wrap |= BFQ_RQ1_WRAP;
15815 ++
15816 ++ if (s2 >= last)
15817 ++ d2 = s2 - last;
15818 ++ else if (s2 + back_max >= last)
15819 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
15820 ++ else
15821 ++ wrap |= BFQ_RQ2_WRAP;
15822 ++
15823 ++ /* Found required data */
15824 ++
15825 ++ /*
15826 ++ * By doing switch() on the bit mask "wrap" we avoid having to
15827 ++ * check two variables for all permutations: --> faster!
15828 ++ */
15829 ++ switch (wrap) {
15830 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
15831 ++ if (d1 < d2)
15832 ++ return rq1;
15833 ++ else if (d2 < d1)
15834 ++ return rq2;
15835 ++ else {
15836 ++ if (s1 >= s2)
15837 ++ return rq1;
15838 ++ else
15839 ++ return rq2;
15840 ++ }
15841 ++
15842 ++ case BFQ_RQ2_WRAP:
15843 ++ return rq1;
15844 ++ case BFQ_RQ1_WRAP:
15845 ++ return rq2;
15846 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
15847 ++ default:
15848 ++ /*
15849 ++ * Since both rqs are wrapped,
15850 ++ * start with the one that's further behind head
15851 ++ * (--> only *one* back seek required),
15852 ++ * since back seek takes more time than forward.
15853 ++ */
15854 ++ if (s1 <= s2)
15855 ++ return rq1;
15856 ++ else
15857 ++ return rq2;
15858 ++ }
15859 ++}
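++
++/*
++ * Worked example for the distances computed above, using the default
++ * bfq_back_max (16384 KiB, i.e. back_max = 32768 sectors) and
++ * bfq_back_penalty (2), with hypothetical positions: if rq1 lies 1000
++ * sectors behind the head and rq2 lies 1500 sectors ahead, then
++ * d1 = 1000 * 2 = 2000 and d2 = 1500, so rq2 is chosen; had rq2 been
++ * 2500 sectors ahead, d2 = 2500 > d1 and the short backward seek to
++ * rq1 would have won instead.
++ */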
15860 ++
15861 ++static struct bfq_queue *
15862 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
15863 ++ sector_t sector, struct rb_node **ret_parent,
15864 ++ struct rb_node ***rb_link)
15865 ++{
15866 ++ struct rb_node **p, *parent;
15867 ++ struct bfq_queue *bfqq = NULL;
15868 ++
15869 ++ parent = NULL;
15870 ++ p = &root->rb_node;
15871 ++ while (*p) {
15872 ++ struct rb_node **n;
15873 ++
15874 ++ parent = *p;
15875 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
15876 ++
15877 ++ /*
15878 ++ * Sort strictly based on sector. Smallest to the left,
15879 ++ * largest to the right.
15880 ++ */
15881 ++ if (sector > blk_rq_pos(bfqq->next_rq))
15882 ++ n = &(*p)->rb_right;
15883 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
15884 ++ n = &(*p)->rb_left;
15885 ++ else
15886 ++ break;
15887 ++ p = n;
15888 ++ bfqq = NULL;
15889 ++ }
15890 ++
15891 ++ *ret_parent = parent;
15892 ++ if (rb_link)
15893 ++ *rb_link = p;
15894 ++
15895 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
15896 ++ (long long unsigned)sector,
15897 ++ bfqq != NULL ? bfqq->pid : 0);
15898 ++
15899 ++ return bfqq;
15900 ++}
15901 ++
15902 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
15903 ++{
15904 ++ struct rb_node **p, *parent;
15905 ++ struct bfq_queue *__bfqq;
15906 ++
15907 ++ if (bfqq->pos_root != NULL) {
15908 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
15909 ++ bfqq->pos_root = NULL;
15910 ++ }
15911 ++
15912 ++ if (bfq_class_idle(bfqq))
15913 ++ return;
15914 ++ if (!bfqq->next_rq)
15915 ++ return;
15916 ++
15917 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
15918 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
15919 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
15920 ++ if (__bfqq == NULL) {
15921 ++ rb_link_node(&bfqq->pos_node, parent, p);
15922 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
15923 ++ } else
15924 ++ bfqq->pos_root = NULL;
15925 ++}
15926 ++
15927 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
15928 ++ struct bfq_queue *bfqq,
15929 ++ struct request *last)
15930 ++{
15931 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
15932 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
15933 ++ struct request *next = NULL, *prev = NULL;
15934 ++
15935 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
15936 ++
15937 ++ if (rbprev != NULL)
15938 ++ prev = rb_entry_rq(rbprev);
15939 ++
15940 ++ if (rbnext != NULL)
15941 ++ next = rb_entry_rq(rbnext);
15942 ++ else {
15943 ++ rbnext = rb_first(&bfqq->sort_list);
15944 ++ if (rbnext && rbnext != &last->rb_node)
15945 ++ next = rb_entry_rq(rbnext);
15946 ++ }
15947 ++
15948 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
15949 ++}
15950 ++
15951 ++static void bfq_del_rq_rb(struct request *rq)
15952 ++{
15953 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
15954 ++ struct bfq_data *bfqd = bfqq->bfqd;
15955 ++ const int sync = rq_is_sync(rq);
15956 ++
15957 ++ BUG_ON(bfqq->queued[sync] == 0);
15958 ++ bfqq->queued[sync]--;
15959 ++ bfqd->queued--;
15960 ++
15961 ++ elv_rb_del(&bfqq->sort_list, rq);
15962 ++
15963 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
15964 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
15965 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
15966 ++ /*
15967 ++ * Remove queue from request-position tree as it is empty.
15968 ++ */
15969 ++ if (bfqq->pos_root != NULL) {
15970 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
15971 ++ bfqq->pos_root = NULL;
15972 ++ }
15973 ++ }
15974 ++}
15975 ++
15976 ++/* see the definition of bfq_async_charge_factor for details */
15977 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
15978 ++ struct bfq_queue *bfqq)
15979 ++{
15980 ++ return blk_rq_sectors(rq) *
15981 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
15982 ++ bfq_async_charge_factor));
15983 ++}
15984 ++
15985 ++/**
15986 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
15987 ++ * @bfqd: the device data the queue belongs to.
15988 ++ * @bfqq: the queue to update.
15989 ++ *
15990 ++ * If the first request of a queue changes we make sure that the queue
15991 ++ * has enough budget to serve at least its first request (if the
15992 ++ * request has grown). We do this because if the queue has not enough
15993 ++ * budget for its first request, it has to go through two dispatch
15994 ++ * rounds to actually get it dispatched.
15995 ++ */
15996 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
15997 ++ struct bfq_queue *bfqq)
15998 ++{
15999 ++ struct bfq_entity *entity = &bfqq->entity;
16000 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
16001 ++ struct request *next_rq = bfqq->next_rq;
16002 ++ unsigned long new_budget;
16003 ++
16004 ++ if (next_rq == NULL)
16005 ++ return;
16006 ++
16007 ++ if (bfqq == bfqd->in_service_queue)
16008 ++ /*
16009 ++ * In order not to break guarantees, budgets cannot be
16010 ++ * changed after an entity has been selected.
16011 ++ */
16012 ++ return;
16013 ++
16014 ++ BUG_ON(entity->tree != &st->active);
16015 ++ BUG_ON(entity == entity->sched_data->in_service_entity);
16016 ++
16017 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
16018 ++ bfq_serv_to_charge(next_rq, bfqq));
16019 ++ entity->budget = new_budget;
16020 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
16021 ++ bfq_activate_bfqq(bfqd, bfqq);
16022 ++}
16023 ++
16024 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
16025 ++{
16026 ++ u64 dur;
16027 ++
16028 ++ if (bfqd->bfq_raising_max_time > 0)
16029 ++ return bfqd->bfq_raising_max_time;
16030 ++
16031 ++ dur = bfqd->RT_prod;
16032 ++ do_div(dur, bfqd->peak_rate);
16033 ++
16034 ++ return dur;
16035 ++}
16036 ++
16037 ++static void bfq_add_rq_rb(struct request *rq)
16038 ++{
16039 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
16040 ++ struct bfq_entity *entity = &bfqq->entity;
16041 ++ struct bfq_data *bfqd = bfqq->bfqd;
16042 ++ struct request *next_rq, *prev;
16043 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
16044 ++ int idle_for_long_time = 0;
16045 ++
16046 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
16047 ++ bfqq->queued[rq_is_sync(rq)]++;
16048 ++ bfqd->queued++;
16049 ++
16050 ++ elv_rb_add(&bfqq->sort_list, rq);
16051 ++
16052 ++ /*
16053 ++ * Check if this request is a better next-serve candidate.
16054 ++ */
16055 ++ prev = bfqq->next_rq;
16056 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
16057 ++ BUG_ON(next_rq == NULL);
16058 ++ bfqq->next_rq = next_rq;
16059 ++
16060 ++ /*
16061 ++ * Adjust priority tree position, if next_rq changes.
16062 ++ */
16063 ++ if (prev != bfqq->next_rq)
16064 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
16065 ++
16066 ++ if (!bfq_bfqq_busy(bfqq)) {
16067 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
16068 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
16069 ++ idle_for_long_time = time_is_before_jiffies(
16070 ++ bfqq->budget_timeout +
16071 ++ bfqd->bfq_raising_min_idle_time);
16072 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
16073 ++ bfq_serv_to_charge(next_rq, bfqq));
16074 ++
16075 ++ if (!bfqd->low_latency)
16076 ++ goto add_bfqq_busy;
16077 ++
16078 ++ /*
16079 ++ * If the queue is not being boosted and has been idle
16080 ++ * for enough time, start a weight-raising period
16081 ++ */
16082 ++ if (old_raising_coeff == 1 &&
16083 ++ (idle_for_long_time || soft_rt)) {
16084 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
16085 ++ if (idle_for_long_time)
16086 ++ bfqq->raising_cur_max_time =
16087 ++ bfq_wrais_duration(bfqd);
16088 ++ else
16089 ++ bfqq->raising_cur_max_time =
16090 ++ bfqd->bfq_raising_rt_max_time;
16091 ++ bfq_log_bfqq(bfqd, bfqq,
16092 ++ "wrais starting at %lu, "
16093 ++ "rais_max_time %u",
16094 ++ jiffies,
16095 ++ jiffies_to_msecs(bfqq->
16096 ++ raising_cur_max_time));
16097 ++ } else if (old_raising_coeff > 1) {
16098 ++ if (idle_for_long_time)
16099 ++ bfqq->raising_cur_max_time =
16100 ++ bfq_wrais_duration(bfqd);
16101 ++ else if (bfqq->raising_cur_max_time ==
16102 ++ bfqd->bfq_raising_rt_max_time &&
16103 ++ !soft_rt) {
16104 ++ bfqq->raising_coeff = 1;
16105 ++ bfq_log_bfqq(bfqd, bfqq,
16106 ++ "wrais ending at %lu, "
16107 ++ "rais_max_time %u",
16108 ++ jiffies,
16109 ++ jiffies_to_msecs(bfqq->
16110 ++ raising_cur_max_time));
16111 ++ } else if (time_before(
16112 ++ bfqq->last_rais_start_finish +
16113 ++ bfqq->raising_cur_max_time,
16114 ++ jiffies +
16115 ++ bfqd->bfq_raising_rt_max_time) &&
16116 ++ soft_rt) {
16117 ++ /*
16119 ++ * The remaining weight-raising time is lower
16120 ++ * than bfqd->bfq_raising_rt_max_time, which
16121 ++ * means that the application is enjoying
16122 ++ * weight raising either because deemed soft rt
16123 ++ * in the near past, or because deemed
16124 ++ * interactive long ago. In both cases,
16125 ++ * resetting now the current remaining weight-
16126 ++ * raising time for the application to the
16127 ++ * weight-raising duration for soft rt
16128 ++ * applications would not cause any latency
16129 ++ * increase for the application (as the new
16130 ++ * duration would be higher than the remaining
16131 ++ * time).
16132 ++ *
16133 ++ * In addition, the application is now meeting
16134 ++ * the requirements for being deemed soft rt.
16135 ++ * In the end we can correctly and safely
16136 ++ * (re)charge the weight-raising duration for
16137 ++ * the application with the weight-raising
16138 ++ * duration for soft rt applications.
16139 ++ *
16140 ++ * In particular, doing this recharge now, i.e.,
16141 ++ * before the weight-raising period for the
16142 ++ * application finishes, reduces the probability
16143 ++ * of the following negative scenario:
16144 ++ * 1) the weight of a soft rt application is
16145 ++ * raised at startup (as for any newly
16146 ++ * created application),
16147 ++ * 2) since the application is not interactive,
16148 ++ * at a certain time weight-raising is
16149 ++ * stopped for the application,
16150 ++ * 3) at that time the application happens to
16151 ++ * still have pending requests, and hence
16152 ++ * is destined to not have a chance to be
16153 ++ * deemed soft rt before these requests are
16154 ++ * completed (see the comments to the
16155 ++ * function bfq_bfqq_softrt_next_start()
16156 ++ * for details on soft rt detection),
16157 ++ * 4) these pending requests experience a high
16158 ++ * latency because the application is not
16159 ++ * weight-raised while they are pending.
16160 ++ */
16161 ++ bfqq->last_rais_start_finish = jiffies;
16162 ++ bfqq->raising_cur_max_time =
16163 ++ bfqd->bfq_raising_rt_max_time;
16164 ++ }
16165 ++ }
16166 ++ if (old_raising_coeff != bfqq->raising_coeff)
16167 ++ entity->ioprio_changed = 1;
16168 ++add_bfqq_busy:
16169 ++ bfqq->last_idle_bklogged = jiffies;
16170 ++ bfqq->service_from_backlogged = 0;
16171 ++ bfq_clear_bfqq_softrt_update(bfqq);
16172 ++ bfq_add_bfqq_busy(bfqd, bfqq);
16173 ++ } else {
16174 ++ if (bfqd->low_latency && old_raising_coeff == 1 &&
16175 ++ !rq_is_sync(rq) &&
16176 ++ time_is_before_jiffies(
16177 ++ bfqq->last_rais_start_finish +
16178 ++ bfqd->bfq_raising_min_inter_arr_async)) {
16179 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
16180 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
16181 ++
16182 ++ bfqd->raised_busy_queues++;
16183 ++ entity->ioprio_changed = 1;
16184 ++ bfq_log_bfqq(bfqd, bfqq,
16185 ++ "non-idle wrais starting at %lu, "
16186 ++ "rais_max_time %u",
16187 ++ jiffies,
16188 ++ jiffies_to_msecs(bfqq->
16189 ++ raising_cur_max_time));
16190 ++ }
16191 ++ bfq_updated_next_req(bfqd, bfqq);
16192 ++ }
16193 ++
16194 ++ if (bfqd->low_latency &&
16195 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
16196 ++ idle_for_long_time))
16197 ++ bfqq->last_rais_start_finish = jiffies;
16198 ++}
16199 ++
16200 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
16201 ++{
16202 ++ elv_rb_del(&bfqq->sort_list, rq);
16203 ++ bfqq->queued[rq_is_sync(rq)]--;
16204 ++ bfqq->bfqd->queued--;
16205 ++ bfq_add_rq_rb(rq);
16206 ++}
16207 ++
16208 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
16209 ++ struct bio *bio)
16210 ++{
16211 ++ struct task_struct *tsk = current;
16212 ++ struct bfq_io_cq *bic;
16213 ++ struct bfq_queue *bfqq;
16214 ++
16215 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
16216 ++ if (bic == NULL)
16217 ++ return NULL;
16218 ++
16219 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
16220 ++ if (bfqq != NULL)
16221 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
16222 ++
16223 ++ return NULL;
16224 ++}
16225 ++
16226 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
16227 ++{
16228 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16229 ++
16230 ++ bfqd->rq_in_driver++;
16231 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
16232 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
16233 ++ (long long unsigned)bfqd->last_position);
16234 ++}
16235 ++
16236 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
16237 ++{
16238 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16239 ++
16240 ++ WARN_ON(bfqd->rq_in_driver == 0);
16241 ++ bfqd->rq_in_driver--;
16242 ++}
16243 ++
16244 ++static void bfq_remove_request(struct request *rq)
16245 ++{
16246 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
16247 ++ struct bfq_data *bfqd = bfqq->bfqd;
16248 ++
16249 ++ if (bfqq->next_rq == rq) {
16250 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
16251 ++ bfq_updated_next_req(bfqd, bfqq);
16252 ++ }
16253 ++
16254 ++ list_del_init(&rq->queuelist);
16255 ++ bfq_del_rq_rb(rq);
16256 ++
16257 ++ if (rq->cmd_flags & REQ_META) {
16258 ++ WARN_ON(bfqq->meta_pending == 0);
16259 ++ bfqq->meta_pending--;
16260 ++ }
16261 ++}
16262 ++
16263 ++static int bfq_merge(struct request_queue *q, struct request **req,
16264 ++ struct bio *bio)
16265 ++{
16266 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16267 ++ struct request *__rq;
16268 ++
16269 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
16270 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
16271 ++ *req = __rq;
16272 ++ return ELEVATOR_FRONT_MERGE;
16273 ++ }
16274 ++
16275 ++ return ELEVATOR_NO_MERGE;
16276 ++}
16277 ++
16278 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
16279 ++ int type)
16280 ++{
16281 ++ if (type == ELEVATOR_FRONT_MERGE) {
16282 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
16283 ++
16284 ++ bfq_reposition_rq_rb(bfqq, req);
16285 ++ }
16286 ++}
16287 ++
16288 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
16289 ++ struct request *next)
16290 ++{
16291 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
16292 ++
16293 ++ /*
16294 ++ * Reposition in fifo if next is older than rq.
16295 ++ */
16296 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
16297 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
16298 ++ list_move(&rq->queuelist, &next->queuelist);
16299 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
16300 ++ }
16301 ++
16302 ++ if (bfqq->next_rq == next)
16303 ++ bfqq->next_rq = rq;
16304 ++
16305 ++ bfq_remove_request(next);
16306 ++}
16307 ++
16308 ++/* Must be called with bfqq != NULL */
16309 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
16310 ++{
16311 ++ BUG_ON(bfqq == NULL);
16312 ++ if (bfq_bfqq_busy(bfqq))
16313 ++ bfqq->bfqd->raised_busy_queues--;
16314 ++ bfqq->raising_coeff = 1;
16315 ++ bfqq->raising_cur_max_time = 0;
16316 ++ /* Trigger a weight change on the next activation of the queue */
16317 ++ bfqq->entity.ioprio_changed = 1;
16318 ++}
16319 ++
16320 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
16321 ++ struct bfq_group *bfqg)
16322 ++{
16323 ++ int i, j;
16324 ++
16325 ++ for (i = 0; i < 2; i++)
16326 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
16327 ++ if (bfqg->async_bfqq[i][j] != NULL)
16328 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
16329 ++ if (bfqg->async_idle_bfqq != NULL)
16330 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
16331 ++}
16332 ++
16333 ++static void bfq_end_raising(struct bfq_data *bfqd)
16334 ++{
16335 ++ struct bfq_queue *bfqq;
16336 ++
16337 ++ spin_lock_irq(bfqd->queue->queue_lock);
16338 ++
16339 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
16340 ++ bfq_bfqq_end_raising(bfqq);
16341 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
16342 ++ bfq_bfqq_end_raising(bfqq);
16343 ++ bfq_end_raising_async(bfqd);
16344 ++
16345 ++ spin_unlock_irq(bfqd->queue->queue_lock);
16346 ++}
16347 ++
16348 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
16349 ++ struct bio *bio)
16350 ++{
16351 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16352 ++ struct bfq_io_cq *bic;
16353 ++ struct bfq_queue *bfqq;
16354 ++
16355 ++ /*
16356 ++ * Disallow merge of a sync bio into an async request.
16357 ++ */
16358 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
16359 ++ return 0;
16360 ++
16361 ++ /*
16362 ++ * Lookup the bfqq that this bio will be queued with. Allow
16363 ++ * merge only if rq is queued there.
16364 ++ * Queue lock is held here.
16365 ++ */
16366 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
16367 ++ if (bic == NULL)
16368 ++ return 0;
16369 ++
16370 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
16371 ++ return bfqq == RQ_BFQQ(rq);
16372 ++}
16373 ++
16374 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
16375 ++ struct bfq_queue *bfqq)
16376 ++{
16377 ++ if (bfqq != NULL) {
16378 ++ bfq_mark_bfqq_must_alloc(bfqq);
16379 ++ bfq_mark_bfqq_budget_new(bfqq);
16380 ++ bfq_clear_bfqq_fifo_expire(bfqq);
16381 ++
16382 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
16383 ++
16384 ++ bfq_log_bfqq(bfqd, bfqq,
16385 ++ "set_in_service_queue, cur-budget = %lu",
16386 ++ bfqq->entity.budget);
16387 ++ }
16388 ++
16389 ++ bfqd->in_service_queue = bfqq;
16390 ++}
16391 ++
16392 ++/*
16393 ++ * Get and set a new queue for service.
16394 ++ */
16395 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
16396 ++ struct bfq_queue *bfqq)
16397 ++{
16398 ++ if (!bfqq)
16399 ++ bfqq = bfq_get_next_queue(bfqd);
16400 ++ else
16401 ++ bfq_get_next_queue_forced(bfqd, bfqq);
16402 ++
16403 ++ __bfq_set_in_service_queue(bfqd, bfqq);
16404 ++ return bfqq;
16405 ++}
16406 ++
16407 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
16408 ++ struct request *rq)
16409 ++{
16410 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
16411 ++ return blk_rq_pos(rq) - bfqd->last_position;
16412 ++ else
16413 ++ return bfqd->last_position - blk_rq_pos(rq);
16414 ++}
16415 ++
16416 ++/*
16417 ++ * Return true if bfqq has no request pending and rq is close enough to
16418 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
16419 ++ * bfqq->next_rq
16420 ++ */
16421 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
16422 ++{
16423 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
16424 ++}
16425 ++
16426 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
16427 ++{
16428 ++ struct rb_root *root = &bfqd->rq_pos_tree;
16429 ++ struct rb_node *parent, *node;
16430 ++ struct bfq_queue *__bfqq;
16431 ++ sector_t sector = bfqd->last_position;
16432 ++
16433 ++ if (RB_EMPTY_ROOT(root))
16434 ++ return NULL;
16435 ++
16436 ++ /*
16437 ++ * First, if we find a request starting at the end of the last
16438 ++ * request, choose it.
16439 ++ */
16440 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
16441 ++ if (__bfqq != NULL)
16442 ++ return __bfqq;
16443 ++
16444 ++ /*
16445 ++ * If the exact sector wasn't found, the parent of the NULL leaf
16446 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
16447 ++ * position).
16448 ++ */
16449 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
16450 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
16451 ++ return __bfqq;
16452 ++
16453 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
16454 ++ node = rb_next(&__bfqq->pos_node);
16455 ++ else
16456 ++ node = rb_prev(&__bfqq->pos_node);
16457 ++ if (node == NULL)
16458 ++ return NULL;
16459 ++
16460 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
16461 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
16462 ++ return __bfqq;
16463 ++
16464 ++ return NULL;
16465 ++}
16466 ++
16467 ++/*
16468 ++ * bfqd - obvious
16469 ++ * cur_bfqq - passed in so that we don't decide that the current queue
16470 ++ * is closely cooperating with itself.
16471 ++ *
16472 ++ * We are assuming that cur_bfqq has dispatched at least one request,
16473 ++ * and that bfqd->last_position reflects a position on the disk associated
16474 ++ * with the I/O issued by cur_bfqq.
16475 ++ */
16476 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
16477 ++ struct bfq_queue *cur_bfqq)
16478 ++{
16479 ++ struct bfq_queue *bfqq;
16480 ++
16481 ++ if (bfq_class_idle(cur_bfqq))
16482 ++ return NULL;
16483 ++ if (!bfq_bfqq_sync(cur_bfqq))
16484 ++ return NULL;
16485 ++ if (BFQQ_SEEKY(cur_bfqq))
16486 ++ return NULL;
16487 ++
16488 ++ /* If device has only one backlogged bfq_queue, don't search. */
16489 ++ if (bfqd->busy_queues == 1)
16490 ++ return NULL;
16491 ++
16492 ++ /*
16493 ++ * We should notice if some of the queues are cooperating, e.g.
16494 ++ * working closely on the same area of the disk. In that case,
16495 ++ * we can group them together and not waste time idling.
16496 ++ */
16497 ++ bfqq = bfqq_close(bfqd);
16498 ++ if (bfqq == NULL || bfqq == cur_bfqq)
16499 ++ return NULL;
16500 ++
16501 ++ /*
16502 ++ * Do not merge queues from different bfq_groups.
16503 ++ */
16504 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
16505 ++ return NULL;
16506 ++
16507 ++ /*
16508 ++ * It only makes sense to merge sync queues.
16509 ++ */
16510 ++ if (!bfq_bfqq_sync(bfqq))
16511 ++ return NULL;
16512 ++ if (BFQQ_SEEKY(bfqq))
16513 ++ return NULL;
16514 ++
16515 ++ /*
16516 ++ * Do not merge queues of different priority classes.
16517 ++ */
16518 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
16519 ++ return NULL;
16520 ++
16521 ++ return bfqq;
16522 ++}
16523 ++
16524 ++/*
16525 ++ * If enough samples have been computed, return the current max budget
16526 ++ * stored in bfqd, which is dynamically updated according to the
16527 ++ * estimated disk peak rate; otherwise return the default max budget
16528 ++ */
16529 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
16530 ++{
16531 ++ if (bfqd->budgets_assigned < 194)
16532 ++ return bfq_default_max_budget;
16533 ++ else
16534 ++ return bfqd->bfq_max_budget;
16535 ++}
16536 ++
16537 ++/*
16538 ++ * Return min budget, which is a fraction of the current or default
16539 ++ * max budget (trying with 1/32)
16540 ++ */
16541 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
16542 ++{
16543 ++ if (bfqd->budgets_assigned < 194)
16544 ++ return bfq_default_max_budget / 32;
16545 ++ else
16546 ++ return bfqd->bfq_max_budget / 32;
16547 ++}
16548 ++
16549 ++/*
16550 ++ * Decides whether idling should be done for the given device and
16551 ++ * in-service queue.
16552 ++ */
16553 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
16554 ++ struct bfq_queue *in_service_bfqq)
16555 ++{
16556 ++ if (in_service_bfqq == NULL)
16557 ++ return false;
16558 ++ /*
16559 ++ * If the device is non-rotational, and hence has no seek penalty,
16560 ++ * disable idling; but do so only if:
16561 ++ * - device does not support queuing, otherwise we still have
16562 ++ * a problem with sync vs async workloads;
16563 ++ * - the queue is not weight-raised, to preserve guarantees.
16564 ++ */
16565 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
16566 ++ in_service_bfqq->raising_coeff == 1);
16567 ++}
16568 ++
16569 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
16570 ++{
16571 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
16572 ++ struct bfq_io_cq *bic;
16573 ++ unsigned long sl;
16574 ++
16575 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
16576 ++
16577 ++ /* Tasks have exited, don't wait. */
16578 ++ bic = bfqd->in_service_bic;
16579 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
16580 ++ return;
16581 ++
16582 ++ bfq_mark_bfqq_wait_request(bfqq);
16583 ++
16584 ++ /*
16585 ++ * We don't want to idle for seeks, but we do want to allow
16586 ++ * fair distribution of slice time for a process doing back-to-back
16587 ++ * seeks. So allow a little bit of time for it to submit a new rq.
16588 ++ *
16589 ++ * To prevent processes with (partly) seeky workloads from
16590 ++ * being too ill-treated, grant them a small fraction of the
16591 ++ * assigned budget before reducing the waiting time to
16592 ++ * BFQ_MIN_TT. This happened to help reduce latency.
16593 ++ */
16594 ++ sl = bfqd->bfq_slice_idle;
16595 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
16596 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
16597 ++ bfqq->raising_coeff == 1)
16598 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
16599 ++ else if (bfqq->raising_coeff > 1)
16600 ++ sl = sl * 3;
16601 ++ bfqd->last_idling_start = ktime_get();
16602 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
16603 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
16604 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
16605 ++}
16606 ++
16607 ++/*
16608 ++ * Set the maximum time for the in-service queue to consume its
16609 ++ * budget. This prevents seeky processes from lowering the disk
16610 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
16611 ++ */
16612 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
16613 ++{
16614 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
16615 ++ unsigned int timeout_coeff;
16616 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
16617 ++ timeout_coeff = 1;
16618 ++ else
16619 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
16620 ++
16621 ++ bfqd->last_budget_start = ktime_get();
16622 ++
16623 ++ bfq_clear_bfqq_budget_new(bfqq);
16624 ++ bfqq->budget_timeout = jiffies +
16625 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
16626 ++
16627 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
16628 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
16629 ++ timeout_coeff));
16630 ++}
16631 ++
16632 ++/*
16633 ++ * Move request from internal lists to the request queue dispatch list.
16634 ++ */
16635 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
16636 ++{
16637 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16638 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
16639 ++
16640 ++ bfq_remove_request(rq);
16641 ++ bfqq->dispatched++;
16642 ++ elv_dispatch_sort(q, rq);
16643 ++
16644 ++ if (bfq_bfqq_sync(bfqq))
16645 ++ bfqd->sync_flight++;
16646 ++}
16647 ++
16648 ++/*
16649 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
16650 ++ */
16651 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
16652 ++{
16653 ++ struct request *rq = NULL;
16654 ++
16655 ++ if (bfq_bfqq_fifo_expire(bfqq))
16656 ++ return NULL;
16657 ++
16658 ++ bfq_mark_bfqq_fifo_expire(bfqq);
16659 ++
16660 ++ if (list_empty(&bfqq->fifo))
16661 ++ return NULL;
16662 ++
16663 ++ rq = rq_entry_fifo(bfqq->fifo.next);
16664 ++
16665 ++ if (time_before(jiffies, rq_fifo_time(rq)))
16666 ++ return NULL;
16667 ++
16668 ++ return rq;
16669 ++}
16670 ++
16671 ++/*
16672 ++ * Must be called with the queue_lock held.
16673 ++ */
16674 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
16675 ++{
16676 ++ int process_refs, io_refs;
16677 ++
16678 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
16679 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
16680 ++ BUG_ON(process_refs < 0);
16681 ++ return process_refs;
16682 ++}
16683 ++
16684 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
16685 ++{
16686 ++ int process_refs, new_process_refs;
16687 ++ struct bfq_queue *__bfqq;
16688 ++
16689 ++ /*
16690 ++ * If there are no process references on the new_bfqq, then it is
16691 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
16692 ++ * may have dropped their last reference (not just their last process
16693 ++ * reference).
16694 ++ */
16695 ++ if (!bfqq_process_refs(new_bfqq))
16696 ++ return;
16697 ++
16698 ++ /* Avoid a circular list and skip interim queue merges. */
16699 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
16700 ++ if (__bfqq == bfqq)
16701 ++ return;
16702 ++ new_bfqq = __bfqq;
16703 ++ }
16704 ++
16705 ++ process_refs = bfqq_process_refs(bfqq);
16706 ++ new_process_refs = bfqq_process_refs(new_bfqq);
16707 ++ /*
16708 ++ * If the process for the bfqq has gone away, there is no
16709 ++ * sense in merging the queues.
16710 ++ */
16711 ++ if (process_refs == 0 || new_process_refs == 0)
16712 ++ return;
16713 ++
16714 ++ /*
16715 ++ * Merge in the direction of the lesser amount of work.
16716 ++ */
16717 ++ if (new_process_refs >= process_refs) {
16718 ++ bfqq->new_bfqq = new_bfqq;
16719 ++ atomic_add(process_refs, &new_bfqq->ref);
16720 ++ } else {
16721 ++ new_bfqq->new_bfqq = bfqq;
16722 ++ atomic_add(new_process_refs, &bfqq->ref);
16723 ++ }
16724 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
16725 ++ new_bfqq->pid);
16726 ++}
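++
++/*
++ * For instance (hypothetical reference counts): if bfqq has one
++ * process reference and new_bfqq has three, the code above redirects
++ * bfqq (bfqq->new_bfqq = new_bfqq) and adds bfqq's single process
++ * reference to new_bfqq's refcount, so the process owning bfqq is
++ * later merged into the queue that already represents the larger
++ * amount of work.
++ */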
16727 ++
16728 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
16729 ++{
16730 ++ struct bfq_entity *entity = &bfqq->entity;
16731 ++ return entity->budget - entity->service;
16732 ++}
16733 ++
16734 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
16735 ++{
16736 ++ BUG_ON(bfqq != bfqd->in_service_queue);
16737 ++
16738 ++ __bfq_bfqd_reset_in_service(bfqd);
16739 ++
16740 ++ /*
16741 ++ * If this bfqq is shared between multiple processes, check
16742 ++ * to make sure that those processes are still issuing I/Os
16743 ++ * within the mean seek distance. If not, it may be time to
16744 ++ * break the queues apart again.
16745 ++ */
16746 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
16747 ++ bfq_mark_bfqq_split_coop(bfqq);
16748 ++
16749 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
16750 ++ /*
16751 ++ * Overload the budget_timeout field to store the time
16752 ++ * at which the queue was left with no backlog; it is
16753 ++ * used by the weight-raising mechanism.
16754 ++ */
16755 ++ bfqq->budget_timeout = jiffies;
16756 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
16757 ++ } else {
16758 ++ bfq_activate_bfqq(bfqd, bfqq);
16759 ++ /*
16760 ++ * Resort priority tree of potential close cooperators.
16761 ++ */
16762 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
16763 ++ }
16764 ++}
16765 ++
16766 ++/**
16767 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
16768 ++ * @bfqd: device data.
16769 ++ * @bfqq: queue to update.
16770 ++ * @reason: reason for expiration.
16771 ++ *
16772 ++ * Handle the feedback on @bfqq budget. See the body for detailed
16773 ++ * comments.
16774 ++ */
16775 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
16776 ++ struct bfq_queue *bfqq,
16777 ++ enum bfqq_expiration reason)
16778 ++{
16779 ++ struct request *next_rq;
16780 ++ unsigned long budget, min_budget;
16781 ++
16782 ++ budget = bfqq->max_budget;
16783 ++ min_budget = bfq_min_budget(bfqd);
16784 ++
16785 ++ BUG_ON(bfqq != bfqd->in_service_queue);
16786 ++
16787 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
16788 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
16789 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
16790 ++ budget, bfq_min_budget(bfqd));
16791 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
16792 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
16793 ++
16794 ++ if (bfq_bfqq_sync(bfqq)) {
16795 ++ switch (reason) {
16796 ++ /*
16797 ++ * Caveat: in all the following cases we trade latency
16798 ++ * for throughput.
16799 ++ */
16800 ++ case BFQ_BFQQ_TOO_IDLE:
16801 ++ /*
16802 ++ * This is the only case where we may reduce
16803 ++ * the budget: if there are no requests of the
16804 ++ * process still waiting for completion, then
16805 ++ * we assume (tentatively) that the timer has
16806 ++ * expired because the batch of requests of
16807 ++ * the process could have been served with a
16808 ++ * smaller budget. Hence, betting that
16809 ++ * process will behave in the same way when it
16810 ++ * becomes backlogged again, we reduce its
16811 ++ * next budget. As long as we guess right,
16812 ++ * this budget cut reduces the latency
16813 ++ * experienced by the process.
16814 ++ *
16815 ++ * However, if there are still outstanding
16816 ++ * requests, then the process may have not yet
16817 ++ * issued its next request just because it is
16818 ++ * still waiting for the completion of some of
16819 ++ * the still outstanding ones. So in this
16820 ++ * subcase we do not reduce its budget, on the
16821 ++ * contrary we increase it to possibly boost
16822 ++ * the throughput, as discussed in the
16823 ++ * comments to the BUDGET_TIMEOUT case.
16824 ++ */
16825 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
16826 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
16827 ++ else {
16828 ++ if (budget > 5 * min_budget)
16829 ++ budget -= 4 * min_budget;
16830 ++ else
16831 ++ budget = min_budget;
16832 ++ }
16833 ++ break;
16834 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
16835 ++ /*
16836 ++ * We double the budget here because: 1) it
16837 ++ * gives the chance to boost the throughput if
16838 ++ * this is not a seeky process (which may have
16839 ++ * bumped into this timeout because of, e.g.,
16840 ++ * ZBR), 2) together with charge_full_budget
16841 ++ * it helps give seeky processes higher
16842 ++ * timestamps, and hence be served less
16843 ++ * frequently.
16844 ++ */
16845 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
16846 ++ break;
16847 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
16848 ++ /*
16849 ++ * The process still has backlog, and did not
16850 ++ * let either the budget timeout or the disk
16851 ++ * idling timeout expire. Hence it is not
16852 ++ * seeky, has a short thinktime and may be
16853 ++ * happy with a higher budget too. So
16854 ++ * definitely increase the budget of this good
16855 ++ * candidate to boost the disk throughput.
16856 ++ */
16857 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
16858 ++ break;
16859 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
16860 ++ /*
16861 ++ * Leave the budget unchanged.
16862 ++ */
16863 ++ default:
16864 ++ return;
16865 ++ }
16866 ++ } else /* async queue */
16867 ++ /* async queues always get the maximum possible budget
16868 ++ * (their ability to dispatch is limited by
16869 ++ * @bfqd->bfq_max_budget_async_rq).
16870 ++ */
16871 ++ budget = bfqd->bfq_max_budget;
16872 ++
16873 ++ bfqq->max_budget = budget;
16874 ++
16875 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
16876 ++ bfqq->max_budget > bfqd->bfq_max_budget)
16877 ++ bfqq->max_budget = bfqd->bfq_max_budget;
16878 ++
16879 ++ /*
16880 ++ * Make sure that we have enough budget for the next request.
16881 ++ * Since the finish time of the bfqq must be kept in sync with
16882 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
16883 ++ * update.
16884 ++ */
16885 ++ next_rq = bfqq->next_rq;
16886 ++ if (next_rq != NULL)
16887 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
16888 ++ bfq_serv_to_charge(next_rq, bfqq));
16889 ++ else
16890 ++ bfqq->entity.budget = bfqq->max_budget;
16891 ++
16892 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
16893 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
16894 ++ bfqq->entity.budget);
16895 ++}
16896 ++
16897 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
16898 ++{
16899 ++ unsigned long max_budget;
16900 ++
16901 ++ /*
16902 ++ * The max_budget calculated when autotuning is equal to the
16903 ++ * number of sectors transferred in timeout_sync at the
16904 ++ * estimated peak rate.
16905 ++ */
16906 ++ max_budget = (unsigned long)(peak_rate * 1000 *
16907 ++ timeout >> BFQ_RATE_SHIFT);
16908 ++
16909 ++ return max_budget;
16910 ++}
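++
++/*
++ * Illustrative numbers (assumed, not measured): a peak rate of about
++ * 0.2 sectors/usec (~100 MB/s) is stored as 0.2 << BFQ_RATE_SHIFT,
++ * roughly 13107; with the default sync timeout of HZ / 8 jiffies,
++ * i.e. about 125 ms, the formula above yields
++ * 13107 * 1000 * 125 >> 16, about 25000 sectors (~12 MiB), which is
++ * indeed the amount of data such a disk transfers in 125 ms.
++ */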
16911 ++
16912 ++/*
16913 ++ * In addition to updating the peak rate, checks whether the process
16914 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
16915 ++ * to the budget timeout, to reduce the amount of service provided to
16916 ++ * seeky processes, and hence reduce their chances to lower the
16917 ++ * throughput. See the code for more details.
16918 ++ */
16919 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
16920 ++ int compensate, enum bfqq_expiration reason)
16921 ++{
16922 ++ u64 bw, usecs, expected, timeout;
16923 ++ ktime_t delta;
16924 ++ int update = 0;
16925 ++
16926 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
16927 ++ return 0;
16928 ++
16929 ++ if (compensate)
16930 ++ delta = bfqd->last_idling_start;
16931 ++ else
16932 ++ delta = ktime_get();
16933 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
16934 ++ usecs = ktime_to_us(delta);
16935 ++
16936 ++ /* Don't trust short/unrealistic values. */
16937 ++ if (usecs < 100 || usecs >= LONG_MAX)
16938 ++ return 0;
16939 ++
16940 ++ /*
16941 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
16942 ++ * value to store the peak rate, in sectors per usec in fixed
16943 ++ * point math. We do so to have enough precision in the estimate
16944 ++ * and to avoid overflows.
16945 ++ */
16946 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
16947 ++ do_div(bw, (unsigned long)usecs);
16948 ++
16949 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
16950 ++
16951 ++ /*
16952 ++ * Use only long (> 20ms) intervals to filter out spikes for
16953 ++ * the peak rate estimation.
16954 ++ */
16955 ++ if (usecs > 20000) {
16956 ++ if (bw > bfqd->peak_rate ||
16957 ++ (!BFQQ_SEEKY(bfqq) &&
16958 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
16959 ++ bfq_log(bfqd, "measured bw =%llu", bw);
16960 ++ /*
16961 ++ * To smooth oscillations use a low-pass filter with
16962 ++ * alpha=7/8, i.e.,
16963 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
16964 ++ */
16965 ++ do_div(bw, 8);
16966 ++ if (bw == 0)
16967 ++ return 0;
16968 ++ bfqd->peak_rate *= 7;
16969 ++ do_div(bfqd->peak_rate, 8);
16970 ++ bfqd->peak_rate += bw;
16971 ++ update = 1;
16972 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
16973 ++ }
16974 ++
16975 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
16976 ++
16977 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
16978 ++ bfqd->peak_rate_samples++;
16979 ++
16980 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
16981 ++ update && bfqd->bfq_user_max_budget == 0) {
16982 ++ bfqd->bfq_max_budget =
16983 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
16984 ++ bfq_log(bfqd, "new max_budget=%lu",
16985 ++ bfqd->bfq_max_budget);
16986 ++ }
16987 ++ }
16988 ++
16989 ++ /*
16990 ++ * If the process has been served for too short a time
16991 ++ * interval to let its possible sequential accesses prevail over
16992 ++ * the initial seek time needed to move the disk head to the
16993 ++ * first sector it requested, then give the process a chance
16994 ++ * and for the moment return false.
16995 ++ */
16996 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
16997 ++ return 0;
16998 ++
16999 ++ /*
17000 ++ * A process is considered ``slow'' (i.e., seeky, so that we
17001 ++ * cannot treat it fairly in the service domain, as it would
17002 ++ * slow down the other processes too much) if, when a slice
17003 ++ * ends for whatever reason, it has received service at a
17004 ++ * rate that would not be high enough to complete the budget
17005 ++ * before the budget timeout expiration.
17006 ++ */
17007 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
17008 ++
17009 ++ /*
17010 ++ * Caveat: processes doing IO in the slower disk zones will
17011 ++ * tend to be slow(er) even if not seeky. And the estimated
17012 ++ * peak rate will actually be an average over the disk
17013 ++ * surface. Hence, to not be too harsh with unlucky processes,
17014 ++ * we keep a budget/3 margin of safety before declaring a
17015 ++ * process slow.
17016 ++ */
17017 ++ return expected > (4 * bfqq->entity.budget) / 3;
17018 ++}
17019 ++
17020 ++/*
17021 ++ * To be deemed as soft real-time, an application must meet two requirements.
17022 ++ * The first is that the application must not require an average bandwidth
17023 ++ * higher than the approximate bandwidth required to play back or record a
17024 ++ * compressed high-definition video.
17025 ++ * The next function is invoked on the completion of the last request of a
17026 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
17027 ++ * that, if the next request of the application does not arrive before
17028 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
17029 ++ *
17030 ++ * The second requirement is that the request pattern of the application is
17031 ++ * isochronous, i.e., that, after issuing a request or a batch of requests, the
17032 ++ * application stops for a while, then issues a new batch, and so on. For this
17033 ++ * reason the next function is invoked to compute soft_rt_next_start only for
17034 ++ * applications that meet this requirement, whereas soft_rt_next_start is set
17035 ++ * to infinity for applications that do not.
17036 ++ *
17037 ++ * Unfortunately, even a greedy application may happen to behave in an
17038 ++ * isochronous way if several processes are competing for the CPUs. In fact,
17039 ++ * in this scenario the application stops issuing requests while the CPUs are
17040 ++ * busy serving other processes, then restarts, then stops again for a while,
17041 ++ * and so on. In addition, if the disk achieves a low enough throughput with
17042 ++ * the request pattern issued by the application (e.g., because the request
17043 ++ * pattern is random and/or the device is slow), then the above bandwidth
17044 ++ * requirement may happen to be met too. To prevent such a greedy application
17045 ++ * from being deemed soft real-time, a further rule is used in the computation
17046 ++ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
17047 ++ * time plus the maximum time for which the arrival of a request is waited
17048 ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
17049 ++ * filters out greedy applications, as the latter issue instead their next
17050 ++ * request as soon as possible after the last one has been completed (in
17051 ++ * contrast, when a batch of requests is completed, a soft real-time
17052 ++ * application spends some time processing data).
17053 ++ *
17054 ++ * Actually, the last filter may easily generate false positives if only
17055 ++ * bfqd->bfq_slice_idle is used as a reference time interval and one or
17056 ++ * both of the following two cases occur:
17057 ++ * 1) HZ is so low that the duration of a jiffie is comparable to or higher
17058 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
17059 ++ * HZ=100.
17060 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
17061 ++ * for a while, then suddenly 'jump' by several units to recover the lost
17062 ++ * increments. This seems to happen, e.g., inside virtual machines.
17063 ++ * To address this issue, we do not use as a reference time interval just
17064 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
17065 ++ * particular we add the minimum number of jiffies for which the filter seems
17066 ++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
17067 ++ */
17068 ++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
17069 ++ struct bfq_queue *bfqq)
17070 ++{
17071 ++ return max(bfqq->last_idle_bklogged +
17072 ++ HZ * bfqq->service_from_backlogged /
17073 ++ bfqd->bfq_raising_max_softrt_rate,
17074 ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
17075 ++}
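++
++/*
++ * A rough worked example of the rule above (purely illustrative values,
++ * not taken from this patch): with HZ=250, bfq_raising_max_softrt_rate set
++ * to 7000 sectors/sec and 14000 sectors of service received since the
++ * queue last became idle and backlogged, the first term evaluates to
++ *
++ *   last_idle_bklogged + 250 * 14000 / 7000 = last_idle_bklogged + 500
++ *
++ * jiffies, i.e., two seconds after the start of the batch; the max() with
++ * jiffies + bfq_slice_idle + 4 then enforces the greedy-application filter
++ * described in the comment above.
++ */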
17076 ++
17077 ++/*
17078 ++ * Largest-possible time instant such that, for as long as possible, the
17079 ++ * current time will be lower than this time instant according to the macro
17080 ++ * time_is_before_jiffies().
17081 ++ */
17082 ++static inline unsigned long bfq_infinity_from_now(unsigned long now)
17083 ++{
17084 ++ return now + ULONG_MAX / 2;
17085 ++}
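++
++/*
++ * Note: time_is_before_jiffies() relies on wrap-safe signed comparisons,
++ * which order two instants correctly only while they are less than
++ * ULONG_MAX/2 ticks apart. Hence now + ULONG_MAX/2 is, roughly, the
++ * farthest point in the future that the macro can still report as not
++ * yet reached.
++ */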
17086 ++
17087 ++/**
17088 ++ * bfq_bfqq_expire - expire a queue.
17089 ++ * @bfqd: device owning the queue.
17090 ++ * @bfqq: the queue to expire.
17091 ++ * @compensate: if true, compensate for the time spent idling.
17092 ++ * @reason: the reason causing the expiration.
17093 ++ *
17094 ++ *
17095 ++ * If the process associated with the queue is slow (i.e., seeky), or in
17096 ++ * case of budget timeout, or, finally, if it is async, we
17097 ++ * artificially charge it an entire budget (independently of the
17098 ++ * actual service it received). As a consequence, the queue will get
17099 ++ * higher timestamps than the correct ones upon reactivation, and
17100 ++ * hence it will be rescheduled as if it had received more service
17101 ++ * than what it actually received. In the end, this class of processes
17102 ++ * will receive less service in proportion to how slowly they consume
17103 ++ * their budgets (and hence how seriously they tend to lower the
17104 ++ * throughput).
17105 ++ *
17106 ++ * In contrast, when a queue expires because it has been idling for
17107 ++ * too long or because it exhausted its budget, we do not touch the
17108 ++ * amount of service it has received. Hence, when the queue is
17109 ++ * reactivated and its timestamps updated, the latter will be in sync
17110 ++ * with the actual service received by the queue until expiration.
17111 ++ *
17112 ++ * Charging a full budget to the first type of queues and the exact
17113 ++ * service to the others has the effect of using the WF2Q+ policy to
17114 ++ * schedule the former on a timeslice basis, without violating the
17115 ++ * service domain guarantees of the latter.
17116 ++ */
17117 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
17118 ++ struct bfq_queue *bfqq,
17119 ++ int compensate,
17120 ++ enum bfqq_expiration reason)
17121 ++{
17122 ++ int slow;
17123 ++ BUG_ON(bfqq != bfqd->in_service_queue);
17124 ++
17125 ++ /* Update disk peak rate for autotuning and check whether the
17126 ++ * process is slow (see bfq_update_peak_rate).
17127 ++ */
17128 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
17129 ++
17130 ++ /*
17131 ++	 * As explained above, 'punish' slow (i.e., seeky), timed-out
17132 ++ * and async queues, to favor sequential sync workloads.
17133 ++ *
17134 ++ * Processes doing IO in the slower disk zones will tend to be
17135 ++ * slow(er) even if not seeky. Hence, since the estimated peak
17136 ++ * rate is actually an average over the disk surface, these
17137 ++	 * processes may time out just through bad luck. To avoid punishing
17138 ++ * them we do not charge a full budget to a process that
17139 ++ * succeeded in consuming at least 2/3 of its budget.
17140 ++ */
17141 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
17142 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
17143 ++ bfq_bfqq_charge_full_budget(bfqq);
17144 ++
17145 ++ bfqq->service_from_backlogged += bfqq->entity.service;
17146 ++
17147 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
17148 ++ bfqq->last_rais_start_finish = jiffies;
17149 ++
17150 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
17151 ++ RB_EMPTY_ROOT(&bfqq->sort_list)) {
17152 ++ /*
17153 ++ * If we get here, then the request pattern is
17154 ++ * isochronous (see the comments to the function
17155 ++ * bfq_bfqq_softrt_next_start()). However, if the
17156 ++ * queue still has in-flight requests, then it is
17157 ++ * better to postpone the computation of next_start
17158 ++ * to the next request completion. In fact, if we
17159 ++ * computed it now, then the application might pass
17160 ++ * the greedy-application filter improperly, because
17161 ++ * the arrival of its next request may happen to be
17162 ++ * higher than (jiffies + bfqq->bfqd->bfq_slice_idle)
17163 ++ * not because the application is truly soft real-
17164 ++ * time, but just because the application is currently
17165 ++ * waiting for the completion of some request before
17166 ++ * issuing, as quickly as possible, its next request.
17167 ++ */
17168 ++ if (bfqq->dispatched > 0) {
17169 ++ /*
17170 ++ * The application is still waiting for the
17171 ++ * completion of one or more requests:
17172 ++ * prevent it from possibly being incorrectly
17173 ++ * deemed as soft real-time by setting its
17174 ++ * soft_rt_next_start to infinity. In fact,
17175 ++ * without this assignment, the application
17176 ++ * would be incorrectly deemed as soft
17177 ++ * real-time if:
17178 ++ * 1) it issued a new request before the
17179 ++ * completion of all its in-flight
17180 ++ * requests, and
17181 ++ * 2) at that time, its soft_rt_next_start
17182 ++ * happened to be in the past.
17183 ++ */
17184 ++ bfqq->soft_rt_next_start =
17185 ++ bfq_infinity_from_now(jiffies);
17186 ++ bfq_mark_bfqq_softrt_update(bfqq);
17187 ++ } else
17188 ++ bfqq->soft_rt_next_start =
17189 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
17190 ++ }
17191 ++
17192 ++ bfq_log_bfqq(bfqd, bfqq,
17193 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
17194 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
17195 ++
17196 ++ /* Increase, decrease or leave budget unchanged according to reason */
17197 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
17198 ++ __bfq_bfqq_expire(bfqd, bfqq);
17199 ++}
17200 ++
17201 ++/*
17202 ++ * Budget timeout is not implemented through a dedicated timer, but
17203 ++ * just checked on request arrivals and completions, as well as on
17204 ++ * idle timer expirations.
17205 ++ */
17206 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
17207 ++{
17208 ++ if (bfq_bfqq_budget_new(bfqq))
17209 ++ return 0;
17210 ++
17211 ++ if (time_before(jiffies, bfqq->budget_timeout))
17212 ++ return 0;
17213 ++
17214 ++ return 1;
17215 ++}
17216 ++
17217 ++/*
17218 ++ * If we expire a queue that is waiting for the arrival of a new
17219 ++ * request, we may prevent the fictitious timestamp backshifting that
17220 ++ * allows the guarantees of the queue to be preserved (see [1] for
17221 ++ * this tricky aspect). Hence we return true only if this condition
17222 ++ * does not hold, or if the queue is slow enough to deserve only to be
17223 ++ * kicked off to preserve a high throughput.
17224 ++ */
17225 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
17226 ++{
17227 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
17228 ++ "may_budget_timeout: wr %d left %d timeout %d",
17229 ++ bfq_bfqq_wait_request(bfqq),
17230 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
17231 ++ bfq_bfqq_budget_timeout(bfqq));
17232 ++
17233 ++ return (!bfq_bfqq_wait_request(bfqq) ||
17234 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
17235 ++ &&
17236 ++ bfq_bfqq_budget_timeout(bfqq);
17237 ++}
17238 ++
17239 ++/*
17240 ++ * For weight-raised queues issuing sync requests, idling is always performed,
17241 ++ * as this is instrumental in guaranteeing a high fraction of the throughput
17242 ++ * to these queues, and hence in guaranteeing a lower latency for their
17243 ++ * requests. See [1] for details.
17244 ++ *
17245 ++ * For non-weight-raised queues, idling is instead disabled if the device is
17246 ++ * NCQ-enabled and non-rotational, as this boosts the throughput on such
17247 ++ * devices.
17248 ++ */
17249 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
17250 ++{
17251 ++ struct bfq_data *bfqd = bfqq->bfqd;
17252 ++
17253 ++ return bfq_bfqq_sync(bfqq) && (
17254 ++ bfqq->raising_coeff > 1 ||
17255 ++ (bfq_bfqq_idle_window(bfqq) &&
17256 ++ !(bfqd->hw_tag &&
17257 ++ (blk_queue_nonrot(bfqd->queue) ||
17258 ++ /*
17259 ++ * If there are weight-raised busy queues, then do not idle
17260 ++ * the disk for a sync non-weight-raised queue, and hence
17261 ++ * expire the queue immediately if empty. Combined with the
17262 ++ * timestamping rules of BFQ (see [1] for details), this
17263 ++ * causes sync non-weight-raised queues to get a lower
17264 ++ * fraction of the disk throughput, and hence reduces the rate
17265 ++ * at which the processes associated to these queues ask for
17266 ++ * requests from the request pool.
17267 ++ *
17268 ++ * This is beneficial for weight-raised processes, when the
17269 ++ * system operates in request-pool saturation conditions
17270 ++ * (e.g., in the presence of write hogs). In fact, if
17271 ++ * non-weight-raised processes ask for requests at a lower
17272 ++ * rate, then weight-raised processes have a higher
17273 ++ * probability to get a request from the pool immediately
17274 ++ * (or at least soon) when they need one. Hence they have a
17275 ++ * higher probability to actually get a fraction of the disk
17276 ++ * throughput proportional to their high weight. This is
17277 ++ * especially true with NCQ-enabled drives, which enqueue
17278 ++ * several requests in advance and further reorder
17279 ++ * internally-queued requests.
17280 ++ *
17281 ++ * Mistreating non-weight-raised queues in the above-described
17282 ++ * way, when there are busy weight-raised queues, seems to
17283 ++ * mitigate starvation problems in the presence of heavy write
17284 ++ * workloads and NCQ, and hence to guarantee a higher
17285 ++ * application and system responsiveness in these hostile
17286 ++ * scenarios.
17287 ++ */
17288 ++ bfqd->raised_busy_queues > 0)
17289 ++ )
17290 ++ )
17291 ++ );
17292 ++}
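++
++/*
++ * In short, the return value above can be read (same logic as the code,
++ * just condensed) as:
++ *
++ *   sync && (weight-raised ||
++ *            (idle_window && !(NCQ && (non-rotational || raised busy))))
++ *
++ * i.e., idling is always kept for weight-raised sync queues, and for the
++ * other sync queues only when it is expected to pay off.
++ */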
17293 ++
17294 ++/*
17295 ++ * If the in-service queue is empty, but it is sync and either of the following
17296 ++ * conditions holds, then: 1) the queue must remain in service and cannot be
17297 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
17298 ++ * of a new request for the queue. The conditions are:
17299 ++ * - the device is rotational and not performing NCQ, and the queue has its
17300 ++ * idle window set (in this case, waiting for a new request for the queue
17301 ++ * is likely to boost the disk throughput);
17302 ++ * - the queue is weight-raised (waiting for the request is necessary to
17303 ++ * provide the queue with fairness and latency guarantees, see [1] for
17304 ++ * details).
17305 ++ */
17306 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
17307 ++{
17308 ++ struct bfq_data *bfqd = bfqq->bfqd;
17309 ++
17310 ++ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
17311 ++ bfq_bfqq_must_not_expire(bfqq) &&
17312 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
17313 ++}
17314 ++
17315 ++/*
17316 ++ * Select a queue for service. If we have a current queue in service,
17317 ++ * check whether to continue servicing it, or retrieve and set a new one.
17318 ++ */
17319 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
17320 ++{
17321 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
17322 ++ struct request *next_rq;
17323 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
17324 ++
17325 ++ bfqq = bfqd->in_service_queue;
17326 ++ if (bfqq == NULL)
17327 ++ goto new_queue;
17328 ++
17329 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
17330 ++
17331 ++ /*
17332 ++ * If another queue has a request waiting within our mean seek
17333 ++ * distance, let it run. The expire code will check for close
17334 ++ * cooperators and put the close queue at the front of the
17335 ++ * service tree. If possible, merge the expiring queue with the
17336 ++ * new bfqq.
17337 ++ */
17338 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
17339 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
17340 ++ bfq_setup_merge(bfqq, new_bfqq);
17341 ++
17342 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
17343 ++ !timer_pending(&bfqd->idle_slice_timer) &&
17344 ++ !bfq_bfqq_must_idle(bfqq))
17345 ++ goto expire;
17346 ++
17347 ++ next_rq = bfqq->next_rq;
17348 ++ /*
17349 ++ * If bfqq has requests queued and it has enough budget left to
17350 ++ * serve them, keep the queue, otherwise expire it.
17351 ++ */
17352 ++ if (next_rq != NULL) {
17353 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
17354 ++ bfq_bfqq_budget_left(bfqq)) {
17355 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
17356 ++ goto expire;
17357 ++ } else {
17358 ++ /*
17359 ++ * The idle timer may be pending because we may not
17360 ++ * disable disk idling even when a new request arrives
17361 ++ */
17362 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
17363 ++ /*
17364 ++ * If we get here: 1) at least a new request
17365 ++ * has arrived but we have not disabled the
17366 ++ * timer because the request was too small,
17367 ++ * 2) then the block layer has unplugged the
17368 ++ * device, causing the dispatch to be invoked.
17369 ++ *
17370 ++ * Since the device is unplugged, now the
17371 ++ * requests are probably large enough to
17372 ++ * provide a reasonable throughput.
17373 ++ * So we disable idling.
17374 ++ */
17375 ++ bfq_clear_bfqq_wait_request(bfqq);
17376 ++ del_timer(&bfqd->idle_slice_timer);
17377 ++ }
17378 ++ if (new_bfqq == NULL)
17379 ++ goto keep_queue;
17380 ++ else
17381 ++ goto expire;
17382 ++ }
17383 ++ }
17384 ++
17385 ++ /*
17386 ++ * No requests pending. If the in-service queue has no cooperator and
17387 ++ * still has requests in flight (possibly waiting for a completion)
17388 ++ * or is idling for a new request, then keep it.
17389 ++ */
17390 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
17391 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
17392 ++ bfqq = NULL;
17393 ++ goto keep_queue;
17394 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
17395 ++ /*
17396 ++ * Expiring the queue because there is a close cooperator,
17397 ++ * cancel timer.
17398 ++ */
17399 ++ bfq_clear_bfqq_wait_request(bfqq);
17400 ++ del_timer(&bfqd->idle_slice_timer);
17401 ++ }
17402 ++
17403 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
17404 ++expire:
17405 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
17406 ++new_queue:
17407 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
17408 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
17409 ++ bfqq != NULL ? bfqq->pid : 0);
17410 ++keep_queue:
17411 ++ return bfqq;
17412 ++}
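++
++/*
++ * Summarizing the flow above (sketch): the in-service queue is kept if it
++ * has enough budget for its next request or is legitimately idling; it is
++ * expired on budget timeout, budget exhaustion or lack of further
++ * requests; and, when a close cooperator has been found, the current
++ * queue is expired so that the two can be merged and served together.
++ */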
17413 ++
17414 ++static void bfq_update_raising_data(struct bfq_data *bfqd,
17415 ++ struct bfq_queue *bfqq)
17416 ++{
17417 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
17418 ++ struct bfq_entity *entity = &bfqq->entity;
17419 ++
17420 ++ bfq_log_bfqq(bfqd, bfqq,
17421 ++ "raising period dur %u/%u msec, "
17422 ++ "old raising coeff %u, w %d(%d)",
17423 ++ jiffies_to_msecs(jiffies -
17424 ++ bfqq->last_rais_start_finish),
17425 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
17426 ++ bfqq->raising_coeff,
17427 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
17428 ++
17429 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
17430 ++ entity->orig_weight * bfqq->raising_coeff);
17431 ++ if (entity->ioprio_changed)
17432 ++ bfq_log_bfqq(bfqd, bfqq,
17433 ++ "WARN: pending prio change");
17434 ++ /*
17435 ++ * If too much time has elapsed from the beginning
17436 ++ * of this weight-raising, stop it.
17437 ++ */
17438 ++ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
17439 ++ bfqq->raising_cur_max_time)) {
17440 ++ bfqq->last_rais_start_finish = jiffies;
17441 ++ bfq_log_bfqq(bfqd, bfqq,
17442 ++ "wrais ending at %lu, "
17443 ++ "rais_max_time %u",
17444 ++ bfqq->last_rais_start_finish,
17445 ++ jiffies_to_msecs(bfqq->
17446 ++ raising_cur_max_time));
17447 ++ bfq_bfqq_end_raising(bfqq);
17448 ++ __bfq_entity_update_weight_prio(
17449 ++ bfq_entity_service_tree(entity),
17450 ++ entity);
17451 ++ }
17452 ++ }
17453 ++}
17454 ++
17455 ++/*
17456 ++ * Dispatch one request from bfqq, moving it to the request queue
17457 ++ * dispatch list.
17458 ++ */
17459 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
17460 ++ struct bfq_queue *bfqq)
17461 ++{
17462 ++ int dispatched = 0;
17463 ++ struct request *rq;
17464 ++ unsigned long service_to_charge;
17465 ++
17466 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
17467 ++
17468 ++ /* Follow expired path, else get first next available. */
17469 ++ rq = bfq_check_fifo(bfqq);
17470 ++ if (rq == NULL)
17471 ++ rq = bfqq->next_rq;
17472 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
17473 ++
17474 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
17475 ++ /*
17476 ++ * This may happen if the next rq is chosen
17477 ++ * in fifo order instead of sector order.
17478 ++ * The budget is properly dimensioned
17479 ++ * to be always sufficient to serve the next request
17480 ++ * only if it is chosen in sector order. The reason is
17481 ++		 * that it would be quite inefficient and of little use
17482 ++ * to always make sure that the budget is large enough
17483 ++ * to serve even the possible next rq in fifo order.
17484 ++ * In fact, requests are seldom served in fifo order.
17485 ++ *
17486 ++ * Expire the queue for budget exhaustion, and
17487 ++ * make sure that the next act_budget is enough
17488 ++ * to serve the next request, even if it comes
17489 ++ * from the fifo expired path.
17490 ++ */
17491 ++ bfqq->next_rq = rq;
17492 ++ /*
17493 ++		 * Since this dispatch failed, make sure that
17494 ++ * a new one will be performed
17495 ++ */
17496 ++ if (!bfqd->rq_in_driver)
17497 ++ bfq_schedule_dispatch(bfqd);
17498 ++ goto expire;
17499 ++ }
17500 ++
17501 ++ /* Finally, insert request into driver dispatch list. */
17502 ++ bfq_bfqq_served(bfqq, service_to_charge);
17503 ++ bfq_dispatch_insert(bfqd->queue, rq);
17504 ++
17505 ++ bfq_update_raising_data(bfqd, bfqq);
17506 ++
17507 ++ bfq_log_bfqq(bfqd, bfqq,
17508 ++ "dispatched %u sec req (%llu), budg left %lu",
17509 ++ blk_rq_sectors(rq),
17510 ++ (long long unsigned)blk_rq_pos(rq),
17511 ++ bfq_bfqq_budget_left(bfqq));
17512 ++
17513 ++ dispatched++;
17514 ++
17515 ++ if (bfqd->in_service_bic == NULL) {
17516 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
17517 ++ bfqd->in_service_bic = RQ_BIC(rq);
17518 ++ }
17519 ++
17520 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
17521 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
17522 ++ bfq_class_idle(bfqq)))
17523 ++ goto expire;
17524 ++
17525 ++ return dispatched;
17526 ++
17527 ++expire:
17528 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
17529 ++ return dispatched;
17530 ++}
17531 ++
17532 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
17533 ++{
17534 ++ int dispatched = 0;
17535 ++
17536 ++ while (bfqq->next_rq != NULL) {
17537 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
17538 ++ dispatched++;
17539 ++ }
17540 ++
17541 ++ BUG_ON(!list_empty(&bfqq->fifo));
17542 ++ return dispatched;
17543 ++}
17544 ++
17545 ++/*
17546 ++ * Drain our current requests. Used for barriers and when switching
17547 ++ * io schedulers on-the-fly.
17548 ++ */
17549 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
17550 ++{
17551 ++ struct bfq_queue *bfqq, *n;
17552 ++ struct bfq_service_tree *st;
17553 ++ int dispatched = 0;
17554 ++
17555 ++ bfqq = bfqd->in_service_queue;
17556 ++ if (bfqq != NULL)
17557 ++ __bfq_bfqq_expire(bfqd, bfqq);
17558 ++
17559 ++ /*
17560 ++ * Loop through classes, and be careful to leave the scheduler
17561 ++ * in a consistent state, as feedback mechanisms and vtime
17562 ++ * updates cannot be disabled during the process.
17563 ++ */
17564 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
17565 ++ st = bfq_entity_service_tree(&bfqq->entity);
17566 ++
17567 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
17568 ++ bfqq->max_budget = bfq_max_budget(bfqd);
17569 ++
17570 ++ bfq_forget_idle(st);
17571 ++ }
17572 ++
17573 ++ BUG_ON(bfqd->busy_queues != 0);
17574 ++
17575 ++ return dispatched;
17576 ++}
17577 ++
17578 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
17579 ++{
17580 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
17581 ++ struct bfq_queue *bfqq;
17582 ++ int max_dispatch;
17583 ++
17584 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
17585 ++ if (bfqd->busy_queues == 0)
17586 ++ return 0;
17587 ++
17588 ++ if (unlikely(force))
17589 ++ return bfq_forced_dispatch(bfqd);
17590 ++
17591 ++ bfqq = bfq_select_queue(bfqd);
17592 ++ if (bfqq == NULL)
17593 ++ return 0;
17594 ++
17595 ++ max_dispatch = bfqd->bfq_quantum;
17596 ++ if (bfq_class_idle(bfqq))
17597 ++ max_dispatch = 1;
17598 ++
17599 ++ if (!bfq_bfqq_sync(bfqq))
17600 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
17601 ++
17602 ++ if (bfqq->dispatched >= max_dispatch) {
17603 ++ if (bfqd->busy_queues > 1)
17604 ++ return 0;
17605 ++ if (bfqq->dispatched >= 4 * max_dispatch)
17606 ++ return 0;
17607 ++ }
17608 ++
17609 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
17610 ++ return 0;
17611 ++
17612 ++ bfq_clear_bfqq_wait_request(bfqq);
17613 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
17614 ++
17615 ++ if (!bfq_dispatch_request(bfqd, bfqq))
17616 ++ return 0;
17617 ++
17618 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
17619 ++ bfqq->pid, max_dispatch);
17620 ++
17621 ++ return 1;
17622 ++}
17623 ++
17624 ++/*
17625 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
17626 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
17627 ++ *
17628 ++ * Queue lock must be held here.
17629 ++ */
17630 ++static void bfq_put_queue(struct bfq_queue *bfqq)
17631 ++{
17632 ++ struct bfq_data *bfqd = bfqq->bfqd;
17633 ++
17634 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
17635 ++
17636 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
17637 ++ atomic_read(&bfqq->ref));
17638 ++ if (!atomic_dec_and_test(&bfqq->ref))
17639 ++ return;
17640 ++
17641 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
17642 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
17643 ++ BUG_ON(bfqq->entity.tree != NULL);
17644 ++ BUG_ON(bfq_bfqq_busy(bfqq));
17645 ++ BUG_ON(bfqd->in_service_queue == bfqq);
17646 ++
17647 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
17648 ++
17649 ++ kmem_cache_free(bfq_pool, bfqq);
17650 ++}
17651 ++
17652 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
17653 ++{
17654 ++ struct bfq_queue *__bfqq, *next;
17655 ++
17656 ++ /*
17657 ++ * If this queue was scheduled to merge with another queue, be
17658 ++ * sure to drop the reference taken on that queue (and others in
17659 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
17660 ++ */
17661 ++ __bfqq = bfqq->new_bfqq;
17662 ++ while (__bfqq) {
17663 ++ if (__bfqq == bfqq) {
17664 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
17665 ++ break;
17666 ++ }
17667 ++ next = __bfqq->new_bfqq;
17668 ++ bfq_put_queue(__bfqq);
17669 ++ __bfqq = next;
17670 ++ }
17671 ++}
17672 ++
17673 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
17674 ++{
17675 ++ if (bfqq == bfqd->in_service_queue) {
17676 ++ __bfq_bfqq_expire(bfqd, bfqq);
17677 ++ bfq_schedule_dispatch(bfqd);
17678 ++ }
17679 ++
17680 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
17681 ++ atomic_read(&bfqq->ref));
17682 ++
17683 ++ bfq_put_cooperator(bfqq);
17684 ++
17685 ++ bfq_put_queue(bfqq);
17686 ++}
17687 ++
17688 ++static void bfq_init_icq(struct io_cq *icq)
17689 ++{
17690 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
17691 ++
17692 ++ bic->ttime.last_end_request = jiffies;
17693 ++}
17694 ++
17695 ++static void bfq_exit_icq(struct io_cq *icq)
17696 ++{
17697 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
17698 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
17699 ++
17700 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
17701 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
17702 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
17703 ++ }
17704 ++
17705 ++ if (bic->bfqq[BLK_RW_SYNC]) {
17706 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
17707 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
17708 ++ }
17709 ++}
17710 ++
17711 ++/*
17712 ++ * Update the entity prio values; note that the new values will not
17713 ++ * be used until the next (re)activation.
17714 ++ */
17715 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
17716 ++{
17717 ++ struct task_struct *tsk = current;
17718 ++ int ioprio_class;
17719 ++
17720 ++ if (!bfq_bfqq_prio_changed(bfqq))
17721 ++ return;
17722 ++
17723 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
17724 ++ switch (ioprio_class) {
17725 ++ default:
17726 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
17727 ++ "bfq: bad prio %x\n", ioprio_class);
17728 ++ case IOPRIO_CLASS_NONE:
17729 ++ /*
17730 ++ * No prio set, inherit CPU scheduling settings.
17731 ++ */
17732 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
17733 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
17734 ++ break;
17735 ++ case IOPRIO_CLASS_RT:
17736 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
17737 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
17738 ++ break;
17739 ++ case IOPRIO_CLASS_BE:
17740 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
17741 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
17742 ++ break;
17743 ++ case IOPRIO_CLASS_IDLE:
17744 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
17745 ++ bfqq->entity.new_ioprio = 7;
17746 ++ bfq_clear_bfqq_idle_window(bfqq);
17747 ++ break;
17748 ++ }
17749 ++
17750 ++ bfqq->entity.ioprio_changed = 1;
17751 ++
17752 ++ /*
17753 ++ * Keep track of original prio settings in case we have to temporarily
17754 ++ * elevate the priority of this queue.
17755 ++ */
17756 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
17757 ++ bfq_clear_bfqq_prio_changed(bfqq);
17758 ++}
17759 ++
17760 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
17761 ++{
17762 ++ struct bfq_data *bfqd;
17763 ++ struct bfq_queue *bfqq, *new_bfqq;
17764 ++ struct bfq_group *bfqg;
17765 ++ unsigned long uninitialized_var(flags);
17766 ++ int ioprio = bic->icq.ioc->ioprio;
17767 ++
17768 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
17769 ++ &flags);
17770 ++ /*
17771 ++ * This condition may trigger on a newly created bic, be sure to drop
17772 ++ * the lock before returning.
17773 ++ */
17774 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
17775 ++ goto out;
17776 ++
17777 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
17778 ++ if (bfqq != NULL) {
17779 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
17780 ++ sched_data);
17781 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
17782 ++ GFP_ATOMIC);
17783 ++ if (new_bfqq != NULL) {
17784 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
17785 ++ bfq_log_bfqq(bfqd, bfqq,
17786 ++ "changed_ioprio: bfqq %p %d",
17787 ++ bfqq, atomic_read(&bfqq->ref));
17788 ++ bfq_put_queue(bfqq);
17789 ++ }
17790 ++ }
17791 ++
17792 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
17793 ++ if (bfqq != NULL)
17794 ++ bfq_mark_bfqq_prio_changed(bfqq);
17795 ++
17796 ++ bic->ioprio = ioprio;
17797 ++
17798 ++out:
17799 ++ bfq_put_bfqd_unlock(bfqd, &flags);
17800 ++}
17801 ++
17802 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
17803 ++ pid_t pid, int is_sync)
17804 ++{
17805 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
17806 ++ INIT_LIST_HEAD(&bfqq->fifo);
17807 ++
17808 ++ atomic_set(&bfqq->ref, 0);
17809 ++ bfqq->bfqd = bfqd;
17810 ++
17811 ++ bfq_mark_bfqq_prio_changed(bfqq);
17812 ++
17813 ++ if (is_sync) {
17814 ++ if (!bfq_class_idle(bfqq))
17815 ++ bfq_mark_bfqq_idle_window(bfqq);
17816 ++ bfq_mark_bfqq_sync(bfqq);
17817 ++ }
17818 ++
17819 ++	/* Tentative initial value to trade off between throughput and latency */
17820 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
17821 ++ bfqq->pid = pid;
17822 ++
17823 ++ bfqq->raising_coeff = 1;
17824 ++ bfqq->last_rais_start_finish = 0;
17825 ++ /*
17826 ++ * Set to the value for which bfqq will not be deemed as
17827 ++ * soft rt when it becomes backlogged.
17828 ++ */
17829 ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
17830 ++}
17831 ++
17832 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
17833 ++ struct bfq_group *bfqg,
17834 ++ int is_sync,
17835 ++ struct bfq_io_cq *bic,
17836 ++ gfp_t gfp_mask)
17837 ++{
17838 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
17839 ++
17840 ++retry:
17841 ++ /* bic always exists here */
17842 ++ bfqq = bic_to_bfqq(bic, is_sync);
17843 ++
17844 ++ /*
17845 ++ * Always try a new alloc if we fall back to the OOM bfqq
17846 ++ * originally, since it should just be a temporary situation.
17847 ++ */
17848 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
17849 ++ bfqq = NULL;
17850 ++ if (new_bfqq != NULL) {
17851 ++ bfqq = new_bfqq;
17852 ++ new_bfqq = NULL;
17853 ++ } else if (gfp_mask & __GFP_WAIT) {
17854 ++ spin_unlock_irq(bfqd->queue->queue_lock);
17855 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
17856 ++ gfp_mask | __GFP_ZERO,
17857 ++ bfqd->queue->node);
17858 ++ spin_lock_irq(bfqd->queue->queue_lock);
17859 ++ if (new_bfqq != NULL)
17860 ++ goto retry;
17861 ++ } else {
17862 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
17863 ++ gfp_mask | __GFP_ZERO,
17864 ++ bfqd->queue->node);
17865 ++ }
17866 ++
17867 ++ if (bfqq != NULL) {
17868 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
17869 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
17870 ++ } else {
17871 ++ bfqq = &bfqd->oom_bfqq;
17872 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
17873 ++ }
17874 ++
17875 ++ bfq_init_prio_data(bfqq, bic);
17876 ++ bfq_init_entity(&bfqq->entity, bfqg);
17877 ++ }
17878 ++
17879 ++ if (new_bfqq != NULL)
17880 ++ kmem_cache_free(bfq_pool, new_bfqq);
17881 ++
17882 ++ return bfqq;
17883 ++}
17884 ++
17885 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
17886 ++ struct bfq_group *bfqg,
17887 ++ int ioprio_class, int ioprio)
17888 ++{
17889 ++ switch (ioprio_class) {
17890 ++ case IOPRIO_CLASS_RT:
17891 ++ return &bfqg->async_bfqq[0][ioprio];
17892 ++ case IOPRIO_CLASS_NONE:
17893 ++ ioprio = IOPRIO_NORM;
17894 ++ /* fall through */
17895 ++ case IOPRIO_CLASS_BE:
17896 ++ return &bfqg->async_bfqq[1][ioprio];
17897 ++ case IOPRIO_CLASS_IDLE:
17898 ++ return &bfqg->async_idle_bfqq;
17899 ++ default:
17900 ++ BUG();
17901 ++ }
17902 ++}
17903 ++
17904 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
17905 ++ struct bfq_group *bfqg, int is_sync,
17906 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
17907 ++{
17908 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
17909 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
17910 ++ struct bfq_queue **async_bfqq = NULL;
17911 ++ struct bfq_queue *bfqq = NULL;
17912 ++
17913 ++ if (!is_sync) {
17914 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
17915 ++ ioprio);
17916 ++ bfqq = *async_bfqq;
17917 ++ }
17918 ++
17919 ++ if (bfqq == NULL)
17920 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
17921 ++
17922 ++ /*
17923 ++ * Pin the queue now that it's allocated, scheduler exit will prune it.
17924 ++ */
17925 ++ if (!is_sync && *async_bfqq == NULL) {
17926 ++ atomic_inc(&bfqq->ref);
17927 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
17928 ++ bfqq, atomic_read(&bfqq->ref));
17929 ++ *async_bfqq = bfqq;
17930 ++ }
17931 ++
17932 ++ atomic_inc(&bfqq->ref);
17933 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
17934 ++ atomic_read(&bfqq->ref));
17935 ++ return bfqq;
17936 ++}
17937 ++
17938 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
17939 ++ struct bfq_io_cq *bic)
17940 ++{
17941 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
17942 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
17943 ++
17944 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
17945 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
17946 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
17947 ++ bic->ttime.ttime_samples;
17948 ++}
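++
++/*
++ * The two updates above form a fixed-point exponentially weighted moving
++ * average: each new sample is scaled by 256, the old value keeps weight
++ * 7/8 and the new sample gets weight 1/8, and the +128 roughly rounds the
++ * final division to the nearest integer. For example (illustrative
++ * numbers), with ttime_total = 25600 and ttime_samples = 256 the mean
++ * think time is (25600 + 128) / 256 = 100 jiffies.
++ */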
17949 ++
17950 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
17951 ++ struct bfq_queue *bfqq,
17952 ++ struct request *rq)
17953 ++{
17954 ++ sector_t sdist;
17955 ++ u64 total;
17956 ++
17957 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
17958 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
17959 ++ else
17960 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
17961 ++
17962 ++ /*
17963 ++ * Don't allow the seek distance to get too large from the
17964 ++ * odd fragment, pagein, etc.
17965 ++ */
17966 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
17967 ++ sdist = 0;
17968 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
17969 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
17970 ++ else
17971 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
17972 ++
17973 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
17974 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
17975 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
17976 ++ do_div(total, bfqq->seek_samples);
17977 ++ bfqq->seek_mean = (sector_t)total;
17978 ++
17979 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
17980 ++ (u64)bfqq->seek_mean);
17981 ++}
17982 ++
17983 ++/*
17984 ++ * Disable idle window if the process thinks too long or seeks so much that
17985 ++ * it doesn't matter.
17986 ++ */
17987 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
17988 ++ struct bfq_queue *bfqq,
17989 ++ struct bfq_io_cq *bic)
17990 ++{
17991 ++ int enable_idle;
17992 ++
17993 ++ /* Don't idle for async or idle io prio class. */
17994 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
17995 ++ return;
17996 ++
17997 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
17998 ++
17999 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
18000 ++ bfqd->bfq_slice_idle == 0 ||
18001 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
18002 ++ bfqq->raising_coeff == 1))
18003 ++ enable_idle = 0;
18004 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
18005 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
18006 ++ bfqq->raising_coeff == 1)
18007 ++ enable_idle = 0;
18008 ++ else
18009 ++ enable_idle = 1;
18010 ++ }
18011 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
18012 ++ enable_idle);
18013 ++
18014 ++ if (enable_idle)
18015 ++ bfq_mark_bfqq_idle_window(bfqq);
18016 ++ else
18017 ++ bfq_clear_bfqq_idle_window(bfqq);
18018 ++}
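++
++/*
++ * Put differently (same logic as above): idling is switched off when the
++ * task no longer has an active io_context, when slice_idle is zero, or
++ * when a non-weight-raised queue is seeky on an NCQ-capable device;
++ * otherwise, once enough samples exist, the window is kept only if the
++ * queue is weight-raised or its mean think time fits within
++ * bfq_slice_idle.
++ */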
18019 ++
18020 ++/*
18021 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
18022 ++ * something we should do about it.
18023 ++ */
18024 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
18025 ++ struct request *rq)
18026 ++{
18027 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
18028 ++
18029 ++ if (rq->cmd_flags & REQ_META)
18030 ++ bfqq->meta_pending++;
18031 ++
18032 ++ bfq_update_io_thinktime(bfqd, bic);
18033 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
18034 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
18035 ++ !BFQQ_SEEKY(bfqq))
18036 ++ bfq_update_idle_window(bfqd, bfqq, bic);
18037 ++
18038 ++ bfq_log_bfqq(bfqd, bfqq,
18039 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
18040 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
18041 ++ (long long unsigned)bfqq->seek_mean);
18042 ++
18043 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
18044 ++
18045 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
18046 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
18047 ++ blk_rq_sectors(rq) < 32;
18048 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
18049 ++
18050 ++ /*
18051 ++ * There is just this request queued: if the request
18052 ++ * is small and the queue is not to be expired, then
18053 ++ * just exit.
18054 ++ *
18055 ++ * In this way, if the disk is being idled to wait for
18056 ++ * a new request from the in-service queue, we avoid
18057 ++ * unplugging the device and committing the disk to serve
18058 ++ * just a small request. On the contrary, we wait for
18059 ++ * the block layer to decide when to unplug the device:
18060 ++ * hopefully, new requests will be merged to this one
18061 ++ * quickly, then the device will be unplugged and
18062 ++ * larger requests will be dispatched.
18063 ++ */
18064 ++ if (small_req && !budget_timeout)
18065 ++ return;
18066 ++
18067 ++ /*
18068 ++ * A large enough request arrived, or the queue is to
18069 ++ * be expired: in both cases disk idling is to be
18070 ++ * stopped, so clear wait_request flag and reset
18071 ++ * timer.
18072 ++ */
18073 ++ bfq_clear_bfqq_wait_request(bfqq);
18074 ++ del_timer(&bfqd->idle_slice_timer);
18075 ++
18076 ++ /*
18077 ++ * The queue is not empty, because a new request just
18078 ++ * arrived. Hence we can safely expire the queue, in
18079 ++ * case of budget timeout, without risking that the
18080 ++ * timestamps of the queue are not updated correctly.
18081 ++ * See [1] for more details.
18082 ++ */
18083 ++ if (budget_timeout)
18084 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
18085 ++
18086 ++ /*
18087 ++ * Let the request rip immediately, or let a new queue be
18088 ++ * selected if bfqq has just been expired.
18089 ++ */
18090 ++ __blk_run_queue(bfqd->queue);
18091 ++ }
18092 ++}
18093 ++
18094 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
18095 ++{
18096 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
18097 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
18098 ++
18099 ++ assert_spin_locked(bfqd->queue->queue_lock);
18100 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
18101 ++
18102 ++ bfq_add_rq_rb(rq);
18103 ++
18104 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
18105 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
18106 ++
18107 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
18108 ++}
18109 ++
18110 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
18111 ++{
18112 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
18113 ++ bfqd->rq_in_driver);
18114 ++
18115 ++ if (bfqd->hw_tag == 1)
18116 ++ return;
18117 ++
18118 ++ /*
18119 ++ * This sample is valid if the number of outstanding requests
18120 ++ * is large enough to allow a queueing behavior. Note that the
18121 ++ * sum is not exact, as it's not taking into account deactivated
18122 ++ * requests.
18123 ++ */
18124 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
18125 ++ return;
18126 ++
18127 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
18128 ++ return;
18129 ++
18130 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
18131 ++ bfqd->max_rq_in_driver = 0;
18132 ++ bfqd->hw_tag_samples = 0;
18133 ++}
18134 ++
18135 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
18136 ++{
18137 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
18138 ++ struct bfq_data *bfqd = bfqq->bfqd;
18139 ++ const int sync = rq_is_sync(rq);
18140 ++
18141 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
18142 ++ blk_rq_sectors(rq), sync);
18143 ++
18144 ++ bfq_update_hw_tag(bfqd);
18145 ++
18146 ++ WARN_ON(!bfqd->rq_in_driver);
18147 ++ WARN_ON(!bfqq->dispatched);
18148 ++ bfqd->rq_in_driver--;
18149 ++ bfqq->dispatched--;
18150 ++
18151 ++ if (bfq_bfqq_sync(bfqq))
18152 ++ bfqd->sync_flight--;
18153 ++
18154 ++ if (sync)
18155 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
18156 ++
18157 ++ /*
18158 ++ * The computation of softrt_next_start was scheduled for the next
18159 ++ * request completion: it is now time to compute it.
18160 ++ */
18161 ++ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
18162 ++ bfqq->soft_rt_next_start =
18163 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
18164 ++
18165 ++ /*
18166 ++ * If this is the in-service queue, check if it needs to be expired,
18167 ++ * or if we want to idle in case it has no pending requests.
18168 ++ */
18169 ++ if (bfqd->in_service_queue == bfqq) {
18170 ++ if (bfq_bfqq_budget_new(bfqq))
18171 ++ bfq_set_budget_timeout(bfqd);
18172 ++
18173 ++ if (bfq_bfqq_must_idle(bfqq)) {
18174 ++ bfq_arm_slice_timer(bfqd);
18175 ++ goto out;
18176 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
18177 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
18178 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
18179 ++ (bfqq->dispatched == 0 ||
18180 ++ !bfq_bfqq_must_not_expire(bfqq)))
18181 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
18182 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
18183 ++ }
18184 ++
18185 ++ if (!bfqd->rq_in_driver)
18186 ++ bfq_schedule_dispatch(bfqd);
18187 ++
18188 ++out:
18189 ++ return;
18190 ++}
18191 ++
18192 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
18193 ++{
18194 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
18195 ++ bfq_clear_bfqq_must_alloc(bfqq);
18196 ++ return ELV_MQUEUE_MUST;
18197 ++ }
18198 ++
18199 ++ return ELV_MQUEUE_MAY;
18200 ++}
18201 ++
18202 ++static int bfq_may_queue(struct request_queue *q, int rw)
18203 ++{
18204 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
18205 ++ struct task_struct *tsk = current;
18206 ++ struct bfq_io_cq *bic;
18207 ++ struct bfq_queue *bfqq;
18208 ++
18209 ++ /*
18210 ++ * Don't force setup of a queue from here, as a call to may_queue
18211 ++ * does not necessarily imply that a request actually will be queued.
18212 ++ * So just lookup a possibly existing queue, or return 'may queue'
18213 ++ * if that fails.
18214 ++ */
18215 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
18216 ++ if (bic == NULL)
18217 ++ return ELV_MQUEUE_MAY;
18218 ++
18219 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
18220 ++ if (bfqq != NULL) {
18221 ++ bfq_init_prio_data(bfqq, bic);
18222 ++
18223 ++ return __bfq_may_queue(bfqq);
18224 ++ }
18225 ++
18226 ++ return ELV_MQUEUE_MAY;
18227 ++}
18228 ++
18229 ++/*
18230 ++ * Queue lock held here.
18231 ++ */
18232 ++static void bfq_put_request(struct request *rq)
18233 ++{
18234 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
18235 ++
18236 ++ if (bfqq != NULL) {
18237 ++ const int rw = rq_data_dir(rq);
18238 ++
18239 ++ BUG_ON(!bfqq->allocated[rw]);
18240 ++ bfqq->allocated[rw]--;
18241 ++
18242 ++ rq->elv.priv[0] = NULL;
18243 ++ rq->elv.priv[1] = NULL;
18244 ++
18245 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
18246 ++ bfqq, atomic_read(&bfqq->ref));
18247 ++ bfq_put_queue(bfqq);
18248 ++ }
18249 ++}
18250 ++
18251 ++static struct bfq_queue *
18252 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
18253 ++ struct bfq_queue *bfqq)
18254 ++{
18255 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
18256 ++ (long unsigned)bfqq->new_bfqq->pid);
18257 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
18258 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
18259 ++ bfq_put_queue(bfqq);
18260 ++ return bic_to_bfqq(bic, 1);
18261 ++}
18262 ++
18263 ++/*
18264 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
18265 ++ * was the last process referring to said bfqq.
18266 ++ */
18267 ++static struct bfq_queue *
18268 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
18269 ++{
18270 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
18271 ++ if (bfqq_process_refs(bfqq) == 1) {
18272 ++ bfqq->pid = current->pid;
18273 ++ bfq_clear_bfqq_coop(bfqq);
18274 ++ bfq_clear_bfqq_split_coop(bfqq);
18275 ++ return bfqq;
18276 ++ }
18277 ++
18278 ++ bic_set_bfqq(bic, NULL, 1);
18279 ++
18280 ++ bfq_put_cooperator(bfqq);
18281 ++
18282 ++ bfq_put_queue(bfqq);
18283 ++ return NULL;
18284 ++}
18285 ++
18286 ++/*
18287 ++ * Allocate bfq data structures associated with this request.
18288 ++ */
18289 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
18290 ++ struct bio *bio, gfp_t gfp_mask)
18291 ++{
18292 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
18293 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
18294 ++ const int rw = rq_data_dir(rq);
18295 ++ const int is_sync = rq_is_sync(rq);
18296 ++ struct bfq_queue *bfqq;
18297 ++ struct bfq_group *bfqg;
18298 ++ unsigned long flags;
18299 ++
18300 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
18301 ++
18302 ++ bfq_changed_ioprio(bic);
18303 ++
18304 ++ spin_lock_irqsave(q->queue_lock, flags);
18305 ++
18306 ++ if (bic == NULL)
18307 ++ goto queue_fail;
18308 ++
18309 ++ bfqg = bfq_bic_update_cgroup(bic);
18310 ++
18311 ++new_queue:
18312 ++ bfqq = bic_to_bfqq(bic, is_sync);
18313 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
18314 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
18315 ++ bic_set_bfqq(bic, bfqq, is_sync);
18316 ++ } else {
18317 ++ /*
18318 ++ * If the queue was seeky for too long, break it apart.
18319 ++ */
18320 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
18321 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
18322 ++ bfqq = bfq_split_bfqq(bic, bfqq);
18323 ++ if (!bfqq)
18324 ++ goto new_queue;
18325 ++ }
18326 ++
18327 ++ /*
18328 ++ * Check to see if this queue is scheduled to merge with
18329 ++ * another closely cooperating queue. The merging of queues
18330 ++ * happens here as it must be done in process context.
18331 ++ * The reference on new_bfqq was taken in merge_bfqqs.
18332 ++ */
18333 ++ if (bfqq->new_bfqq != NULL)
18334 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
18335 ++ }
18336 ++
18337 ++ bfqq->allocated[rw]++;
18338 ++ atomic_inc(&bfqq->ref);
18339 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
18340 ++ atomic_read(&bfqq->ref));
18341 ++
18342 ++ rq->elv.priv[0] = bic;
18343 ++ rq->elv.priv[1] = bfqq;
18344 ++
18345 ++ spin_unlock_irqrestore(q->queue_lock, flags);
18346 ++
18347 ++ return 0;
18348 ++
18349 ++queue_fail:
18350 ++ bfq_schedule_dispatch(bfqd);
18351 ++ spin_unlock_irqrestore(q->queue_lock, flags);
18352 ++
18353 ++ return 1;
18354 ++}
18355 ++
18356 ++static void bfq_kick_queue(struct work_struct *work)
18357 ++{
18358 ++ struct bfq_data *bfqd =
18359 ++ container_of(work, struct bfq_data, unplug_work);
18360 ++ struct request_queue *q = bfqd->queue;
18361 ++
18362 ++ spin_lock_irq(q->queue_lock);
18363 ++ __blk_run_queue(q);
18364 ++ spin_unlock_irq(q->queue_lock);
18365 ++}
18366 ++
18367 ++/*
18368 ++ * Handler of the expiration of the timer running if the in-service queue
18369 ++ * is idling inside its time slice.
18370 ++ */
18371 ++static void bfq_idle_slice_timer(unsigned long data)
18372 ++{
18373 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
18374 ++ struct bfq_queue *bfqq;
18375 ++ unsigned long flags;
18376 ++ enum bfqq_expiration reason;
18377 ++
18378 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
18379 ++
18380 ++ bfqq = bfqd->in_service_queue;
18381 ++ /*
18382 ++ * Theoretical race here: the in-service queue can be NULL or different
18383 ++ * from the queue that was idling if the timer handler spins on
18384 ++ * the queue_lock and a new request arrives for the current
18385 ++ * queue and there is a full dispatch cycle that changes the
18386 ++ * in-service queue. This can hardly happen, but in the worst case
18387 ++ * we just expire a queue too early.
18388 ++ */
18389 ++ if (bfqq != NULL) {
18390 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
18391 ++ if (bfq_bfqq_budget_timeout(bfqq))
18392 ++ /*
18393 ++ * Also here the queue can be safely expired
18394 ++ * for budget timeout without wasting
18395 ++ * guarantees
18396 ++ */
18397 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
18398 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
18399 ++ /*
18400 ++ * The queue may not be empty upon timer expiration,
18401 ++ * because we may not disable the timer when the first
18402 ++ * request of the in-service queue arrives during
18403 ++ * disk idling
18404 ++ */
18405 ++ reason = BFQ_BFQQ_TOO_IDLE;
18406 ++ else
18407 ++ goto schedule_dispatch;
18408 ++
18409 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
18410 ++ }
18411 ++
18412 ++schedule_dispatch:
18413 ++ bfq_schedule_dispatch(bfqd);
18414 ++
18415 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
18416 ++}
18417 ++
18418 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
18419 ++{
18420 ++ del_timer_sync(&bfqd->idle_slice_timer);
18421 ++ cancel_work_sync(&bfqd->unplug_work);
18422 ++}
18423 ++
18424 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
18425 ++ struct bfq_queue **bfqq_ptr)
18426 ++{
18427 ++ struct bfq_group *root_group = bfqd->root_group;
18428 ++ struct bfq_queue *bfqq = *bfqq_ptr;
18429 ++
18430 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
18431 ++ if (bfqq != NULL) {
18432 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
18433 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
18434 ++ bfqq, atomic_read(&bfqq->ref));
18435 ++ bfq_put_queue(bfqq);
18436 ++ *bfqq_ptr = NULL;
18437 ++ }
18438 ++}
18439 ++
18440 ++/*
18441 ++ * Release all the bfqg references to its async queues. If we are
18442 ++ * deallocating the group, these queues may still contain requests, so
18443 ++ * we reparent them to the root cgroup (i.e., the only one that will
18444 ++ * exist for sure until all the requests on a device are gone).
18445 ++ */
18446 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
18447 ++{
18448 ++ int i, j;
18449 ++
18450 ++ for (i = 0; i < 2; i++)
18451 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
18452 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
18453 ++
18454 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
18455 ++}
18456 ++
18457 ++static void bfq_exit_queue(struct elevator_queue *e)
18458 ++{
18459 ++ struct bfq_data *bfqd = e->elevator_data;
18460 ++ struct request_queue *q = bfqd->queue;
18461 ++ struct bfq_queue *bfqq, *n;
18462 ++
18463 ++ bfq_shutdown_timer_wq(bfqd);
18464 ++
18465 ++ spin_lock_irq(q->queue_lock);
18466 ++
18467 ++ BUG_ON(bfqd->in_service_queue != NULL);
18468 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
18469 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
18470 ++
18471 ++ bfq_disconnect_groups(bfqd);
18472 ++ spin_unlock_irq(q->queue_lock);
18473 ++
18474 ++ bfq_shutdown_timer_wq(bfqd);
18475 ++
18476 ++ synchronize_rcu();
18477 ++
18478 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
18479 ++
18480 ++ bfq_free_root_group(bfqd);
18481 ++ kfree(bfqd);
18482 ++}
18483 ++
18484 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
18485 ++{
18486 ++ struct bfq_group *bfqg;
18487 ++ struct bfq_data *bfqd;
18488 ++ struct elevator_queue *eq;
18489 ++
18490 ++ eq = elevator_alloc(q, e);
18491 ++ if (eq == NULL)
18492 ++ return -ENOMEM;
18493 ++
18494 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
18495 ++ if (bfqd == NULL) {
18496 ++ kobject_put(&eq->kobj);
18497 ++ return -ENOMEM;
18498 ++ }
18499 ++ eq->elevator_data = bfqd;
18500 ++
18501 ++ /*
18502 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
18503 ++ * Grab a permanent reference to it, so that the normal code flow
18504 ++ * will not attempt to free it.
18505 ++ */
18506 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
18507 ++ atomic_inc(&bfqd->oom_bfqq.ref);
18508 ++
18509 ++ bfqd->queue = q;
18510 ++
18511 ++ spin_lock_irq(q->queue_lock);
18512 ++ q->elevator = eq;
18513 ++ spin_unlock_irq(q->queue_lock);
18514 ++
18515 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
18516 ++ if (bfqg == NULL) {
18517 ++ kfree(bfqd);
18518 ++ kobject_put(&eq->kobj);
18519 ++ return -ENOMEM;
18520 ++ }
18521 ++
18522 ++ bfqd->root_group = bfqg;
18523 ++
18524 ++ init_timer(&bfqd->idle_slice_timer);
18525 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
18526 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
18527 ++
18528 ++ bfqd->rq_pos_tree = RB_ROOT;
18529 ++
18530 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
18531 ++
18532 ++ INIT_LIST_HEAD(&bfqd->active_list);
18533 ++ INIT_LIST_HEAD(&bfqd->idle_list);
18534 ++
18535 ++ bfqd->hw_tag = -1;
18536 ++
18537 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
18538 ++
18539 ++ bfqd->bfq_quantum = bfq_quantum;
18540 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
18541 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
18542 ++ bfqd->bfq_back_max = bfq_back_max;
18543 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
18544 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
18545 ++ bfqd->bfq_class_idle_last_service = 0;
18546 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
18547 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
18548 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
18549 ++
18550 ++ bfqd->low_latency = true;
18551 ++
18552 ++ bfqd->bfq_raising_coeff = 20;
18553 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
18554 ++ bfqd->bfq_raising_max_time = 0;
18555 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
18556 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
18557 ++ bfqd->bfq_raising_max_softrt_rate = 7000; /*
18558 ++ * Approximate rate required
18559 ++ * to playback or record a
18560 ++ * high-definition compressed
18561 ++ * video.
18562 ++ */
18563 ++ bfqd->raised_busy_queues = 0;
18564 ++
18565 ++ /* Initially estimate the device's peak rate as the reference rate */
18566 ++ if (blk_queue_nonrot(bfqd->queue)) {
18567 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
18568 ++ bfqd->peak_rate = R_nonrot;
18569 ++ } else {
18570 ++ bfqd->RT_prod = R_rot * T_rot;
18571 ++ bfqd->peak_rate = R_rot;
18572 ++ }
18573 ++
18574 ++ return 0;
18575 ++}
18576 ++
18577 ++static void bfq_slab_kill(void)
18578 ++{
18579 ++ if (bfq_pool != NULL)
18580 ++ kmem_cache_destroy(bfq_pool);
18581 ++}
18582 ++
18583 ++static int __init bfq_slab_setup(void)
18584 ++{
18585 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
18586 ++ if (bfq_pool == NULL)
18587 ++ return -ENOMEM;
18588 ++ return 0;
18589 ++}
18590 ++
18591 ++static ssize_t bfq_var_show(unsigned int var, char *page)
18592 ++{
18593 ++ return sprintf(page, "%d\n", var);
18594 ++}
18595 ++
18596 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
18597 ++{
18598 ++ unsigned long new_val;
18599 ++ int ret = kstrtoul(page, 10, &new_val);
18600 ++
18601 ++ if (ret == 0)
18602 ++ *var = new_val;
18603 ++
18604 ++ return count;
18605 ++}
18606 ++
18607 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
18608 ++{
18609 ++ struct bfq_data *bfqd = e->elevator_data;
18610 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
18611 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
18612 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
18613 ++}
18614 ++
18615 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
18616 ++{
18617 ++ struct bfq_queue *bfqq;
18618 ++ struct bfq_data *bfqd = e->elevator_data;
18619 ++ ssize_t num_char = 0;
18620 ++
18621 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
18622 ++ bfqd->queued);
18623 ++
18624 ++ spin_lock_irq(bfqd->queue->queue_lock);
18625 ++
18626 ++ num_char += sprintf(page + num_char, "Active:\n");
18627 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
18628 ++ num_char += sprintf(page + num_char,
18629 ++ "pid%d: weight %hu, nr_queued %d %d,"
18630 ++ " dur %d/%u\n",
18631 ++ bfqq->pid,
18632 ++ bfqq->entity.weight,
18633 ++ bfqq->queued[0],
18634 ++ bfqq->queued[1],
18635 ++ jiffies_to_msecs(jiffies -
18636 ++ bfqq->last_rais_start_finish),
18637 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
18638 ++ }
18639 ++
18640 ++ num_char += sprintf(page + num_char, "Idle:\n");
18641 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
18642 ++ num_char += sprintf(page + num_char,
18643 ++ "pid%d: weight %hu, dur %d/%u\n",
18644 ++ bfqq->pid,
18645 ++ bfqq->entity.weight,
18646 ++ jiffies_to_msecs(jiffies -
18647 ++ bfqq->last_rais_start_finish),
18648 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
18649 ++ }
18650 ++
18651 ++ spin_unlock_irq(bfqd->queue->queue_lock);
18652 ++
18653 ++ return num_char;
18654 ++}
18655 ++
18656 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
18657 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
18658 ++{ \
18659 ++ struct bfq_data *bfqd = e->elevator_data; \
18660 ++ unsigned int __data = __VAR; \
18661 ++ if (__CONV) \
18662 ++ __data = jiffies_to_msecs(__data); \
18663 ++ return bfq_var_show(__data, (page)); \
18664 ++}
18665 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
18666 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
18667 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
18668 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
18669 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
18670 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
18671 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
18672 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
18673 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
18674 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
18675 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
18676 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
18677 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
18678 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
18679 ++ 1);
18680 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
18681 ++ bfqd->bfq_raising_min_inter_arr_async,
18682 ++ 1);
18683 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
18684 ++ bfqd->bfq_raising_max_softrt_rate, 0);
18685 ++#undef SHOW_FUNCTION
18686 ++
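As an illustration of the sysfs glue above, this is roughly what the preprocessor generates for the SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1) invocation; an expansion sketch for readability only, not part of the patch itself:

static ssize_t bfq_slice_idle_show(struct elevator_queue *e, char *page)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned int __data = bfqd->bfq_slice_idle;	/* __VAR */
	if (1)					/* __CONV: jiffies -> msecs */
		__data = jiffies_to_msecs(__data);
	return bfq_var_show(__data, (page));
}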
18687 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
18688 ++static ssize_t \
18689 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
18690 ++{ \
18691 ++ struct bfq_data *bfqd = e->elevator_data; \
18692 ++ unsigned long uninitialized_var(__data); \
18693 ++ int ret = bfq_var_store(&__data, (page), count); \
18694 ++ if (__data < (MIN)) \
18695 ++ __data = (MIN); \
18696 ++ else if (__data > (MAX)) \
18697 ++ __data = (MAX); \
18698 ++ if (__CONV) \
18699 ++ *(__PTR) = msecs_to_jiffies(__data); \
18700 ++ else \
18701 ++ *(__PTR) = __data; \
18702 ++ return ret; \
18703 ++}
18704 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
18705 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
18706 ++ INT_MAX, 1);
18707 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
18708 ++ INT_MAX, 1);
18709 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
18710 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
18711 ++ INT_MAX, 0);
18712 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
18713 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
18714 ++ 1, INT_MAX, 0);
18715 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
18716 ++ INT_MAX, 1);
18717 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
18718 ++ INT_MAX, 0);
18719 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
18720 ++ INT_MAX, 1);
18721 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
18722 ++ INT_MAX, 1);
18723 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
18724 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
18725 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
18726 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
18727 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
18728 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
18729 ++#undef STORE_FUNCTION
18730 ++
18731 ++/* do nothing for the moment */
18732 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
18733 ++ const char *page, size_t count)
18734 ++{
18735 ++ return count;
18736 ++}
18737 ++
18738 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
18739 ++{
18740 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
18741 ++
18742 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
18743 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
18744 ++ else
18745 ++ return bfq_default_max_budget;
18746 ++}
18747 ++
18748 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
18749 ++ const char *page, size_t count)
18750 ++{
18751 ++ struct bfq_data *bfqd = e->elevator_data;
18752 ++ unsigned long uninitialized_var(__data);
18753 ++ int ret = bfq_var_store(&__data, (page), count);
18754 ++
18755 ++ if (__data == 0)
18756 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
18757 ++ else {
18758 ++ if (__data > INT_MAX)
18759 ++ __data = INT_MAX;
18760 ++ bfqd->bfq_max_budget = __data;
18761 ++ }
18762 ++
18763 ++ bfqd->bfq_user_max_budget = __data;
18764 ++
18765 ++ return ret;
18766 ++}
18767 ++
18768 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
18769 ++ const char *page, size_t count)
18770 ++{
18771 ++ struct bfq_data *bfqd = e->elevator_data;
18772 ++ unsigned long uninitialized_var(__data);
18773 ++ int ret = bfq_var_store(&__data, (page), count);
18774 ++
18775 ++ if (__data < 1)
18776 ++ __data = 1;
18777 ++ else if (__data > INT_MAX)
18778 ++ __data = INT_MAX;
18779 ++
18780 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
18781 ++ if (bfqd->bfq_user_max_budget == 0)
18782 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
18783 ++
18784 ++ return ret;
18785 ++}
18786 ++
18787 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
18788 ++ const char *page, size_t count)
18789 ++{
18790 ++ struct bfq_data *bfqd = e->elevator_data;
18791 ++ unsigned long uninitialized_var(__data);
18792 ++ int ret = bfq_var_store(&__data, (page), count);
18793 ++
18794 ++ if (__data > 1)
18795 ++ __data = 1;
18796 ++ if (__data == 0 && bfqd->low_latency != 0)
18797 ++ bfq_end_raising(bfqd);
18798 ++ bfqd->low_latency = __data;
18799 ++
18800 ++ return ret;
18801 ++}
18802 ++
18803 ++#define BFQ_ATTR(name) \
18804 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
18805 ++
18806 ++static struct elv_fs_entry bfq_attrs[] = {
18807 ++ BFQ_ATTR(quantum),
18808 ++ BFQ_ATTR(fifo_expire_sync),
18809 ++ BFQ_ATTR(fifo_expire_async),
18810 ++ BFQ_ATTR(back_seek_max),
18811 ++ BFQ_ATTR(back_seek_penalty),
18812 ++ BFQ_ATTR(slice_idle),
18813 ++ BFQ_ATTR(max_budget),
18814 ++ BFQ_ATTR(max_budget_async_rq),
18815 ++ BFQ_ATTR(timeout_sync),
18816 ++ BFQ_ATTR(timeout_async),
18817 ++ BFQ_ATTR(low_latency),
18818 ++ BFQ_ATTR(raising_coeff),
18819 ++ BFQ_ATTR(raising_max_time),
18820 ++ BFQ_ATTR(raising_rt_max_time),
18821 ++ BFQ_ATTR(raising_min_idle_time),
18822 ++ BFQ_ATTR(raising_min_inter_arr_async),
18823 ++ BFQ_ATTR(raising_max_softrt_rate),
18824 ++ BFQ_ATTR(weights),
18825 ++ __ATTR_NULL
18826 ++};
18827 ++
18828 ++static struct elevator_type iosched_bfq = {
18829 ++ .ops = {
18830 ++ .elevator_merge_fn = bfq_merge,
18831 ++ .elevator_merged_fn = bfq_merged_request,
18832 ++ .elevator_merge_req_fn = bfq_merged_requests,
18833 ++ .elevator_allow_merge_fn = bfq_allow_merge,
18834 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
18835 ++ .elevator_add_req_fn = bfq_insert_request,
18836 ++ .elevator_activate_req_fn = bfq_activate_request,
18837 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
18838 ++ .elevator_completed_req_fn = bfq_completed_request,
18839 ++ .elevator_former_req_fn = elv_rb_former_request,
18840 ++ .elevator_latter_req_fn = elv_rb_latter_request,
18841 ++ .elevator_init_icq_fn = bfq_init_icq,
18842 ++ .elevator_exit_icq_fn = bfq_exit_icq,
18843 ++ .elevator_set_req_fn = bfq_set_request,
18844 ++ .elevator_put_req_fn = bfq_put_request,
18845 ++ .elevator_may_queue_fn = bfq_may_queue,
18846 ++ .elevator_init_fn = bfq_init_queue,
18847 ++ .elevator_exit_fn = bfq_exit_queue,
18848 ++ },
18849 ++ .icq_size = sizeof(struct bfq_io_cq),
18850 ++ .icq_align = __alignof__(struct bfq_io_cq),
18851 ++ .elevator_attrs = bfq_attrs,
18852 ++ .elevator_name = "bfq",
18853 ++ .elevator_owner = THIS_MODULE,
18854 ++};
18855 ++
18856 ++static int __init bfq_init(void)
18857 ++{
18858 ++ /*
18859 ++ * Can be 0 on HZ < 1000 setups.
18860 ++ */
18861 ++ if (bfq_slice_idle == 0)
18862 ++ bfq_slice_idle = 1;
18863 ++
18864 ++ if (bfq_timeout_async == 0)
18865 ++ bfq_timeout_async = 1;
18866 ++
18867 ++ if (bfq_slab_setup())
18868 ++ return -ENOMEM;
18869 ++
18870 ++ elv_register(&iosched_bfq);
18871 ++ printk(KERN_INFO "BFQ I/O-scheduler version: v7r1\n");
18872 ++
18873 ++ return 0;
18874 ++}
18875 ++
18876 ++static void __exit bfq_exit(void)
18877 ++{
18878 ++ elv_unregister(&iosched_bfq);
18879 ++ bfq_slab_kill();
18880 ++}
18881 ++
18882 ++module_init(bfq_init);
18883 ++module_exit(bfq_exit);
18884 ++
18885 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
18886 ++MODULE_LICENSE("GPL");
18887 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
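To illustrate how the elevator registered by bfq_init() above is typically driven from userspace, here is a minimal sketch that selects BFQ for a device and toggles one of the tunables exported through bfq_attrs[]. The device name "sda" and the exact sysfs paths are assumptions for the example; adjust them to the system at hand.

#include <stdio.h>

/* Write a single value to a sysfs attribute; returns 0 on success. */
static int write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f == NULL)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* Make BFQ the active elevator for the (assumed) device sda. */
	write_sysfs("/sys/block/sda/queue/scheduler", "bfq");

	/*
	 * The attributes declared in bfq_attrs[] then appear under
	 * queue/iosched/; for instance, enable the low_latency heuristics.
	 */
	write_sysfs("/sys/block/sda/queue/iosched/low_latency", "1");
	return 0;
}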
18888 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
18889 +new file mode 100644
18890 +index 0000000..999b475
18891 +--- /dev/null
18892 ++++ b/block/bfq-sched.c
18893 +@@ -0,0 +1,1078 @@
18894 ++/*
18895 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
18896 ++ *
18897 ++ * Based on ideas and code from CFQ:
18898 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
18899 ++ *
18900 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
18901 ++ * Paolo Valente <paolo.valente@×××××××.it>
18902 ++ *
18903 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
18904 ++ */
18905 ++
18906 ++#ifdef CONFIG_CGROUP_BFQIO
18907 ++#define for_each_entity(entity) \
18908 ++ for (; entity != NULL; entity = entity->parent)
18909 ++
18910 ++#define for_each_entity_safe(entity, parent) \
18911 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
18912 ++
18913 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
18914 ++ int extract,
18915 ++ struct bfq_data *bfqd);
18916 ++
18917 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
18918 ++{
18919 ++ struct bfq_entity *bfqg_entity;
18920 ++ struct bfq_group *bfqg;
18921 ++ struct bfq_sched_data *group_sd;
18922 ++
18923 ++ BUG_ON(next_in_service == NULL);
18924 ++
18925 ++ group_sd = next_in_service->sched_data;
18926 ++
18927 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
18928 ++ /*
18929 ++ * bfq_group's my_entity field is not NULL only if the group
18930 ++ * is not the root group. We must not touch the root entity
18931 ++ * as it must never become an in-service entity.
18932 ++ */
18933 ++ bfqg_entity = bfqg->my_entity;
18934 ++ if (bfqg_entity != NULL)
18935 ++ bfqg_entity->budget = next_in_service->budget;
18936 ++}
18937 ++
18938 ++static int bfq_update_next_in_service(struct bfq_sched_data *sd)
18939 ++{
18940 ++ struct bfq_entity *next_in_service;
18941 ++
18942 ++ if (sd->in_service_entity != NULL)
18943 ++ /* will update/requeue at the end of service */
18944 ++ return 0;
18945 ++
18946 ++ /*
18947 ++ * NOTE: this can be improved in many ways, such as returning
18948 ++ * 1 (and thus propagating the update upwards) only when the
18949 ++ * budget changes, or caching the bfqq that will be scheduled
18950 ++ * next from this subtree. For now we worry more about
18951 ++ * correctness than about performance...
18952 ++ */
18953 ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
18954 ++ sd->next_in_service = next_in_service;
18955 ++
18956 ++ if (next_in_service != NULL)
18957 ++ bfq_update_budget(next_in_service);
18958 ++
18959 ++ return 1;
18960 ++}
18961 ++
18962 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
18963 ++ struct bfq_entity *entity)
18964 ++{
18965 ++ BUG_ON(sd->next_in_service != entity);
18966 ++}
18967 ++#else
18968 ++#define for_each_entity(entity) \
18969 ++ for (; entity != NULL; entity = NULL)
18970 ++
18971 ++#define for_each_entity_safe(entity, parent) \
18972 ++ for (parent = NULL; entity != NULL; entity = parent)
18973 ++
18974 ++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
18975 ++{
18976 ++ return 0;
18977 ++}
18978 ++
18979 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
18980 ++ struct bfq_entity *entity)
18981 ++{
18982 ++}
18983 ++
18984 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
18985 ++{
18986 ++}
18987 ++#endif
18988 ++
18989 ++/*
18990 ++ * Shift for timestamp calculations. This actually limits the maximum
18991 ++ * service allowed in one timestamp delta (small shift values increase it),
18992 ++ * the maximum total weight that can be used for the queues in the system
18993 ++ * (big shift values increase it), and the period of virtual time wraparounds.
18994 ++ */
18995 ++#define WFQ_SERVICE_SHIFT 22
18996 ++
18997 ++/**
18998 ++ * bfq_gt - compare two timestamps.
18999 ++ * @a: first ts.
19000 ++ * @b: second ts.
19001 ++ *
19002 ++ * Return @a > @b, dealing with wrapping correctly.
19003 ++ */
19004 ++static inline int bfq_gt(u64 a, u64 b)
19005 ++{
19006 ++ return (s64)(a - b) > 0;
19007 ++}
19008 ++
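The signed-difference trick in bfq_gt() keeps the ordering of timestamps correct even after the 64-bit virtual clock wraps. A small standalone check of the same expression (illustration only, not part of BFQ):

#include <assert.h>
#include <stdint.h>

static int toy_bfq_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;	/* same expression as bfq_gt() */
}

int main(void)
{
	assert(toy_bfq_gt(10, 3));		/* ordinary case */
	assert(!toy_bfq_gt(3, 10));
	/*
	 * 'a' has just wrapped past zero while 'b' sits near the top of
	 * the range: a naive 'a > b' would call 'b' larger, the signed
	 * difference still reports 'a' as the later timestamp.
	 */
	assert(toy_bfq_gt(5, UINT64_MAX - 5));
	return 0;
}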
19009 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
19010 ++{
19011 ++ struct bfq_queue *bfqq = NULL;
19012 ++
19013 ++ BUG_ON(entity == NULL);
19014 ++
19015 ++ if (entity->my_sched_data == NULL)
19016 ++ bfqq = container_of(entity, struct bfq_queue, entity);
19017 ++
19018 ++ return bfqq;
19019 ++}
19020 ++
19021 ++
19022 ++/**
19023 ++ * bfq_delta - map service into the virtual time domain.
19024 ++ * @service: amount of service.
19025 ++ * @weight: scale factor (weight of an entity or weight sum).
19026 ++ */
19027 ++static inline u64 bfq_delta(unsigned long service,
19028 ++ unsigned long weight)
19029 ++{
19030 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
19031 ++
19032 ++ do_div(d, weight);
19033 ++ return d;
19034 ++}
19035 ++
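bfq_delta() is the core of the weighting: the finish timestamp computed by bfq_calc_finish() below is start + (service << WFQ_SERVICE_SHIFT) / weight, so doubling the weight halves how far the finish time moves for the same amount of service. A toy user-space rendition with made-up numbers (illustration only):

#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22	/* same shift as in the patch */

static uint64_t toy_bfq_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	unsigned long service = 4096;	/* arbitrary amount of service */
	uint64_t start = 0;

	/*
	 * The weight-2 entity gets a finish timestamp twice as close to
	 * its start time, so it becomes eligible for service again sooner
	 * and ends up with twice the share of the weight-1 entity.
	 */
	printf("finish(w=1) = %llu\n",
	       (unsigned long long)(start + toy_bfq_delta(service, 1)));
	printf("finish(w=2) = %llu\n",
	       (unsigned long long)(start + toy_bfq_delta(service, 2)));
	return 0;
}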
19036 ++/**
19037 ++ * bfq_calc_finish - assign the finish time to an entity.
19038 ++ * @entity: the entity to act upon.
19039 ++ * @service: the service to be charged to the entity.
19040 ++ */
19041 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
19042 ++ unsigned long service)
19043 ++{
19044 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19045 ++
19046 ++ BUG_ON(entity->weight == 0);
19047 ++
19048 ++ entity->finish = entity->start +
19049 ++ bfq_delta(service, entity->weight);
19050 ++
19051 ++ if (bfqq != NULL) {
19052 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
19053 ++ "calc_finish: serv %lu, w %d",
19054 ++ service, entity->weight);
19055 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
19056 ++ "calc_finish: start %llu, finish %llu, delta %llu",
19057 ++ entity->start, entity->finish,
19058 ++ bfq_delta(service, entity->weight));
19059 ++ }
19060 ++}
19061 ++
19062 ++/**
19063 ++ * bfq_entity_of - get an entity from a node.
19064 ++ * @node: the node field of the entity.
19065 ++ *
19066 ++ * Convert a node pointer to the relative entity. This is used only
19067 ++ * to simplify the logic of some functions and not as the generic
19068 ++ * conversion mechanism because, e.g., in the tree walking functions,
19069 ++ * the check for a %NULL value would be redundant.
19070 ++ */
19071 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
19072 ++{
19073 ++ struct bfq_entity *entity = NULL;
19074 ++
19075 ++ if (node != NULL)
19076 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
19077 ++
19078 ++ return entity;
19079 ++}
19080 ++
19081 ++/**
19082 ++ * bfq_extract - remove an entity from a tree.
19083 ++ * @root: the tree root.
19084 ++ * @entity: the entity to remove.
19085 ++ */
19086 ++static inline void bfq_extract(struct rb_root *root,
19087 ++ struct bfq_entity *entity)
19088 ++{
19089 ++ BUG_ON(entity->tree != root);
19090 ++
19091 ++ entity->tree = NULL;
19092 ++ rb_erase(&entity->rb_node, root);
19093 ++}
19094 ++
19095 ++/**
19096 ++ * bfq_idle_extract - extract an entity from the idle tree.
19097 ++ * @st: the service tree of the owning @entity.
19098 ++ * @entity: the entity being removed.
19099 ++ */
19100 ++static void bfq_idle_extract(struct bfq_service_tree *st,
19101 ++ struct bfq_entity *entity)
19102 ++{
19103 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19104 ++ struct rb_node *next;
19105 ++
19106 ++ BUG_ON(entity->tree != &st->idle);
19107 ++
19108 ++ if (entity == st->first_idle) {
19109 ++ next = rb_next(&entity->rb_node);
19110 ++ st->first_idle = bfq_entity_of(next);
19111 ++ }
19112 ++
19113 ++ if (entity == st->last_idle) {
19114 ++ next = rb_prev(&entity->rb_node);
19115 ++ st->last_idle = bfq_entity_of(next);
19116 ++ }
19117 ++
19118 ++ bfq_extract(&st->idle, entity);
19119 ++
19120 ++ if (bfqq != NULL)
19121 ++ list_del(&bfqq->bfqq_list);
19122 ++}
19123 ++
19124 ++/**
19125 ++ * bfq_insert - generic tree insertion.
19126 ++ * @root: tree root.
19127 ++ * @entity: entity to insert.
19128 ++ *
19129 ++ * This is used for the idle and the active tree, since they are both
19130 ++ * ordered by finish time.
19131 ++ */
19132 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
19133 ++{
19134 ++ struct bfq_entity *entry;
19135 ++ struct rb_node **node = &root->rb_node;
19136 ++ struct rb_node *parent = NULL;
19137 ++
19138 ++ BUG_ON(entity->tree != NULL);
19139 ++
19140 ++ while (*node != NULL) {
19141 ++ parent = *node;
19142 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
19143 ++
19144 ++ if (bfq_gt(entry->finish, entity->finish))
19145 ++ node = &parent->rb_left;
19146 ++ else
19147 ++ node = &parent->rb_right;
19148 ++ }
19149 ++
19150 ++ rb_link_node(&entity->rb_node, parent, node);
19151 ++ rb_insert_color(&entity->rb_node, root);
19152 ++
19153 ++ entity->tree = root;
19154 ++}
19155 ++
19156 ++/**
19157 ++ * bfq_update_min - update the min_start field of an entity.
19158 ++ * @entity: the entity to update.
19159 ++ * @node: one of its children.
19160 ++ *
19161 ++ * This function is called when @entity may store an invalid value for
19162 ++ * min_start due to updates to the active tree. The function assumes
19163 ++ * that the subtree rooted at @node (which may be its left or its right
19164 ++ * child) has a valid min_start value.
19165 ++ */
19166 ++static inline void bfq_update_min(struct bfq_entity *entity,
19167 ++ struct rb_node *node)
19168 ++{
19169 ++ struct bfq_entity *child;
19170 ++
19171 ++ if (node != NULL) {
19172 ++ child = rb_entry(node, struct bfq_entity, rb_node);
19173 ++ if (bfq_gt(entity->min_start, child->min_start))
19174 ++ entity->min_start = child->min_start;
19175 ++ }
19176 ++}
19177 ++
19178 ++/**
19179 ++ * bfq_update_active_node - recalculate min_start.
19180 ++ * @node: the node to update.
19181 ++ *
19182 ++ * @node may have changed position or one of its children may have moved;
19183 ++ * this function updates its min_start value. The left and right subtrees
19184 ++ * are assumed to hold a correct min_start value.
19185 ++ */
19186 ++static inline void bfq_update_active_node(struct rb_node *node)
19187 ++{
19188 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
19189 ++
19190 ++ entity->min_start = entity->start;
19191 ++ bfq_update_min(entity, node->rb_right);
19192 ++ bfq_update_min(entity, node->rb_left);
19193 ++}
19194 ++
19195 ++/**
19196 ++ * bfq_update_active_tree - update min_start for the whole active tree.
19197 ++ * @node: the starting node.
19198 ++ *
19199 ++ * @node must be the deepest modified node after an update. This function
19200 ++ * updates its min_start using the values held by its children, assuming
19201 ++ * that they did not change, and then updates all the nodes that may have
19202 ++ * changed in the path to the root. The only nodes that may have changed
19203 ++ * are the ones in the path or their siblings.
19204 ++ */
19205 ++static void bfq_update_active_tree(struct rb_node *node)
19206 ++{
19207 ++ struct rb_node *parent;
19208 ++
19209 ++up:
19210 ++ bfq_update_active_node(node);
19211 ++
19212 ++ parent = rb_parent(node);
19213 ++ if (parent == NULL)
19214 ++ return;
19215 ++
19216 ++ if (node == parent->rb_left && parent->rb_right != NULL)
19217 ++ bfq_update_active_node(parent->rb_right);
19218 ++ else if (parent->rb_left != NULL)
19219 ++ bfq_update_active_node(parent->rb_left);
19220 ++
19221 ++ node = parent;
19222 ++ goto up;
19223 ++}
19224 ++
19225 ++/**
19226 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
19227 ++ * @st: the service tree of the entity.
19228 ++ * @entity: the entity being inserted.
19229 ++ *
19230 ++ * The active tree is ordered by finish time, but an extra key is kept
19231 ++ * for each node, containing the minimum value for the start times of
19232 ++ * its children (and the node itself), so it's possible to search for
19233 ++ * the eligible node with the lowest finish time in logarithmic time.
19234 ++ */
19235 ++static void bfq_active_insert(struct bfq_service_tree *st,
19236 ++ struct bfq_entity *entity)
19237 ++{
19238 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19239 ++ struct rb_node *node = &entity->rb_node;
19240 ++
19241 ++ bfq_insert(&st->active, entity);
19242 ++
19243 ++ if (node->rb_left != NULL)
19244 ++ node = node->rb_left;
19245 ++ else if (node->rb_right != NULL)
19246 ++ node = node->rb_right;
19247 ++
19248 ++ bfq_update_active_tree(node);
19249 ++
19250 ++ if (bfqq != NULL)
19251 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
19252 ++}
19253 ++
19254 ++/**
19255 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
19256 ++ * @ioprio: the ioprio value to convert.
19257 ++ */
19258 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
19259 ++{
19260 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
19261 ++ return IOPRIO_BE_NR - ioprio;
19262 ++}
19263 ++
19264 ++/**
19265 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
19266 ++ * @weight: the weight value to convert.
19267 ++ *
19268 ++ * To preserve as much as possible the old only-ioprio user interface,
19269 ++ * 0 is used as an escape ioprio value for weights (numerically) equal to
19270 ++ * or larger than IOPRIO_BE_NR.
19271 ++ */
19272 ++static unsigned short bfq_weight_to_ioprio(int weight)
19273 ++{
19274 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
19275 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
19276 ++}
19277 ++
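The two conversion helpers above implement a simple linear mapping; with IOPRIO_BE_NR equal to 8 in the kernel headers, ioprio 0..7 becomes weight 8..1, and any weight of 8 or more reports the escape ioprio 0. A quick user-space round-trip check (illustration only, not part of the patch):

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* value from <linux/ioprio.h> */

static unsigned short toy_ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

static unsigned short toy_weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	int ioprio;

	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %hu -> ioprio %hu\n", ioprio,
		       toy_ioprio_to_weight(ioprio),
		       toy_weight_to_ioprio(toy_ioprio_to_weight(ioprio)));

	/* Large weights (reachable only through cgroups) all map to 0. */
	printf("weight 300 -> ioprio %hu\n", toy_weight_to_ioprio(300));
	return 0;
}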
19278 ++static inline void bfq_get_entity(struct bfq_entity *entity)
19279 ++{
19280 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19281 ++ struct bfq_sched_data *sd;
19282 ++
19283 ++ if (bfqq != NULL) {
19284 ++ sd = entity->sched_data;
19285 ++ atomic_inc(&bfqq->ref);
19286 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
19287 ++ bfqq, atomic_read(&bfqq->ref));
19288 ++ }
19289 ++}
19290 ++
19291 ++/**
19292 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
19293 ++ * @node: the node being removed.
19294 ++ *
19295 ++ * Do the first step of an extraction in an rb tree, looking for the
19296 ++ * node that will replace @node, and returning the deepest node that
19297 ++ * the following modifications to the tree can touch. If @node is the
19298 ++ * last node in the tree return %NULL.
19299 ++ */
19300 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
19301 ++{
19302 ++ struct rb_node *deepest;
19303 ++
19304 ++ if (node->rb_right == NULL && node->rb_left == NULL)
19305 ++ deepest = rb_parent(node);
19306 ++ else if (node->rb_right == NULL)
19307 ++ deepest = node->rb_left;
19308 ++ else if (node->rb_left == NULL)
19309 ++ deepest = node->rb_right;
19310 ++ else {
19311 ++ deepest = rb_next(node);
19312 ++ if (deepest->rb_right != NULL)
19313 ++ deepest = deepest->rb_right;
19314 ++ else if (rb_parent(deepest) != node)
19315 ++ deepest = rb_parent(deepest);
19316 ++ }
19317 ++
19318 ++ return deepest;
19319 ++}
19320 ++
19321 ++/**
19322 ++ * bfq_active_extract - remove an entity from the active tree.
19323 ++ * @st: the service_tree containing the tree.
19324 ++ * @entity: the entity being removed.
19325 ++ */
19326 ++static void bfq_active_extract(struct bfq_service_tree *st,
19327 ++ struct bfq_entity *entity)
19328 ++{
19329 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19330 ++ struct rb_node *node;
19331 ++
19332 ++ node = bfq_find_deepest(&entity->rb_node);
19333 ++ bfq_extract(&st->active, entity);
19334 ++
19335 ++ if (node != NULL)
19336 ++ bfq_update_active_tree(node);
19337 ++
19338 ++ if (bfqq != NULL)
19339 ++ list_del(&bfqq->bfqq_list);
19340 ++}
19341 ++
19342 ++/**
19343 ++ * bfq_idle_insert - insert an entity into the idle tree.
19344 ++ * @st: the service tree containing the tree.
19345 ++ * @entity: the entity to insert.
19346 ++ */
19347 ++static void bfq_idle_insert(struct bfq_service_tree *st,
19348 ++ struct bfq_entity *entity)
19349 ++{
19350 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19351 ++ struct bfq_entity *first_idle = st->first_idle;
19352 ++ struct bfq_entity *last_idle = st->last_idle;
19353 ++
19354 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
19355 ++ st->first_idle = entity;
19356 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
19357 ++ st->last_idle = entity;
19358 ++
19359 ++ bfq_insert(&st->idle, entity);
19360 ++
19361 ++ if (bfqq != NULL)
19362 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
19363 ++}
19364 ++
19365 ++/**
19366 ++ * bfq_forget_entity - remove an entity from the wfq trees.
19367 ++ * @st: the service tree.
19368 ++ * @entity: the entity being removed.
19369 ++ *
19370 ++ * Update the device status and forget everything about @entity, putting
19371 ++ * the device reference to it, if it is a queue. Entities belonging to
19372 ++ * groups are not refcounted.
19373 ++ */
19374 ++static void bfq_forget_entity(struct bfq_service_tree *st,
19375 ++ struct bfq_entity *entity)
19376 ++{
19377 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19378 ++ struct bfq_sched_data *sd;
19379 ++
19380 ++ BUG_ON(!entity->on_st);
19381 ++
19382 ++ entity->on_st = 0;
19383 ++ st->wsum -= entity->weight;
19384 ++ if (bfqq != NULL) {
19385 ++ sd = entity->sched_data;
19386 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
19387 ++ bfqq, atomic_read(&bfqq->ref));
19388 ++ bfq_put_queue(bfqq);
19389 ++ }
19390 ++}
19391 ++
19392 ++/**
19393 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
19394 ++ * @st: service tree for the entity.
19395 ++ * @entity: the entity being released.
19396 ++ */
19397 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
19398 ++ struct bfq_entity *entity)
19399 ++{
19400 ++ bfq_idle_extract(st, entity);
19401 ++ bfq_forget_entity(st, entity);
19402 ++}
19403 ++
19404 ++/**
19405 ++ * bfq_forget_idle - update the idle tree if necessary.
19406 ++ * @st: the service tree to act upon.
19407 ++ *
19408 ++ * To preserve the global O(log N) complexity we only remove one entry here;
19409 ++ * as the idle tree will not grow indefinitely this can be done safely.
19410 ++ */
19411 ++static void bfq_forget_idle(struct bfq_service_tree *st)
19412 ++{
19413 ++ struct bfq_entity *first_idle = st->first_idle;
19414 ++ struct bfq_entity *last_idle = st->last_idle;
19415 ++
19416 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
19417 ++ !bfq_gt(last_idle->finish, st->vtime)) {
19418 ++ /*
19419 ++ * Forget the whole idle tree, increasing the vtime past
19420 ++ * the last finish time of idle entities.
19421 ++ */
19422 ++ st->vtime = last_idle->finish;
19423 ++ }
19424 ++
19425 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
19426 ++ bfq_put_idle_entity(st, first_idle);
19427 ++}
19428 ++
19429 ++static struct bfq_service_tree *
19430 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
19431 ++ struct bfq_entity *entity)
19432 ++{
19433 ++ struct bfq_service_tree *new_st = old_st;
19434 ++
19435 ++ if (entity->ioprio_changed) {
19436 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19437 ++
19438 ++ BUG_ON(old_st->wsum < entity->weight);
19439 ++ old_st->wsum -= entity->weight;
19440 ++
19441 ++ if (entity->new_weight != entity->orig_weight) {
19442 ++ entity->orig_weight = entity->new_weight;
19443 ++ entity->ioprio =
19444 ++ bfq_weight_to_ioprio(entity->orig_weight);
19445 ++ } else if (entity->new_ioprio != entity->ioprio) {
19446 ++ entity->ioprio = entity->new_ioprio;
19447 ++ entity->orig_weight =
19448 ++ bfq_ioprio_to_weight(entity->ioprio);
19449 ++ } else
19450 ++ entity->new_weight = entity->orig_weight =
19451 ++ bfq_ioprio_to_weight(entity->ioprio);
19452 ++
19453 ++ entity->ioprio_class = entity->new_ioprio_class;
19454 ++ entity->ioprio_changed = 0;
19455 ++
19456 ++ /*
19457 ++ * NOTE: here we may be changing the weight too early;
19458 ++ * this will cause unfairness. The correct approach
19459 ++ * would have required additional complexity to defer
19460 ++ * weight changes to the proper time instants (i.e.,
19461 ++ * when entity->finish <= old_st->vtime).
19462 ++ */
19463 ++ new_st = bfq_entity_service_tree(entity);
19464 ++ entity->weight = entity->orig_weight *
19465 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
19466 ++ new_st->wsum += entity->weight;
19467 ++
19468 ++ if (new_st != old_st)
19469 ++ entity->start = new_st->vtime;
19470 ++ }
19471 ++
19472 ++ return new_st;
19473 ++}
19474 ++
19475 ++/**
19476 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
19477 ++ * @bfqq: the queue being served.
19478 ++ * @served: bytes to transfer.
19479 ++ *
19480 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
19481 ++ * are synchronized every time a new bfqq is selected for service. By now,
19482 ++ * we keep it to better check consistency.
19483 ++ */
19484 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
19485 ++{
19486 ++ struct bfq_entity *entity = &bfqq->entity;
19487 ++ struct bfq_service_tree *st;
19488 ++
19489 ++ for_each_entity(entity) {
19490 ++ st = bfq_entity_service_tree(entity);
19491 ++
19492 ++ entity->service += served;
19493 ++ BUG_ON(entity->service > entity->budget);
19494 ++ BUG_ON(st->wsum == 0);
19495 ++
19496 ++ st->vtime += bfq_delta(served, st->wsum);
19497 ++ bfq_forget_idle(st);
19498 ++ }
19499 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
19500 ++}
19501 ++
19502 ++/**
19503 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
19504 ++ * @bfqq: the queue that needs a service update.
19505 ++ *
19506 ++ * When it's not possible to be fair in the service domain, because
19507 ++ * a queue is not consuming its budget fast enough (the meaning of
19508 ++ * fast depends on the timeout parameter), we charge it a full
19509 ++ * budget. In this way we should obtain a sort of time-domain
19510 ++ * fairness among all the seeky/slow queues.
19511 ++ */
19512 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
19513 ++{
19514 ++ struct bfq_entity *entity = &bfqq->entity;
19515 ++
19516 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
19517 ++
19518 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
19519 ++}
19520 ++
19521 ++/**
19522 ++ * __bfq_activate_entity - activate an entity.
19523 ++ * @entity: the entity being activated.
19524 ++ *
19525 ++ * Called whenever an entity is activated, i.e., it is not active and one
19526 ++ * of its children receives a new request, or has to be reactivated due to
19527 ++ * budget exhaustion. It uses the current budget of the entity (and the
19528 ++ * service received if @entity is active) of the queue to calculate its
19529 ++ * timestamps.
19530 ++ */
19531 ++static void __bfq_activate_entity(struct bfq_entity *entity)
19532 ++{
19533 ++ struct bfq_sched_data *sd = entity->sched_data;
19534 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
19535 ++
19536 ++ if (entity == sd->in_service_entity) {
19537 ++ BUG_ON(entity->tree != NULL);
19538 ++ /*
19539 ++ * If we are requeueing the current entity, we have
19540 ++ * to take care not to charge it for service it has
19541 ++ * not received.
19542 ++ */
19543 ++ bfq_calc_finish(entity, entity->service);
19544 ++ entity->start = entity->finish;
19545 ++ sd->in_service_entity = NULL;
19546 ++ } else if (entity->tree == &st->active) {
19547 ++ /*
19548 ++ * Requeueing an entity due to a change of some
19549 ++ * next_in_service entity below it. We reuse the
19550 ++ * old start time.
19551 ++ */
19552 ++ bfq_active_extract(st, entity);
19553 ++ } else if (entity->tree == &st->idle) {
19554 ++ /*
19555 ++ * Must be on the idle tree, bfq_idle_extract() will
19556 ++ * check for that.
19557 ++ */
19558 ++ bfq_idle_extract(st, entity);
19559 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
19560 ++ st->vtime : entity->finish;
19561 ++ } else {
19562 ++ /*
19563 ++ * The finish time of the entity may be invalid, and
19564 ++ * it is in the past for sure, otherwise the queue
19565 ++ * would have been on the idle tree.
19566 ++ */
19567 ++ entity->start = st->vtime;
19568 ++ st->wsum += entity->weight;
19569 ++ bfq_get_entity(entity);
19570 ++
19571 ++ BUG_ON(entity->on_st);
19572 ++ entity->on_st = 1;
19573 ++ }
19574 ++
19575 ++ st = __bfq_entity_update_weight_prio(st, entity);
19576 ++ bfq_calc_finish(entity, entity->budget);
19577 ++ bfq_active_insert(st, entity);
19578 ++}
19579 ++
19580 ++/**
19581 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
19582 ++ * @entity: the entity to activate.
19583 ++ *
19584 ++ * Activate @entity and all the entities on the path from it to the root.
19585 ++ */
19586 ++static void bfq_activate_entity(struct bfq_entity *entity)
19587 ++{
19588 ++ struct bfq_sched_data *sd;
19589 ++
19590 ++ for_each_entity(entity) {
19591 ++ __bfq_activate_entity(entity);
19592 ++
19593 ++ sd = entity->sched_data;
19594 ++ if (!bfq_update_next_in_service(sd))
19595 ++ /*
19596 ++ * No need to propagate the activation to the
19597 ++ * upper entities, as they will be updated when
19598 ++ * the in-service entity is rescheduled.
19599 ++ */
19600 ++ break;
19601 ++ }
19602 ++}
19603 ++
19604 ++/**
19605 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
19606 ++ * @entity: the entity to deactivate.
19607 ++ * @requeue: if false, the entity will not be put into the idle tree.
19608 ++ *
19609 ++ * Deactivate an entity, independently from its previous state. If the
19610 ++ * entity was not on a service tree just return, otherwise if it is on
19611 ++ * any scheduler tree, extract it from that tree, and if necessary
19612 ++ * and if the caller specified @requeue, put it on the idle tree.
19613 ++ *
19614 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
19615 ++ * if the entity was under service or if it was the next_in_service for
19616 ++ * its sched_data; return %0 otherwise.
19617 ++ */
19618 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
19619 ++{
19620 ++ struct bfq_sched_data *sd = entity->sched_data;
19621 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
19622 ++ int was_in_service = entity == sd->in_service_entity;
19623 ++ int ret = 0;
19624 ++
19625 ++ if (!entity->on_st)
19626 ++ return 0;
19627 ++
19628 ++ BUG_ON(was_in_service && entity->tree != NULL);
19629 ++
19630 ++ if (was_in_service) {
19631 ++ bfq_calc_finish(entity, entity->service);
19632 ++ sd->in_service_entity = NULL;
19633 ++ } else if (entity->tree == &st->active)
19634 ++ bfq_active_extract(st, entity);
19635 ++ else if (entity->tree == &st->idle)
19636 ++ bfq_idle_extract(st, entity);
19637 ++ else if (entity->tree != NULL)
19638 ++ BUG();
19639 ++
19640 ++ if (was_in_service || sd->next_in_service == entity)
19641 ++ ret = bfq_update_next_in_service(sd);
19642 ++
19643 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
19644 ++ bfq_forget_entity(st, entity);
19645 ++ else
19646 ++ bfq_idle_insert(st, entity);
19647 ++
19648 ++ BUG_ON(sd->in_service_entity == entity);
19649 ++ BUG_ON(sd->next_in_service == entity);
19650 ++
19651 ++ return ret;
19652 ++}
19653 ++
19654 ++/**
19655 ++ * bfq_deactivate_entity - deactivate an entity.
19656 ++ * @entity: the entity to deactivate.
19657 ++ * @requeue: true if the entity can be put on the idle tree
19658 ++ */
19659 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
19660 ++{
19661 ++ struct bfq_sched_data *sd;
19662 ++ struct bfq_entity *parent;
19663 ++
19664 ++ for_each_entity_safe(entity, parent) {
19665 ++ sd = entity->sched_data;
19666 ++
19667 ++ if (!__bfq_deactivate_entity(entity, requeue))
19668 ++ /*
19669 ++ * The parent entity is still backlogged, and
19670 ++ * we don't need to update it as it is still
19671 ++ * under service.
19672 ++ */
19673 ++ break;
19674 ++
19675 ++ if (sd->next_in_service != NULL)
19676 ++ /*
19677 ++ * The parent entity is still backlogged and
19678 ++ * the budgets on the path towards the root
19679 ++ * need to be updated.
19680 ++ */
19681 ++ goto update;
19682 ++
19683 ++ /*
19684 ++ * If we get here, the parent is no longer backlogged and
19685 ++ * we want to propagate the dequeue upwards.
19686 ++ */
19687 ++ requeue = 1;
19688 ++ }
19689 ++
19690 ++ return;
19691 ++
19692 ++update:
19693 ++ entity = parent;
19694 ++ for_each_entity(entity) {
19695 ++ __bfq_activate_entity(entity);
19696 ++
19697 ++ sd = entity->sched_data;
19698 ++ if (!bfq_update_next_in_service(sd))
19699 ++ break;
19700 ++ }
19701 ++}
19702 ++
19703 ++/**
19704 ++ * bfq_update_vtime - update vtime if necessary.
19705 ++ * @st: the service tree to act upon.
19706 ++ *
19707 ++ * If necessary update the service tree vtime to have at least one
19708 ++ * eligible entity, skipping to its start time. Assumes that the
19709 ++ * active tree of the device is not empty.
19710 ++ *
19711 ++ * NOTE: this hierarchical implementation updates vtimes quite often;
19712 ++ * we may end up with reactivated tasks getting timestamps after a
19713 ++ * vtime skip done because we needed a ->first_active entity on some
19714 ++ * intermediate node.
19715 ++ */
19716 ++static void bfq_update_vtime(struct bfq_service_tree *st)
19717 ++{
19718 ++ struct bfq_entity *entry;
19719 ++ struct rb_node *node = st->active.rb_node;
19720 ++
19721 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
19722 ++ if (bfq_gt(entry->min_start, st->vtime)) {
19723 ++ st->vtime = entry->min_start;
19724 ++ bfq_forget_idle(st);
19725 ++ }
19726 ++}
19727 ++
19728 ++/**
19729 ++ * bfq_first_active_entity - find the eligible entity with
19730 ++ * the smallest finish time
19731 ++ * @st: the service tree to select from.
19732 ++ *
19733 ++ * This function searches for the first schedulable entity, starting from
19734 ++ * the root of the tree and going on the left every time on this side there
19735 ++ * is a subtree with at least one eligible (start <= vtime) entity. The path
19736 ++ * on the right is followed only if a) the left subtree contains no eligible
19737 ++ * entities and b) no eligible entity has been found yet.
19738 ++ */
19739 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
19740 ++{
19741 ++ struct bfq_entity *entry, *first = NULL;
19742 ++ struct rb_node *node = st->active.rb_node;
19743 ++
19744 ++ while (node != NULL) {
19745 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
19746 ++left:
19747 ++ if (!bfq_gt(entry->start, st->vtime))
19748 ++ first = entry;
19749 ++
19750 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
19751 ++
19752 ++ if (node->rb_left != NULL) {
19753 ++ entry = rb_entry(node->rb_left,
19754 ++ struct bfq_entity, rb_node);
19755 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
19756 ++ node = node->rb_left;
19757 ++ goto left;
19758 ++ }
19759 ++ }
19760 ++ if (first != NULL)
19761 ++ break;
19762 ++ node = node->rb_right;
19763 ++ }
19764 ++
19765 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
19766 ++ return first;
19767 ++}
19768 ++
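bfq_first_active_entity() descends the augmented tree in O(log N); the underlying B-WF2Q+ rule it implements is simply: among the entities with start <= vtime, pick the one with the smallest finish time. A flat, O(N) sketch of that rule (wraparound handling omitted, illustration only):

#include <stdint.h>
#include <stdio.h>

struct toy_entity {
	uint64_t start;
	uint64_t finish;
};

/* Return the eligible entity (start <= vtime) with the smallest finish. */
static const struct toy_entity *
toy_first_active(const struct toy_entity *e, int n, uint64_t vtime)
{
	const struct toy_entity *first = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (e[i].start > vtime)	/* not eligible yet */
			continue;
		if (first == NULL || e[i].finish < first->finish)
			first = &e[i];
	}
	return first;
}

int main(void)
{
	const struct toy_entity ents[] = {
		{ .start = 0,  .finish = 40 },
		{ .start = 5,  .finish = 25 },
		{ .start = 50, .finish = 60 },	/* ineligible at vtime 10 */
	};
	const struct toy_entity *next = toy_first_active(ents, 3, 10);

	printf("picked entity with finish %llu\n",
	       (unsigned long long)next->finish);	/* prints 25 */
	return 0;
}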
19769 ++/**
19770 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
19771 ++ * @st: the service tree.
19772 ++ *
19773 ++ * Update the virtual time in @st and return the first eligible entity
19774 ++ * it contains.
19775 ++ */
19776 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
19777 ++ bool force)
19778 ++{
19779 ++ struct bfq_entity *entity, *new_next_in_service = NULL;
19780 ++
19781 ++ if (RB_EMPTY_ROOT(&st->active))
19782 ++ return NULL;
19783 ++
19784 ++ bfq_update_vtime(st);
19785 ++ entity = bfq_first_active_entity(st);
19786 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
19787 ++
19788 ++ /*
19789 ++ * If the chosen entity does not match with the sched_data's
19790 ++ * next_in_service and we are forcedly serving the IDLE priority
19791 ++ * class tree, bubble up budget update.
19792 ++ */
19793 ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
19794 ++ new_next_in_service = entity;
19795 ++ for_each_entity(new_next_in_service)
19796 ++ bfq_update_budget(new_next_in_service);
19797 ++ }
19798 ++
19799 ++ return entity;
19800 ++}
19801 ++
19802 ++/**
19803 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
19804 ++ * @sd: the sched_data.
19805 ++ * @extract: if true the returned entity will be also extracted from @sd.
19806 ++ *
19807 ++ * NOTE: since we cache the next_in_service entity at each level of the
19808 ++ * hierarchy, the complexity of the lookup can be decreased with
19809 ++ * absolutely no effort by just returning the cached next_in_service value;
19810 ++ * we prefer to do full lookups to test the consistency of the data
19811 ++ * structures.
19812 ++ */
19813 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
19814 ++ int extract,
19815 ++ struct bfq_data *bfqd)
19816 ++{
19817 ++ struct bfq_service_tree *st = sd->service_tree;
19818 ++ struct bfq_entity *entity;
19819 ++ int i = 0;
19820 ++
19821 ++ BUG_ON(sd->in_service_entity != NULL);
19822 ++
19823 ++ if (bfqd != NULL &&
19824 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
19825 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
19826 ++ true);
19827 ++ if (entity != NULL) {
19828 ++ i = BFQ_IOPRIO_CLASSES - 1;
19829 ++ bfqd->bfq_class_idle_last_service = jiffies;
19830 ++ sd->next_in_service = entity;
19831 ++ }
19832 ++ }
19833 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
19834 ++ entity = __bfq_lookup_next_entity(st + i, false);
19835 ++ if (entity != NULL) {
19836 ++ if (extract) {
19837 ++ bfq_check_next_in_service(sd, entity);
19838 ++ bfq_active_extract(st + i, entity);
19839 ++ sd->in_service_entity = entity;
19840 ++ sd->next_in_service = NULL;
19841 ++ }
19842 ++ break;
19843 ++ }
19844 ++ }
19845 ++
19846 ++ return entity;
19847 ++}
19848 ++
19849 ++/*
19850 ++ * Get next queue for service.
19851 ++ */
19852 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
19853 ++{
19854 ++ struct bfq_entity *entity = NULL;
19855 ++ struct bfq_sched_data *sd;
19856 ++ struct bfq_queue *bfqq;
19857 ++
19858 ++ BUG_ON(bfqd->in_service_queue != NULL);
19859 ++
19860 ++ if (bfqd->busy_queues == 0)
19861 ++ return NULL;
19862 ++
19863 ++ sd = &bfqd->root_group->sched_data;
19864 ++ for (; sd != NULL; sd = entity->my_sched_data) {
19865 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
19866 ++ BUG_ON(entity == NULL);
19867 ++ entity->service = 0;
19868 ++ }
19869 ++
19870 ++ bfqq = bfq_entity_to_bfqq(entity);
19871 ++ BUG_ON(bfqq == NULL);
19872 ++
19873 ++ return bfqq;
19874 ++}
19875 ++
19876 ++/*
19877 ++ * Forced extraction of the given queue.
19878 ++ */
19879 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
19880 ++ struct bfq_queue *bfqq)
19881 ++{
19882 ++ struct bfq_entity *entity;
19883 ++ struct bfq_sched_data *sd;
19884 ++
19885 ++ BUG_ON(bfqd->in_service_queue != NULL);
19886 ++
19887 ++ entity = &bfqq->entity;
19888 ++ /*
19889 ++ * Bubble up extraction/update from the leaf to the root.
19890 ++ */
19891 ++ for_each_entity(entity) {
19892 ++ sd = entity->sched_data;
19893 ++ bfq_update_budget(entity);
19894 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
19895 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
19896 ++ sd->in_service_entity = entity;
19897 ++ sd->next_in_service = NULL;
19898 ++ entity->service = 0;
19899 ++ }
19900 ++
19901 ++ return;
19902 ++}
19903 ++
19904 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
19905 ++{
19906 ++ if (bfqd->in_service_bic != NULL) {
19907 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
19908 ++ bfqd->in_service_bic = NULL;
19909 ++ }
19910 ++
19911 ++ bfqd->in_service_queue = NULL;
19912 ++ del_timer(&bfqd->idle_slice_timer);
19913 ++}
19914 ++
19915 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
19916 ++ int requeue)
19917 ++{
19918 ++ struct bfq_entity *entity = &bfqq->entity;
19919 ++
19920 ++ if (bfqq == bfqd->in_service_queue)
19921 ++ __bfq_bfqd_reset_in_service(bfqd);
19922 ++
19923 ++ bfq_deactivate_entity(entity, requeue);
19924 ++}
19925 ++
19926 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
19927 ++{
19928 ++ struct bfq_entity *entity = &bfqq->entity;
19929 ++
19930 ++ bfq_activate_entity(entity);
19931 ++}
19932 ++
19933 ++/*
19934 ++ * Called when the bfqq no longer has requests pending, remove it from
19935 ++ * the service tree.
19936 ++ */
19937 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
19938 ++ int requeue)
19939 ++{
19940 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
19941 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
19942 ++
19943 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
19944 ++
19945 ++ bfq_clear_bfqq_busy(bfqq);
19946 ++
19947 ++ BUG_ON(bfqd->busy_queues == 0);
19948 ++ bfqd->busy_queues--;
19949 ++ if (bfqq->raising_coeff > 1)
19950 ++ bfqd->raised_busy_queues--;
19951 ++
19952 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
19953 ++}
19954 ++
19955 ++/*
19956 ++ * Called when an inactive queue receives a new request.
19957 ++ */
19958 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
19959 ++{
19960 ++ BUG_ON(bfq_bfqq_busy(bfqq));
19961 ++ BUG_ON(bfqq == bfqd->in_service_queue);
19962 ++
19963 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
19964 ++
19965 ++ bfq_activate_bfqq(bfqd, bfqq);
19966 ++
19967 ++ bfq_mark_bfqq_busy(bfqq);
19968 ++ bfqd->busy_queues++;
19969 ++ if (bfqq->raising_coeff > 1)
19970 ++ bfqd->raised_busy_queues++;
19971 ++}
19972 +diff --git a/block/bfq.h b/block/bfq.h
19973 +new file mode 100644
19974 +index 0000000..f9b5881
19975 +--- /dev/null
19976 ++++ b/block/bfq.h
19977 +@@ -0,0 +1,614 @@
19978 ++/*
19979 ++ * BFQ-v7r1 for 3.13.0: data structures and common functions prototypes.
19980 ++ *
19981 ++ * Based on ideas and code from CFQ:
19982 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
19983 ++ *
19984 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
19985 ++ * Paolo Valente <paolo.valente@×××××××.it>
19986 ++ *
19987 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
19988 ++ */
19989 ++
19990 ++#ifndef _BFQ_H
19991 ++#define _BFQ_H
19992 ++
19993 ++#include <linux/blktrace_api.h>
19994 ++#include <linux/hrtimer.h>
19995 ++#include <linux/ioprio.h>
19996 ++#include <linux/rbtree.h>
19997 ++
19998 ++#define BFQ_IOPRIO_CLASSES 3
19999 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
20000 ++
20001 ++#define BFQ_MIN_WEIGHT 1
20002 ++#define BFQ_MAX_WEIGHT 1000
20003 ++
20004 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
20005 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
20006 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
20007 ++
20008 ++struct bfq_entity;
20009 ++
20010 ++/**
20011 ++ * struct bfq_service_tree - per ioprio_class service tree.
20012 ++ * @active: tree for active entities (i.e., those backlogged).
20013 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
20014 ++ * @first_idle: idle entity with minimum F_i.
20015 ++ * @last_idle: idle entity with maximum F_i.
20016 ++ * @vtime: scheduler virtual time.
20017 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
20018 ++ *
20019 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
20020 ++ * ioprio_class has its own independent scheduler, and so its own
20021 ++ * bfq_service_tree. All the fields are protected by the queue lock
20022 ++ * of the containing bfqd.
20023 ++ */
20024 ++struct bfq_service_tree {
20025 ++ struct rb_root active;
20026 ++ struct rb_root idle;
20027 ++
20028 ++ struct bfq_entity *first_idle;
20029 ++ struct bfq_entity *last_idle;
20030 ++
20031 ++ u64 vtime;
20032 ++ unsigned long wsum;
20033 ++};
20034 ++
20035 ++/**
20036 ++ * struct bfq_sched_data - multi-class scheduler.
20037 ++ * @in_service_entity: entity under service.
20038 ++ * @next_in_service: head-of-the-line entity in the scheduler.
20039 ++ * @service_tree: array of service trees, one per ioprio_class.
20040 ++ *
20041 ++ * bfq_sched_data is the basic scheduler queue. It supports three
20042 ++ * ioprio_classes, and can be used either as a toplevel queue or as
20043 ++ * an intermediate queue on a hierarchical setup.
20044 ++ * @next_in_service points to the active entity of the sched_data
20045 ++ * service trees that will be scheduled next.
20046 ++ *
20047 ++ * The supported ioprio_classes are the same as in CFQ, in descending
20048 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
20049 ++ * Requests from higher priority queues are served before all the
20050 ++ * requests from lower priority queues; among requests of the same
20051 ++ * queue requests are served according to B-WF2Q+.
20052 ++ * All the fields are protected by the queue lock of the containing bfqd.
20053 ++ */
20054 ++struct bfq_sched_data {
20055 ++ struct bfq_entity *in_service_entity;
20056 ++ struct bfq_entity *next_in_service;
20057 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
20058 ++};
20059 ++
20060 ++/**
20061 ++ * struct bfq_entity - schedulable entity.
20062 ++ * @rb_node: service_tree member.
20063 ++ * @on_st: flag, true if the entity is on a tree (either the active or
20064 ++ * the idle one of its service_tree).
20065 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
20066 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
20067 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
20068 ++ * @min_start: minimum start time of the (active) subtree rooted at
20069 ++ * this entity; used for O(log N) lookups into active trees.
20070 ++ * @service: service received during the last round of service.
20071 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
20072 ++ * @weight: weight of the queue
20073 ++ * @parent: parent entity, for hierarchical scheduling.
20074 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
20075 ++ * associated scheduler queue, %NULL on leaf nodes.
20076 ++ * @sched_data: the scheduler queue this entity belongs to.
20077 ++ * @ioprio: the ioprio in use.
20078 ++ * @new_weight: when a weight change is requested, the new weight value.
20079 ++ * @orig_weight: original weight, used to implement weight boosting
20080 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
20081 ++ * @ioprio_class: the ioprio_class in use.
20082 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
20083 ++ * ioprio_class value.
20084 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
20085 ++ * ioprio_class change.
20086 ++ *
20087 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
20088 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
20089 ++ * entity belongs to the sched_data of the parent group in the cgroup
20090 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
20091 ++ * in @my_sched_data.
20092 ++ *
20093 ++ * Each entity stores independently its priority values; this would
20094 ++ * allow different weights on different devices, but this
20095 ++ * functionality is not exported to userspace for now. Priorities and
20096 ++ * weights are updated lazily, first storing the new values into the
20097 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
20098 ++ * there is a transition in the entity state that allows the priority
20099 ++ * update to take place the effective and the requested priority
20100 ++ * values are synchronized.
20101 ++ *
20102 ++ * Unless cgroups are used, the weight value is calculated from the
20103 ++ * ioprio to export the same interface as CFQ. When dealing with
20104 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
20105 ++ * time consuming their budget and have true sequential behavior, and
20106 ++ * when there are no external factors breaking anticipation) the
20107 ++ * relative weights at each level of the cgroups hierarchy should be
20108 ++ * guaranteed. All the fields are protected by the queue lock of the
20109 ++ * containing bfqd.
20110 ++ */
20111 ++struct bfq_entity {
20112 ++ struct rb_node rb_node;
20113 ++
20114 ++ int on_st;
20115 ++
20116 ++ u64 finish;
20117 ++ u64 start;
20118 ++
20119 ++ struct rb_root *tree;
20120 ++
20121 ++ u64 min_start;
20122 ++
20123 ++ unsigned long service, budget;
20124 ++ unsigned short weight, new_weight;
20125 ++ unsigned short orig_weight;
20126 ++
20127 ++ struct bfq_entity *parent;
20128 ++
20129 ++ struct bfq_sched_data *my_sched_data;
20130 ++ struct bfq_sched_data *sched_data;
20131 ++
20132 ++ unsigned short ioprio, new_ioprio;
20133 ++ unsigned short ioprio_class, new_ioprio_class;
20134 ++
20135 ++ int ioprio_changed;
20136 ++};
20137 ++
20138 ++struct bfq_group;
20139 ++
20140 ++/**
20141 ++ * struct bfq_queue - leaf schedulable entity.
20142 ++ * @ref: reference counter.
20143 ++ * @bfqd: parent bfq_data.
20144 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
20145 ++ * one or more other queues.
20146 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
20147 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
20148 ++ * @sort_list: sorted list of pending requests.
20149 ++ * @next_rq: if fifo isn't expired, next request to serve.
20150 ++ * @queued: nr of requests queued in @sort_list.
20151 ++ * @allocated: currently allocated requests.
20152 ++ * @meta_pending: pending metadata requests.
20153 ++ * @fifo: fifo list of requests in sort_list.
20154 ++ * @entity: entity representing this queue in the scheduler.
20155 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
20156 ++ * @budget_timeout: budget expiration (in jiffies).
20157 ++ * @dispatched: number of requests on the dispatch list or inside driver.
20158 ++ * @org_ioprio: saved ioprio during boosted periods.
20159 ++ * @flags: status flags.
20160 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
20161 ++ * @seek_samples: number of seeks sampled
20162 ++ * @seek_total: sum of the distances of the seeks sampled
20163 ++ * @seek_mean: mean seek distance
20164 ++ * @last_request_pos: position of the last request enqueued
20165 ++ * @pid: pid of the process owning the queue, used for logging purposes.
20166 ++ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
20167 ++ * @raising_cur_max_time: current max raising time for this queue
20168 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
20169 ++ * idle to backlogged
20170 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
20171 ++ * since the last transition from idle to backlogged
20172 ++ *
20173 ++ * A bfq_queue is a leaf request queue; it can be associated with one
20174 ++ * io_context or more (if it is an async one). @cgroup holds a reference to the
20175 ++ * cgroup, to be sure that it does not disappear while a bfqq still
20176 ++ * references it (mostly to avoid races between request issuing and task
20177 ++ * migration followed by cgroup destruction).
20178 ++ * All the fields are protected by the queue lock of the containing bfqd.
20179 ++ */
20180 ++struct bfq_queue {
20181 ++ atomic_t ref;
20182 ++ struct bfq_data *bfqd;
20183 ++
20184 ++ /* fields for cooperating queues handling */
20185 ++ struct bfq_queue *new_bfqq;
20186 ++ struct rb_node pos_node;
20187 ++ struct rb_root *pos_root;
20188 ++
20189 ++ struct rb_root sort_list;
20190 ++ struct request *next_rq;
20191 ++ int queued[2];
20192 ++ int allocated[2];
20193 ++ int meta_pending;
20194 ++ struct list_head fifo;
20195 ++
20196 ++ struct bfq_entity entity;
20197 ++
20198 ++ unsigned long max_budget;
20199 ++ unsigned long budget_timeout;
20200 ++
20201 ++ int dispatched;
20202 ++
20203 ++ unsigned short org_ioprio;
20204 ++
20205 ++ unsigned int flags;
20206 ++
20207 ++ struct list_head bfqq_list;
20208 ++
20209 ++ unsigned int seek_samples;
20210 ++ u64 seek_total;
20211 ++ sector_t seek_mean;
20212 ++ sector_t last_request_pos;
20213 ++
20214 ++ pid_t pid;
20215 ++
20216 ++ /* weight-raising fields */
20217 ++ unsigned long raising_cur_max_time;
20218 ++ unsigned long soft_rt_next_start;
20219 ++ unsigned long last_rais_start_finish;
20220 ++ unsigned int raising_coeff;
20221 ++ unsigned long last_idle_bklogged;
20222 ++ unsigned long service_from_backlogged;
20223 ++};
20224 ++
20225 ++/**
20226 ++ * struct bfq_ttime - per process thinktime stats.
20227 ++ * @ttime_total: total process thinktime
20228 ++ * @ttime_samples: number of thinktime samples
20229 ++ * @ttime_mean: average process thinktime
20230 ++ */
20231 ++struct bfq_ttime {
20232 ++ unsigned long last_end_request;
20233 ++
20234 ++ unsigned long ttime_total;
20235 ++ unsigned long ttime_samples;
20236 ++ unsigned long ttime_mean;
20237 ++};
20238 ++
20239 ++/**
20240 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
20241 ++ * @icq: associated io_cq structure
20242 ++ * @bfqq: array of two process queues, the sync and the async
20243 ++ * @ttime: associated @bfq_ttime struct
20244 ++ */
20245 ++struct bfq_io_cq {
20246 ++ struct io_cq icq; /* must be the first member */
20247 ++ struct bfq_queue *bfqq[2];
20248 ++ struct bfq_ttime ttime;
20249 ++ int ioprio;
20250 ++};
20251 ++
20252 ++/**
20253 ++ * struct bfq_data - per device data structure.
20254 ++ * @queue: request queue for the managed device.
20255 ++ * @root_group: root bfq_group for the device.
20256 ++ * @rq_pos_tree: rbtree sorted by next_request position,
20257 ++ * used when determining if two or more queues
20258 ++ * have interleaving requests (see bfq_close_cooperator).
20259 ++ * @busy_queues: number of bfq_queues containing requests (including the
20260 ++ * queue under service, even if it is idling).
20261 ++ * @raised_busy_queues: number of weight-raised busy bfq_queues.
20262 ++ * @queued: number of queued requests.
20263 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
20264 ++ * @sync_flight: number of sync requests in the driver.
20265 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
20266 ++ * completed requests.
20267 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
20268 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
20269 ++ * @budgets_assigned: number of budgets assigned.
20270 ++ * @idle_slice_timer: timer set when idling for the next sequential request
20271 ++ * from the queue under service.
20272 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
20273 ++ * @in_service_queue: bfq_queue under service.
20274 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
20275 ++ * @last_position: on-disk position of the last served request.
20276 ++ * @last_budget_start: beginning of the last budget.
20277 ++ * @last_idling_start: beginning of the last idle slice.
20278 ++ * @peak_rate: peak transfer rate observed for a budget.
20279 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
20280 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
20281 ++ * @group_list: list of all the bfq_groups active on the device.
20282 ++ * @active_list: list of all the bfq_queues active on the device.
20283 ++ * @idle_list: list of all the bfq_queues idle on the device.
20284 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
20285 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
20286 ++ * requests are served in fifo order.
20287 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
20288 ++ * @bfq_back_max: maximum allowed backward seek.
20289 ++ * @bfq_slice_idle: maximum idling time.
20290 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
20291 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
20292 ++ * async queues.
20293 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
20294 ++ * prevent seeky queues from imposing long latencies on well-
20295 ++ * behaved ones (this also implies that seeky queues cannot
20296 ++ * receive guarantees in the service domain; after a timeout
20297 ++ * they are charged for the whole allocated budget, to try
20298 ++ * to preserve a behavior reasonably fair among them, but
20299 ++ * without service-domain guarantees).
20300 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
20301 ++ * queue is multiplied
20302 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
20303 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
20304 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
20305 ++ * may be reactivated for a queue (in jiffies)
20306 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
20307 ++ * after which weight-raising may be
20308 ++ * reactivated for an already busy queue
20309 ++ * (in jiffies)
20310 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
20311 ++ * sectors per second
20312 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
20313 ++ * duration of the weight raising automatically
20314 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
20315 ++ *
20316 ++ * All the fields are protected by the @queue lock.
20317 ++ */
20318 ++struct bfq_data {
20319 ++ struct request_queue *queue;
20320 ++
20321 ++ struct bfq_group *root_group;
20322 ++
20323 ++ struct rb_root rq_pos_tree;
20324 ++
20325 ++ int busy_queues;
20326 ++ int raised_busy_queues;
20327 ++ int queued;
20328 ++ int rq_in_driver;
20329 ++ int sync_flight;
20330 ++
20331 ++ int max_rq_in_driver;
20332 ++ int hw_tag_samples;
20333 ++ int hw_tag;
20334 ++
20335 ++ int budgets_assigned;
20336 ++
20337 ++ struct timer_list idle_slice_timer;
20338 ++ struct work_struct unplug_work;
20339 ++
20340 ++ struct bfq_queue *in_service_queue;
20341 ++ struct bfq_io_cq *in_service_bic;
20342 ++
20343 ++ sector_t last_position;
20344 ++
20345 ++ ktime_t last_budget_start;
20346 ++ ktime_t last_idling_start;
20347 ++ int peak_rate_samples;
20348 ++ u64 peak_rate;
20349 ++ unsigned long bfq_max_budget;
20350 ++
20351 ++ struct hlist_head group_list;
20352 ++ struct list_head active_list;
20353 ++ struct list_head idle_list;
20354 ++
20355 ++ unsigned int bfq_quantum;
20356 ++ unsigned int bfq_fifo_expire[2];
20357 ++ unsigned int bfq_back_penalty;
20358 ++ unsigned int bfq_back_max;
20359 ++ unsigned int bfq_slice_idle;
20360 ++ u64 bfq_class_idle_last_service;
20361 ++
20362 ++ unsigned int bfq_user_max_budget;
20363 ++ unsigned int bfq_max_budget_async_rq;
20364 ++ unsigned int bfq_timeout[2];
20365 ++
20366 ++ bool low_latency;
20367 ++
20368 ++ /* parameters of the low_latency heuristics */
20369 ++ unsigned int bfq_raising_coeff;
20370 ++ unsigned int bfq_raising_max_time;
20371 ++ unsigned int bfq_raising_rt_max_time;
20372 ++ unsigned int bfq_raising_min_idle_time;
20373 ++ unsigned long bfq_raising_min_inter_arr_async;
20374 ++ unsigned int bfq_raising_max_softrt_rate;
20375 ++ u64 RT_prod;
20376 ++
20377 ++ struct bfq_queue oom_bfqq;
20378 ++};
20379 ++
20380 ++enum bfqq_state_flags {
20381 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
20382 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
20383 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
20384 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
20385 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
20386 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
20387 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
20388 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
20389 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
20390 ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
20391 ++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
20392 ++};
20393 ++
20394 ++#define BFQ_BFQQ_FNS(name) \
20395 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
20396 ++{ \
20397 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
20398 ++} \
20399 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
20400 ++{ \
20401 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
20402 ++} \
20403 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
20404 ++{ \
20405 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
20406 ++}
20407 ++
20408 ++BFQ_BFQQ_FNS(busy);
20409 ++BFQ_BFQQ_FNS(wait_request);
20410 ++BFQ_BFQQ_FNS(must_alloc);
20411 ++BFQ_BFQQ_FNS(fifo_expire);
20412 ++BFQ_BFQQ_FNS(idle_window);
20413 ++BFQ_BFQQ_FNS(prio_changed);
20414 ++BFQ_BFQQ_FNS(sync);
20415 ++BFQ_BFQQ_FNS(budget_new);
20416 ++BFQ_BFQQ_FNS(coop);
20417 ++BFQ_BFQQ_FNS(split_coop);
20418 ++BFQ_BFQQ_FNS(softrt_update);
20419 ++#undef BFQ_BFQQ_FNS
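As a quick reference, BFQ_BFQQ_FNS(busy) above should expand to roughly the three inline accessors below; the remaining flags get the same trio under their own names:

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}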
20420 ++
20421 ++/* Logging facilities. */
20422 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
20423 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
20424 ++
20425 ++#define bfq_log(bfqd, fmt, args...) \
20426 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
20427 ++
20428 ++/* Expiration reasons. */
20429 ++enum bfqq_expiration {
20430 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
20431 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
20432 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
20433 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
20434 ++};
20435 ++
20436 ++#ifdef CONFIG_CGROUP_BFQIO
20437 ++/**
20438 ++ * struct bfq_group - per (device, cgroup) data structure.
20439 ++ * @entity: schedulable entity to insert into the parent group sched_data.
20440 ++ * @sched_data: own sched_data, to contain child entities (they may be
20441 ++ * both bfq_queues and bfq_groups).
20442 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
20443 ++ * list of the containing cgroup's bfqio_cgroup.
20444 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
20445 ++ * of the groups active on the same device; used for cleanup.
20446 ++ * @bfqd: the bfq_data for the device this group acts upon.
20447 ++ * @async_bfqq: array of async queues for all the tasks belonging to
20448 ++ * the group, one queue per ioprio value per ioprio_class,
20449 ++ * except for the idle class that has only one queue.
20450 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
20451 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
20452 ++ * to avoid too many special cases during group creation/migration.
20453 ++ *
20454 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
20455 ++ * there is a set of bfq_groups, each one collecting the lower-level
20456 ++ * entities belonging to the group that are acting on the same device.
20457 ++ *
20458 ++ * Locking works as follows:
20459 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
20460 ++ * via RCU from its readers.
20461 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
20462 ++ * from the readers.
20463 ++ * o All the other fields are protected by the @bfqd queue lock.
20464 ++ */
20465 ++struct bfq_group {
20466 ++ struct bfq_entity entity;
20467 ++ struct bfq_sched_data sched_data;
20468 ++
20469 ++ struct hlist_node group_node;
20470 ++ struct hlist_node bfqd_node;
20471 ++
20472 ++ void *bfqd;
20473 ++
20474 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
20475 ++ struct bfq_queue *async_idle_bfqq;
20476 ++
20477 ++ struct bfq_entity *my_entity;
20478 ++};
20479 ++
20480 ++/**
20481 ++ * struct bfqio_cgroup - bfq cgroup data structure.
20482 ++ * @css: subsystem state for bfq in the containing cgroup.
20483 ++ * @online: flag marked when the subsystem is inserted.
20484 ++ * @weight: cgroup weight.
20485 ++ * @ioprio: cgroup ioprio.
20486 ++ * @ioprio_class: cgroup ioprio_class.
20487 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
20488 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
20489 ++ *
20490 ++ * @group_data is accessed using RCU, with @lock protecting the updates;
20491 ++ * @ioprio and @ioprio_class are protected by @lock.
20492 ++ */
20493 ++struct bfqio_cgroup {
20494 ++ struct cgroup_subsys_state css;
20495 ++ bool online;
20496 ++
20497 ++ unsigned short weight, ioprio, ioprio_class;
20498 ++
20499 ++ spinlock_t lock;
20500 ++ struct hlist_head group_data;
20501 ++};
20502 ++#else
20503 ++struct bfq_group {
20504 ++ struct bfq_sched_data sched_data;
20505 ++
20506 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
20507 ++ struct bfq_queue *async_idle_bfqq;
20508 ++};
20509 ++#endif
20510 ++
20511 ++static inline struct bfq_service_tree *
20512 ++bfq_entity_service_tree(struct bfq_entity *entity)
20513 ++{
20514 ++ struct bfq_sched_data *sched_data = entity->sched_data;
20515 ++ unsigned int idx = entity->ioprio_class - 1;
20516 ++
20517 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
20518 ++ BUG_ON(sched_data == NULL);
20519 ++
20520 ++ return sched_data->service_tree + idx;
20521 ++}
20522 ++
20523 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
20524 ++ int is_sync)
20525 ++{
20526 ++ return bic->bfqq[!!is_sync];
20527 ++}
20528 ++
20529 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
20530 ++ struct bfq_queue *bfqq, int is_sync)
20531 ++{
20532 ++ bic->bfqq[!!is_sync] = bfqq;
20533 ++}
20534 ++
20535 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
20536 ++{
20537 ++ return bic->icq.q->elevator->elevator_data;
20538 ++}
20539 ++
20540 ++/**
20541 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
20542 ++ * @ptr: a pointer to a bfqd.
20543 ++ * @flags: storage for the flags to be saved.
20544 ++ *
20545 ++ * This function allows bfqg->bfqd to be protected by the
20546 ++ * queue lock of the bfqd it references; the pointer is dereferenced
20547 ++ * under RCU, so the storage for bfqd is assured to be safe as long
20548 ++ * as the RCU read side critical section does not end. After the
20549 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
20550 ++ * sure that no other writer accessed it. If we raced with a writer,
20551 ++ * the function returns NULL, with the queue unlocked, otherwise it
20552 ++ * returns the dereferenced pointer, with the queue locked.
20553 ++ */
20554 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
20555 ++ unsigned long *flags)
20556 ++{
20557 ++ struct bfq_data *bfqd;
20558 ++
20559 ++ rcu_read_lock();
20560 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
20561 ++
20562 ++ if (bfqd != NULL) {
20563 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
20564 ++ if (*ptr == bfqd)
20565 ++ goto out;
20566 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
20567 ++ }
20568 ++
20569 ++ bfqd = NULL;
20570 ++out:
20571 ++ rcu_read_unlock();
20572 ++ return bfqd;
20573 ++}
20574 ++
20575 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
20576 ++ unsigned long *flags)
20577 ++{
20578 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
20579 ++}
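To make the locking contract above concrete, a hypothetical caller (not part of this patch; the function name and the low_latency tweak are purely illustrative) might use the pair like this:

static void example_clear_low_latency(struct bfq_group *bfqg)
{
	unsigned long flags;
	struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

	if (bfqd == NULL)
		return;	/* raced with a writer: nothing is locked */

	/* queue_lock is held here, so bfqd cannot go away */
	bfqd->low_latency = false;
	bfq_put_bfqd_unlock(bfqd, &flags);
}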
20580 ++
20581 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
20582 ++static void bfq_put_queue(struct bfq_queue *bfqq);
20583 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
20584 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
20585 ++ struct bfq_group *bfqg, int is_sync,
20586 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
20587 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
20588 ++ struct bfq_group *bfqg);
20589 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
20590 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
20591 ++#endif
20592 +--
20593 +1.8.5.2
20594 +
20595
20596 Added: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
20597 ===================================================================
20598 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch (rev 0)
20599 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666)
20600 @@ -0,0 +1,1034 @@
20601 +From 39b1dba58b2562ba0d93a33a4f9af662d3c790c5 Mon Sep 17 00:00:00 2001
20602 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
20603 +Date: Thu, 23 Jan 2014 16:54:44 +0100
20604 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r1 for
20605 + 3.13.0
20606 +
20607 +A set of processes may happen to perform interleaved reads, i.e., requests
20608 +whose union would give rise to a sequential read pattern. There are two
20609 +typical cases: in the first case, processes read fixed-size chunks of
20610 +data at a fixed distance from each other, while in the second case processes
20611 +may read variable-size chunks at variable distances. The latter case occurs
20612 +for example with KVM, which splits the I/O generated by the guest into
20613 +multiple chunks, and lets these chunks be served by a pool of cooperating
20614 +processes, iteratively assigning the next chunk of I/O to the first
20615 +available process. CFQ uses actual queue merging for the first type of
20616 +processes, whereas it uses preemption to get a sequential read pattern out
20617 +of the read requests performed by the second type of processes. In the end
20618 +it uses two different mechanisms to achieve the same goal: boosting the
20619 +throughput with interleaved I/O.
20620 +
20621 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
20622 +sequential read pattern with both types of processes. The main idea is
20623 +checking newly arrived requests against the next request of the active queue
20624 +both in case of actual request insert and in case of request merge. By doing
20625 +so, both the types of processes can be handled by just merging their queues.
20626 +EQM is then simpler and more compact than the pair of mechanisms used in
20627 +CFQ.
20628 +
20629 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
20630 +properly restoring the weight-raising state of a queue when it gets back to
20631 +a non-merged state.
20632 +
20633 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
20634 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
20635 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
20636 +---
20637 + block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
20638 + block/bfq-sched.c | 28 ---
20639 + block/bfq.h | 16 ++
20640 + 3 files changed, 474 insertions(+), 227 deletions(-)
20641 +
20642 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
20643 +index eb760de..06ee844 100644
20644 +--- a/block/bfq-iosched.c
20645 ++++ b/block/bfq-iosched.c
20646 +@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
20647 + return dur;
20648 + }
20649 +
20650 ++static inline void
20651 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
20652 ++{
20653 ++ if (bic->saved_idle_window)
20654 ++ bfq_mark_bfqq_idle_window(bfqq);
20655 ++ else
20656 ++ bfq_clear_bfqq_idle_window(bfqq);
20657 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
20658 ++ /*
20659 ++ * Start a weight raising period with the duration given by
20660 ++ * the raising_time_left snapshot.
20661 ++ */
20662 ++ if (bfq_bfqq_busy(bfqq))
20663 ++ bfqq->bfqd->raised_busy_queues++;
20664 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
20665 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
20666 ++ bfqq->last_rais_start_finish = jiffies;
20667 ++ bfqq->entity.ioprio_changed = 1;
20668 ++ }
20669 ++ /*
20670 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
20671 ++ * getting confused about the queue's need of a weight-raising
20672 ++ * period.
20673 ++ */
20674 ++ bic->raising_time_left = 0;
20675 ++}
20676 ++
20677 ++/*
20678 ++ * Must be called with the queue_lock held.
20679 ++ */
20680 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
20681 ++{
20682 ++ int process_refs, io_refs;
20683 ++
20684 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
20685 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
20686 ++ BUG_ON(process_refs < 0);
20687 ++ return process_refs;
20688 ++}
20689 ++
20690 + static void bfq_add_rq_rb(struct request *rq)
20691 + {
20692 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
20693 +@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
20694 + if (!bfqd->low_latency)
20695 + goto add_bfqq_busy;
20696 +
20697 ++ if (bfq_bfqq_just_split(bfqq))
20698 ++ goto set_ioprio_changed;
20699 ++
20700 + /*
20701 +- * If the queue is not being boosted and has been idle
20702 +- * for enough time, start a weight-raising period
20703 ++ * If the queue:
20704 ++ * - is not being boosted,
20705 ++ * - has been idle for enough time,
20706 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
20707 ++ * shared "by its nature" or it is not shared and its
20708 ++ * requests have not been redirected to a shared queue)
20709 ++ * start a weight-raising period.
20710 + */
20711 +- if (old_raising_coeff == 1 &&
20712 +- (idle_for_long_time || soft_rt)) {
20713 ++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
20714 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
20715 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
20716 + if (idle_for_long_time)
20717 + bfqq->raising_cur_max_time =
20718 +@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq)
20719 + bfqd->bfq_raising_rt_max_time;
20720 + }
20721 + }
20722 ++set_ioprio_changed:
20723 + if (old_raising_coeff != bfqq->raising_coeff)
20724 + entity->ioprio_changed = 1;
20725 + add_bfqq_busy:
20726 +@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
20727 + spin_unlock_irq(bfqd->queue->queue_lock);
20728 + }
20729 +
20730 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
20731 +- struct bio *bio)
20732 +-{
20733 +- struct bfq_data *bfqd = q->elevator->elevator_data;
20734 +- struct bfq_io_cq *bic;
20735 +- struct bfq_queue *bfqq;
20736 +-
20737 +- /*
20738 +- * Disallow merge of a sync bio into an async request.
20739 +- */
20740 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
20741 +- return 0;
20742 +-
20743 +- /*
20744 +- * Lookup the bfqq that this bio will be queued with. Allow
20745 +- * merge only if rq is queued there.
20746 +- * Queue lock is held here.
20747 +- */
20748 +- bic = bfq_bic_lookup(bfqd, current->io_context);
20749 +- if (bic == NULL)
20750 +- return 0;
20751 +-
20752 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
20753 +- return bfqq == RQ_BFQQ(rq);
20754 +-}
20755 +-
20756 +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
20757 +- struct bfq_queue *bfqq)
20758 +-{
20759 +- if (bfqq != NULL) {
20760 +- bfq_mark_bfqq_must_alloc(bfqq);
20761 +- bfq_mark_bfqq_budget_new(bfqq);
20762 +- bfq_clear_bfqq_fifo_expire(bfqq);
20763 +-
20764 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
20765 +-
20766 +- bfq_log_bfqq(bfqd, bfqq,
20767 +- "set_in_service_queue, cur-budget = %lu",
20768 +- bfqq->entity.budget);
20769 +- }
20770 +-
20771 +- bfqd->in_service_queue = bfqq;
20772 +-}
20773 +-
20774 +-/*
20775 +- * Get and set a new queue for service.
20776 +- */
20777 +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
20778 +- struct bfq_queue *bfqq)
20779 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
20780 + {
20781 +- if (!bfqq)
20782 +- bfqq = bfq_get_next_queue(bfqd);
20783 ++ if (request)
20784 ++ return blk_rq_pos(io_struct);
20785 + else
20786 +- bfq_get_next_queue_forced(bfqd, bfqq);
20787 +-
20788 +- __bfq_set_in_service_queue(bfqd, bfqq);
20789 +- return bfqq;
20790 ++ return ((struct bio *)io_struct)->bi_sector;
20791 + }
20792 +
20793 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
20794 +- struct request *rq)
20795 ++static inline sector_t bfq_dist_from(sector_t pos1,
20796 ++ sector_t pos2)
20797 + {
20798 +- if (blk_rq_pos(rq) >= bfqd->last_position)
20799 +- return blk_rq_pos(rq) - bfqd->last_position;
20800 ++ if (pos1 >= pos2)
20801 ++ return pos1 - pos2;
20802 + else
20803 +- return bfqd->last_position - blk_rq_pos(rq);
20804 ++ return pos2 - pos1;
20805 + }
20806 +
20807 +-/*
20808 +- * Return true if bfqq has no request pending and rq is close enough to
20809 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
20810 +- * bfqq->next_rq
20811 +- */
20812 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
20813 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
20814 ++ sector_t sector)
20815 + {
20816 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
20817 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
20818 ++ BFQQ_SEEK_THR;
20819 + }
20820 +
20821 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
20822 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
20823 + {
20824 + struct rb_root *root = &bfqd->rq_pos_tree;
20825 + struct rb_node *parent, *node;
20826 + struct bfq_queue *__bfqq;
20827 +- sector_t sector = bfqd->last_position;
20828 +
20829 + if (RB_EMPTY_ROOT(root))
20830 + return NULL;
20831 +@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
20832 + * position).
20833 + */
20834 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
20835 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
20836 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
20837 + return __bfqq;
20838 +
20839 + if (blk_rq_pos(__bfqq->next_rq) < sector)
20840 +@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
20841 + return NULL;
20842 +
20843 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
20844 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
20845 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
20846 + return __bfqq;
20847 +
20848 + return NULL;
20849 +@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
20850 + /*
20851 + * bfqd - obvious
20852 + * cur_bfqq - passed in so that we don't decide that the current queue
20853 +- * is closely cooperating with itself.
20854 +- *
20855 +- * We are assuming that cur_bfqq has dispatched at least one request,
20856 +- * and that bfqd->last_position reflects a position on the disk associated
20857 +- * with the I/O issued by cur_bfqq.
20858 ++ * is closely cooperating with itself
20859 ++ * sector - used as a reference point to search for a close queue
20860 + */
20861 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
20862 +- struct bfq_queue *cur_bfqq)
20863 ++ struct bfq_queue *cur_bfqq,
20864 ++ sector_t sector)
20865 + {
20866 + struct bfq_queue *bfqq;
20867 +
20868 +@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
20869 + * working closely on the same area of the disk. In that case,
20870 + * we can group them together and don't waste time idling.
20871 + */
20872 +- bfqq = bfqq_close(bfqd);
20873 ++ bfqq = bfqq_close(bfqd, sector);
20874 + if (bfqq == NULL || bfqq == cur_bfqq)
20875 + return NULL;
20876 +
20877 +@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
20878 + return bfqq;
20879 + }
20880 +
20881 ++static struct bfq_queue *
20882 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
20883 ++{
20884 ++ int process_refs, new_process_refs;
20885 ++ struct bfq_queue *__bfqq;
20886 ++
20887 ++ /*
20888 ++ * If there are no process references on the new_bfqq, then it is
20889 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
20890 ++ * may have dropped their last reference (not just their last process
20891 ++ * reference).
20892 ++ */
20893 ++ if (!bfqq_process_refs(new_bfqq))
20894 ++ return NULL;
20895 ++
20896 ++ /* Avoid a circular list and skip interim queue merges. */
20897 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
20898 ++ if (__bfqq == bfqq)
20899 ++ return NULL;
20900 ++ new_bfqq = __bfqq;
20901 ++ }
20902 ++
20903 ++ process_refs = bfqq_process_refs(bfqq);
20904 ++ new_process_refs = bfqq_process_refs(new_bfqq);
20905 ++ /*
20906 ++ * If the process for the bfqq has gone away, there is no
20907 ++ * sense in merging the queues.
20908 ++ */
20909 ++ if (process_refs == 0 || new_process_refs == 0)
20910 ++ return NULL;
20911 ++
20912 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
20913 ++ new_bfqq->pid);
20914 ++
20915 ++ /*
20916 ++ * Merging is just a redirection: the requests of the process owning
20917 ++ * one of the two queues are redirected to the other queue. The latter
20918 ++ * queue, in its turn, is set as shared if this is the first time that
20919 ++ * the requests of some process are redirected to it.
20920 ++ *
20921 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
20922 ++ * are in the context of the process owning bfqq, hence we have the
20923 ++ * io_cq of this process. So we can immediately configure this io_cq
20924 ++ * to redirect the requests of the process to new_bfqq.
20925 ++ *
20926 ++ * NOTE, even if new_bfqq coincides with the in-service queue, the
20927 ++ * io_cq of new_bfqq is not available, because, if the in-service queue
20928 ++ * is shared, bfqd->in_service_bic may not point to the io_cq of the
20929 ++ * in-service queue.
20930 ++ * Redirecting the requests of the process owning bfqq to the currently
20931 ++ * in-service queue is in any case the best option, as we feed the
20932 ++ * in-service queue with new requests close to the last request served
20933 ++ * and, by doing so, hopefully increase the throughput.
20934 ++ */
20935 ++ bfqq->new_bfqq = new_bfqq;
20936 ++ atomic_add(process_refs, &new_bfqq->ref);
20937 ++ return new_bfqq;
20938 ++}
20939 ++
20940 ++/*
20941 ++ * Attempt to schedule a merge of bfqq with the currently in-service queue or
20942 ++ * with a close queue among the scheduled queues.
20943 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
20944 ++ * structure otherwise.
20945 ++ */
20946 ++static struct bfq_queue *
20947 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
20948 ++ void *io_struct, bool request)
20949 ++{
20950 ++ struct bfq_queue *in_service_bfqq, *new_bfqq;
20951 ++
20952 ++ if (bfqq->new_bfqq)
20953 ++ return bfqq->new_bfqq;
20954 ++
20955 ++ if (!io_struct)
20956 ++ return NULL;
20957 ++
20958 ++ in_service_bfqq = bfqd->in_service_queue;
20959 ++
20960 ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
20961 ++ !bfqd->in_service_bic)
20962 ++ goto check_scheduled;
20963 ++
20964 ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
20965 ++ goto check_scheduled;
20966 ++
20967 ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
20968 ++ goto check_scheduled;
20969 ++
20970 ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
20971 ++ goto check_scheduled;
20972 ++
20973 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
20974 ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
20975 ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
20976 ++ if (new_bfqq != NULL)
20977 ++ return new_bfqq; /* Merge with the in-service queue */
20978 ++ }
20979 ++
20980 ++ /*
20981 ++ * Check whether there is a cooperator among currently scheduled
20982 ++ * queues. The only thing we need is that the bio/request is not
20983 ++ * NULL, as we need it to establish whether a cooperator exists.
20984 ++ */
20985 ++check_scheduled:
20986 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
20987 ++ bfq_io_struct_pos(io_struct, request));
20988 ++ if (new_bfqq)
20989 ++ return bfq_setup_merge(bfqq, new_bfqq);
20990 ++
20991 ++ return NULL;
20992 ++}
20993 ++
20994 ++static inline void
20995 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
20996 ++{
20997 ++ /*
20998 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
20999 ++ * have already been redirected to a shared queue; both idle window
21000 ++ * and weight raising state have already been saved. Do nothing.
21001 ++ */
21002 ++ if (bfqq->bic == NULL)
21003 ++ return;
21004 ++ if (bfqq->bic->raising_time_left)
21005 ++ /*
21006 ++ * This is the queue of a just-started process, and would
21007 ++ * deserve weight raising: we set raising_time_left to the full
21008 ++ * weight-raising duration to trigger weight-raising when and
21009 ++ * if the queue is split and the first request of the queue
21010 ++ * is enqueued.
21011 ++ */
21012 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
21013 ++ else if (bfqq->raising_coeff > 1) {
21014 ++ unsigned long wrais_duration =
21015 ++ jiffies - bfqq->last_rais_start_finish;
21016 ++ /*
21017 ++ * It may happen that a queue's weight raising period lasts
21018 ++ * longer than its raising_cur_max_time, as weight raising is
21019 ++ * handled only when a request is enqueued or dispatched (it
21020 ++ * does not use any timer). If the weight raising period is
21021 ++ * about to end, don't save it.
21022 ++ */
21023 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
21024 ++ bfqq->bic->raising_time_left = 0;
21025 ++ else
21026 ++ bfqq->bic->raising_time_left =
21027 ++ bfqq->raising_cur_max_time - wrais_duration;
21028 ++ /*
21029 ++ * The bfq_queue is becoming shared or the requests of the
21030 ++ * process owning the queue are being redirected to a shared
21031 ++ * queue. Stop the weight raising period of the queue, as in
21032 ++ * both cases it should not be owned by an interactive or soft
21033 ++ * real-time application.
21034 ++ */
21035 ++ bfq_bfqq_end_raising(bfqq);
21036 ++ } else
21037 ++ bfqq->bic->raising_time_left = 0;
21038 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
21039 ++}
21040 ++
21041 ++static inline void
21042 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
21043 ++{
21044 ++ /*
21045 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
21046 ++ * is about to begin using a shared bfq_queue.
21047 ++ */
21048 ++ if (bfqq->bic)
21049 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
21050 ++}
21051 ++
21052 ++static void
21053 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
21054 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
21055 ++{
21056 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
21057 ++ (long unsigned)new_bfqq->pid);
21058 ++ /* Save weight raising and idle window of the merged queues */
21059 ++ bfq_bfqq_save_state(bfqq);
21060 ++ bfq_bfqq_save_state(new_bfqq);
21061 ++ /*
21062 ++ * Grab a reference to the bic, to prevent it from being destroyed
21063 ++ * before being possibly touched by a bfq_split_bfqq().
21064 ++ */
21065 ++ bfq_get_bic_reference(bfqq);
21066 ++ bfq_get_bic_reference(new_bfqq);
21067 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
21068 ++ bic_set_bfqq(bic, new_bfqq, 1);
21069 ++ bfq_mark_bfqq_coop(new_bfqq);
21070 ++ /*
21071 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
21072 ++ * new_bfqq->bic to NULL. bfqq either:
21073 ++ * - does not belong to any bic any more, and hence bfqq->bic must
21074 ++ * be set to NULL, or
21075 ++ * - is a queue whose owning bics have already been redirected to a
21076 ++ * different queue, hence the queue is destined to not belong to any
21077 ++ * bic soon and bfqq->bic is already NULL (therefore the next
21078 ++ * assignment causes no harm).
21079 ++ */
21080 ++ new_bfqq->bic = NULL;
21081 ++ bfqq->bic = NULL;
21082 ++ bfq_put_queue(bfqq);
21083 ++}
21084 ++
21085 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
21086 ++ struct bio *bio)
21087 ++{
21088 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
21089 ++ struct bfq_io_cq *bic;
21090 ++ struct bfq_queue *bfqq, *new_bfqq;
21091 ++
21092 ++ /*
21093 ++ * Disallow merge of a sync bio into an async request.
21094 ++ */
21095 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
21096 ++ return 0;
21097 ++
21098 ++ /*
21099 ++ * Lookup the bfqq that this bio will be queued with. Allow
21100 ++ * merge only if rq is queued there.
21101 ++ * Queue lock is held here.
21102 ++ */
21103 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
21104 ++ if (bic == NULL)
21105 ++ return 0;
21106 ++
21107 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
21108 ++ /*
21109 ++ * We take advantage of this function to perform an early merge
21110 ++ * of the queues of possible cooperating processes.
21111 ++ */
21112 ++ if (bfqq != NULL) {
21113 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
21114 ++ if (new_bfqq != NULL) {
21115 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
21116 ++ /*
21117 ++ * If we get here, the bio will be queued in the shared queue,
21118 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
21119 ++ * rq can be merged.
21120 ++ */
21121 ++ bfqq = new_bfqq;
21122 ++ }
21123 ++ }
21124 ++
21125 ++ return bfqq == RQ_BFQQ(rq);
21126 ++}
21127 ++
21128 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
21129 ++ struct bfq_queue *bfqq)
21130 ++{
21131 ++ if (bfqq != NULL) {
21132 ++ bfq_mark_bfqq_must_alloc(bfqq);
21133 ++ bfq_mark_bfqq_budget_new(bfqq);
21134 ++ bfq_clear_bfqq_fifo_expire(bfqq);
21135 ++
21136 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
21137 ++
21138 ++ bfq_log_bfqq(bfqd, bfqq,
21139 ++ "set_in_service_queue, cur-budget = %lu",
21140 ++ bfqq->entity.budget);
21141 ++ }
21142 ++
21143 ++ bfqd->in_service_queue = bfqq;
21144 ++}
21145 ++
21146 ++/*
21147 ++ * Get and set a new queue for service.
21148 ++ */
21149 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
21150 ++{
21151 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
21152 ++
21153 ++ __bfq_set_in_service_queue(bfqd, bfqq);
21154 ++ return bfqq;
21155 ++}
21156 ++
21157 + /*
21158 + * If enough samples have been computed, return the current max budget
21159 + * stored in bfqd, which is dynamically updated according to the
21160 +@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
21161 + return rq;
21162 + }
21163 +
21164 +-/*
21165 +- * Must be called with the queue_lock held.
21166 +- */
21167 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
21168 +-{
21169 +- int process_refs, io_refs;
21170 +-
21171 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
21172 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
21173 +- BUG_ON(process_refs < 0);
21174 +- return process_refs;
21175 +-}
21176 +-
21177 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
21178 +-{
21179 +- int process_refs, new_process_refs;
21180 +- struct bfq_queue *__bfqq;
21181 +-
21182 +- /*
21183 +- * If there are no process references on the new_bfqq, then it is
21184 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
21185 +- * may have dropped their last reference (not just their last process
21186 +- * reference).
21187 +- */
21188 +- if (!bfqq_process_refs(new_bfqq))
21189 +- return;
21190 +-
21191 +- /* Avoid a circular list and skip interim queue merges. */
21192 +- while ((__bfqq = new_bfqq->new_bfqq)) {
21193 +- if (__bfqq == bfqq)
21194 +- return;
21195 +- new_bfqq = __bfqq;
21196 +- }
21197 +-
21198 +- process_refs = bfqq_process_refs(bfqq);
21199 +- new_process_refs = bfqq_process_refs(new_bfqq);
21200 +- /*
21201 +- * If the process for the bfqq has gone away, there is no
21202 +- * sense in merging the queues.
21203 +- */
21204 +- if (process_refs == 0 || new_process_refs == 0)
21205 +- return;
21206 +-
21207 +- /*
21208 +- * Merge in the direction of the lesser amount of work.
21209 +- */
21210 +- if (new_process_refs >= process_refs) {
21211 +- bfqq->new_bfqq = new_bfqq;
21212 +- atomic_add(process_refs, &new_bfqq->ref);
21213 +- } else {
21214 +- new_bfqq->new_bfqq = bfqq;
21215 +- atomic_add(new_process_refs, &bfqq->ref);
21216 +- }
21217 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
21218 +- new_bfqq->pid);
21219 +-}
21220 +-
21221 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
21222 + {
21223 + struct bfq_entity *entity = &bfqq->entity;
21224 +@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
21225 + */
21226 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
21227 + {
21228 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
21229 ++ struct bfq_queue *bfqq;
21230 + struct request *next_rq;
21231 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
21232 +
21233 +@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
21234 +
21235 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
21236 +
21237 +- /*
21238 +- * If another queue has a request waiting within our mean seek
21239 +- * distance, let it run. The expire code will check for close
21240 +- * cooperators and put the close queue at the front of the
21241 +- * service tree. If possible, merge the expiring queue with the
21242 +- * new bfqq.
21243 +- */
21244 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
21245 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
21246 +- bfq_setup_merge(bfqq, new_bfqq);
21247 +-
21248 + if (bfq_may_expire_for_budg_timeout(bfqq) &&
21249 + !timer_pending(&bfqd->idle_slice_timer) &&
21250 + !bfq_bfqq_must_idle(bfqq))
21251 +@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
21252 + bfq_clear_bfqq_wait_request(bfqq);
21253 + del_timer(&bfqd->idle_slice_timer);
21254 + }
21255 +- if (new_bfqq == NULL)
21256 +- goto keep_queue;
21257 +- else
21258 +- goto expire;
21259 ++ goto keep_queue;
21260 + }
21261 + }
21262 +
21263 + /*
21264 +- * No requests pending. If the in-service queue has no cooperator and
21265 +- * still has requests in flight (possibly waiting for a completion)
21266 +- * or is idling for a new request, then keep it.
21267 ++ * No requests pending. If the in-service queue still has requests in
21268 ++ * flight (possibly waiting for a completion) or is idling for a new
21269 ++ * request, then keep it.
21270 + */
21271 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
21272 +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
21273 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
21274 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
21275 + bfqq = NULL;
21276 + goto keep_queue;
21277 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
21278 +- /*
21279 +- * Expiring the queue because there is a close cooperator,
21280 +- * cancel timer.
21281 +- */
21282 +- bfq_clear_bfqq_wait_request(bfqq);
21283 +- del_timer(&bfqd->idle_slice_timer);
21284 + }
21285 +
21286 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
21287 + expire:
21288 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
21289 + new_queue:
21290 +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
21291 ++ bfqq = bfq_set_in_service_queue(bfqd);
21292 + bfq_log(bfqd, "select_queue: new queue %d returned",
21293 + bfqq != NULL ? bfqq->pid : 0);
21294 + keep_queue:
21295 +@@ -1825,9 +2015,8 @@ keep_queue:
21296 + static void bfq_update_raising_data(struct bfq_data *bfqd,
21297 + struct bfq_queue *bfqq)
21298 + {
21299 ++ struct bfq_entity *entity = &bfqq->entity;
21300 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
21301 +- struct bfq_entity *entity = &bfqq->entity;
21302 +-
21303 + bfq_log_bfqq(bfqd, bfqq,
21304 + "raising period dur %u/%u msec, "
21305 + "old raising coeff %u, w %d(%d)",
21306 +@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
21307 + "WARN: pending prio change");
21308 + /*
21309 + * If too much time has elapsed from the beginning
21310 +- * of this weight-raising, stop it.
21311 ++ * of this weight-raising period, stop it.
21312 + */
21313 + if (time_is_before_jiffies(bfqq->last_rais_start_finish +
21314 + bfqq->raising_cur_max_time)) {
21315 +@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
21316 + jiffies_to_msecs(bfqq->
21317 + raising_cur_max_time));
21318 + bfq_bfqq_end_raising(bfqq);
21319 +- __bfq_entity_update_weight_prio(
21320 +- bfq_entity_service_tree(entity),
21321 +- entity);
21322 + }
21323 + }
21324 ++ /* Update weight both if it must be raised and if it must be lowered */
21325 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
21326 ++ __bfq_entity_update_weight_prio(
21327 ++ bfq_entity_service_tree(entity),
21328 ++ entity);
21329 + }
21330 +
21331 + /*
21332 +@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq)
21333 + struct bfq_io_cq *bic = icq_to_bic(icq);
21334 +
21335 + bic->ttime.last_end_request = jiffies;
21336 ++ /*
21337 ++ * A newly created bic indicates that the process has just
21338 ++ * started doing I/O, and is probably mapping into memory its
21339 ++ * executable and libraries: it definitely needs weight raising.
21340 ++ * There is however the possibility that the process performs,
21341 ++ * for a while, I/O close to some other process. EQM intercepts
21342 ++ * this behavior and may merge the queue corresponding to the
21343 ++ * process with some other queue, BEFORE the weight of the queue
21344 ++ * is raised. Merged queues are not weight-raised (they are assumed
21345 ++ * to belong to processes that benefit only from high throughput).
21346 ++ * If the merge is basically the consequence of an accident, then
21347 ++ * the queue will be split soon and will get back its old weight.
21348 ++ * It is then important to write down somewhere that this queue
21349 ++ * does need weight raising, even if it did not make it to get its
21350 ++ * weight raised before being merged. To this purpose, we overload
21351 ++ * the field raising_time_left and assign 1 to it, to mark the queue
21352 ++ * as needing weight raising.
21353 ++ */
21354 ++ bic->raising_time_left = 1;
21355 + }
21356 +
21357 + static void bfq_exit_icq(struct io_cq *icq)
21358 +@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq)
21359 + }
21360 +
21361 + if (bic->bfqq[BLK_RW_SYNC]) {
21362 ++ /*
21363 ++ * If the bic is using a shared queue, put the reference
21364 ++ * taken on the io_context when the bic started using a
21365 ++ * shared bfq_queue.
21366 ++ */
21367 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
21368 ++ put_io_context(icq->ioc);
21369 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
21370 + bic->bfqq[BLK_RW_SYNC] = NULL;
21371 + }
21372 +@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
21373 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
21374 + return;
21375 +
21376 ++ /* Idle window just restored, statistics are meaningless. */
21377 ++ if (bfq_bfqq_just_split(bfqq))
21378 ++ return;
21379 ++
21380 + enable_idle = bfq_bfqq_idle_window(bfqq);
21381 +
21382 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
21383 +@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
21384 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
21385 + !BFQQ_SEEKY(bfqq))
21386 + bfq_update_idle_window(bfqd, bfqq, bic);
21387 ++ bfq_clear_bfqq_just_split(bfqq);
21388 +
21389 + bfq_log_bfqq(bfqd, bfqq,
21390 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
21391 +@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
21392 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
21393 + {
21394 + struct bfq_data *bfqd = q->elevator->elevator_data;
21395 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
21396 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
21397 +
21398 + assert_spin_locked(bfqd->queue->queue_lock);
21399 ++
21400 ++ /*
21401 ++ * An unplug may trigger a requeue of a request from the device
21402 ++ * driver: make sure we are in process context while trying to
21403 ++ * merge two bfq_queues.
21404 ++ */
21405 ++ if (!in_interrupt()) {
21406 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
21407 ++ if (new_bfqq != NULL) {
21408 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
21409 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
21410 ++ /*
21411 ++ * Release the request's reference to the old bfqq
21412 ++ * and make sure one is taken to the shared queue.
21413 ++ */
21414 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
21415 ++ bfqq->allocated[rq_data_dir(rq)]--;
21416 ++ atomic_inc(&new_bfqq->ref);
21417 ++ bfq_put_queue(bfqq);
21418 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
21419 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
21420 ++ bfqq, new_bfqq);
21421 ++ rq->elv.priv[1] = new_bfqq;
21422 ++ bfqq = new_bfqq;
21423 ++ }
21424 ++ }
21425 ++
21426 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
21427 +
21428 + bfq_add_rq_rb(rq);
21429 +
21430 ++ /*
21431 ++ * Here a newly-created bfq_queue has already started a weight-raising
21432 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
21433 ++ * from assigning it a full weight-raising period. See the detailed
21434 ++ * comments about this field in bfq_init_icq().
21435 ++ */
21436 ++ if (bfqq->bic != NULL)
21437 ++ bfqq->bic->raising_time_left = 0;
21438 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
21439 + list_add_tail(&rq->queuelist, &bfqq->fifo);
21440 +
21441 +@@ -2659,18 +2916,6 @@ static void bfq_put_request(struct request *rq)
21442 + }
21443 + }
21444 +
21445 +-static struct bfq_queue *
21446 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
21447 +- struct bfq_queue *bfqq)
21448 +-{
21449 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
21450 +- (long unsigned)bfqq->new_bfqq->pid);
21451 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
21452 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
21453 +- bfq_put_queue(bfqq);
21454 +- return bic_to_bfqq(bic, 1);
21455 +-}
21456 +-
21457 + /*
21458 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
21459 + * was the last process referring to said bfqq.
21460 +@@ -2679,6 +2924,9 @@ static struct bfq_queue *
21461 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
21462 + {
21463 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
21464 ++
21465 ++ put_io_context(bic->icq.ioc);
21466 ++
21467 + if (bfqq_process_refs(bfqq) == 1) {
21468 + bfqq->pid = current->pid;
21469 + bfq_clear_bfqq_coop(bfqq);
21470 +@@ -2707,6 +2955,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
21471 + struct bfq_queue *bfqq;
21472 + struct bfq_group *bfqg;
21473 + unsigned long flags;
21474 ++ bool split = false;
21475 +
21476 + might_sleep_if(gfp_mask & __GFP_WAIT);
21477 +
21478 +@@ -2725,24 +2974,14 @@ new_queue:
21479 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
21480 + bic_set_bfqq(bic, bfqq, is_sync);
21481 + } else {
21482 +- /*
21483 +- * If the queue was seeky for too long, break it apart.
21484 +- */
21485 ++ /* If the queue was seeky for too long, break it apart. */
21486 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
21487 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
21488 + bfqq = bfq_split_bfqq(bic, bfqq);
21489 ++ split = true;
21490 + if (!bfqq)
21491 + goto new_queue;
21492 + }
21493 +-
21494 +- /*
21495 +- * Check to see if this queue is scheduled to merge with
21496 +- * another closely cooperating queue. The merging of queues
21497 +- * happens here as it must be done in process context.
21498 +- * The reference on new_bfqq was taken in merge_bfqqs.
21499 +- */
21500 +- if (bfqq->new_bfqq != NULL)
21501 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
21502 + }
21503 +
21504 + bfqq->allocated[rw]++;
21505 +@@ -2753,6 +2992,26 @@ new_queue:
21506 + rq->elv.priv[0] = bic;
21507 + rq->elv.priv[1] = bfqq;
21508 +
21509 ++ /*
21510 ++ * If a bfq_queue has only one process reference, it is owned
21511 ++ * by only one bfq_io_cq: we can set the bic field of the
21512 ++ * bfq_queue to the address of that structure. Also, if the
21513 ++ * queue has just been split, mark a flag so that the
21514 ++ * information is available to the other scheduler hooks.
21515 ++ */
21516 ++ if (bfqq_process_refs(bfqq) == 1) {
21517 ++ bfqq->bic = bic;
21518 ++ if (split) {
21519 ++ bfq_mark_bfqq_just_split(bfqq);
21520 ++ /*
21521 ++ * If the queue has just been split from a shared queue,
21522 ++ * restore the idle window and the possible weight
21523 ++ * raising period.
21524 ++ */
21525 ++ bfq_bfqq_resume_state(bfqq, bic);
21526 ++ }
21527 ++ }
21528 ++
21529 + spin_unlock_irqrestore(q->queue_lock, flags);
21530 +
21531 + return 0;
21532 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
21533 +index 999b475..e54ea33 100644
21534 +--- a/block/bfq-sched.c
21535 ++++ b/block/bfq-sched.c
21536 +@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
21537 + return bfqq;
21538 + }
21539 +
21540 +-/*
21541 +- * Forced extraction of the given queue.
21542 +- */
21543 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
21544 +- struct bfq_queue *bfqq)
21545 +-{
21546 +- struct bfq_entity *entity;
21547 +- struct bfq_sched_data *sd;
21548 +-
21549 +- BUG_ON(bfqd->in_service_queue != NULL);
21550 +-
21551 +- entity = &bfqq->entity;
21552 +- /*
21553 +- * Bubble up extraction/update from the leaf to the root.
21554 +- */
21555 +- for_each_entity(entity) {
21556 +- sd = entity->sched_data;
21557 +- bfq_update_budget(entity);
21558 +- bfq_update_vtime(bfq_entity_service_tree(entity));
21559 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
21560 +- sd->active_entity = entity;
21561 +- sd->next_active = NULL;
21562 +- entity->service = 0;
21563 +- }
21564 +-
21565 +- return;
21566 +-}
21567 +-
21568 + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
21569 + {
21570 + if (bfqd->in_service_bic != NULL) {
21571 +diff --git a/block/bfq.h b/block/bfq.h
21572 +index f9b5881..0bfad40 100644
21573 +--- a/block/bfq.h
21574 ++++ b/block/bfq.h
21575 +@@ -192,6 +192,8 @@ struct bfq_group;
21576 + * idle to backlogged
21577 + * @service_from_backlogged: cumulative service received from the @bfq_queue
21578 + * since the last transition from idle to backlogged
21579 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
21580 ++ * queue is shared
21581 + *
21582 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
21583 + * or more (if it is an async one). @cgroup holds a reference to the
21584 +@@ -235,6 +237,7 @@ struct bfq_queue {
21585 + sector_t last_request_pos;
21586 +
21587 + pid_t pid;
21588 ++ struct bfq_io_cq *bic;
21589 +
21590 + /* weight-raising fields */
21591 + unsigned long raising_cur_max_time;
21592 +@@ -264,12 +267,23 @@ struct bfq_ttime {
21593 + * @icq: associated io_cq structure
21594 + * @bfqq: array of two process queues, the sync and the async
21595 + * @ttime: associated @bfq_ttime struct
21596 ++ * @raising_time_left: snapshot of the time left before weight raising ends
21597 ++ * for the sync queue associated to this process; this
21598 ++ * snapshot is taken to remember this value while the weight
21599 ++ * raising is suspended because the queue is merged with a
21600 ++ * shared queue, and is used to set @raising_cur_max_time
21601 ++ * when the queue is split from the shared queue and its
21602 ++ * weight is raised again
21603 ++ * @saved_idle_window: same purpose as the previous field for the idle window
21604 + */
21605 + struct bfq_io_cq {
21606 + struct io_cq icq; /* must be the first member */
21607 + struct bfq_queue *bfqq[2];
21608 + struct bfq_ttime ttime;
21609 + int ioprio;
21610 ++
21611 ++ unsigned int raising_time_left;
21612 ++ unsigned int saved_idle_window;
21613 + };
21614 +
21615 + /**
21616 +@@ -411,6 +425,7 @@ enum bfqq_state_flags {
21617 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
21618 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
21619 + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
21620 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
21621 + BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
21622 + };
21623 +
21624 +@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
21625 + BFQ_BFQQ_FNS(budget_new);
21626 + BFQ_BFQQ_FNS(coop);
21627 + BFQ_BFQQ_FNS(split_coop);
21628 ++BFQ_BFQQ_FNS(just_split);
21629 + BFQ_BFQQ_FNS(softrt_update);
21630 + #undef BFQ_BFQQ_FNS
21631 +
21632 +--
21633 +1.8.5.2
21634 +