Gentoo Archives: gentoo-commits

From: "Mike Pagano (mpagano)" <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2716 - genpatches-2.6/trunk/3.14
Date: Mon, 31 Mar 2014 12:03:23
Message-Id: 20140331120315.2817A2005C@flycatcher.gentoo.org
1 Author: mpagano
2 Date: 2014-03-31 12:03:14 +0000 (Mon, 31 Mar 2014)
3 New Revision: 2716
4
5 Removed:
6 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
7 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
8 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
9 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
10 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
11 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
12 genpatches-2.6/trunk/3.14/5000_enable-additional-cpu-optimizations-for-gcc.patch
13 Modified:
14 genpatches-2.6/trunk/3.14/0000_README
15 Log:
16 Temporary removal of BFQ patches until compatible patches are committed
17
18 Modified: genpatches-2.6/trunk/3.14/0000_README
19 ===================================================================
20 --- genpatches-2.6/trunk/3.14/0000_README 2014-03-26 23:50:52 UTC (rev 2715)
21 +++ genpatches-2.6/trunk/3.14/0000_README 2014-03-31 12:03:14 UTC (rev 2716)
22 @@ -77,19 +77,3 @@
23 Patch: 4567_distro-Gentoo-Kconfig.patch
24 From: Tom Wijsman <TomWij@g.o>
25 Desc: Add Gentoo Linux support config settings and defaults.
26 -
27 -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r1-3.13.patch
28 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
29 -Desc: BFQ v7r1 patch 1 for 3.13: Build, cgroups and kconfig bits
30 -
31 -Patch: 5000_BFQ-2-block-introduce-the-v7r1-I-O-sched-for-3.13.patch1
32 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
33 -Desc: BFQ v7r1 patch 2 for 3.13: BFQ Scheduler
34 -
35 -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r1-for-3.13.0.patch
36 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
37 -Desc: BFQ v7r1 patch 3 for 3.13: Early Queue Merge (EQM)
38 -
39 -Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
40 -From: https://github.com/graysky2/kernel_gcc_patch/
41 -Desc: Kernel patch enables gcc optimizations for additional CPUs.
42
43 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
44 ===================================================================
45 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-03-26 23:50:52 UTC (rev 2715)
46 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-03-31 12:03:14 UTC (rev 2716)
47 @@ -1,104 +0,0 @@
48 -From 7f029ed2a02bea57b791c032d6242129c3372a84 Mon Sep 17 00:00:00 2001
49 -From: Paolo Valente <paolo.valente@×××××××.it>
50 -Date: Tue, 3 Sep 2013 16:50:42 +0200
51 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7-3.13
52 -
53 -Update Kconfig.iosched and do the related Makefile changes to include
54 -kernel configuration options for BFQ. Also add the bfqio controller
55 -to the cgroups subsystem.
56 -
57 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
58 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
59 ----
60 - block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
61 - block/Makefile | 1 +
62 - include/linux/cgroup_subsys.h | 4 ++++
63 - 3 files changed, 37 insertions(+)
64 -
65 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
66 -index 421bef9..8f552ba 100644
67 ---- a/block/Kconfig.iosched
68 -+++ b/block/Kconfig.iosched
69 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
70 - ---help---
71 - Enable group IO scheduling in CFQ.
72 -
73 -+config IOSCHED_BFQ
74 -+ tristate "BFQ I/O scheduler"
75 -+ default n
76 -+ ---help---
77 -+ The BFQ I/O scheduler tries to distribute bandwidth among
78 -+ all processes according to their weights.
79 -+ It aims at distributing the bandwidth as desired, independently of
80 -+ the disk parameters and with any workload. It also tries to
81 -+ guarantee low latency to interactive and soft real-time
82 -+ applications. If compiled built-in (saying Y here), BFQ can
83 -+ be configured to support hierarchical scheduling.
84 -+
85 -+config CGROUP_BFQIO
86 -+ bool "BFQ hierarchical scheduling support"
87 -+ depends on CGROUPS && IOSCHED_BFQ=y
88 -+ default n
89 -+ ---help---
90 -+ Enable hierarchical scheduling in BFQ, using the cgroups
91 -+ filesystem interface. The name of the subsystem will be
92 -+ bfqio.
93 -+
94 - choice
95 - prompt "Default I/O scheduler"
96 - default DEFAULT_CFQ
97 -@@ -52,6 +73,16 @@ choice
98 - config DEFAULT_CFQ
99 - bool "CFQ" if IOSCHED_CFQ=y
100 -
101 -+ config DEFAULT_BFQ
102 -+ bool "BFQ" if IOSCHED_BFQ=y
103 -+ help
104 -+ Selects BFQ as the default I/O scheduler which will be
105 -+ used by default for all block devices.
106 -+ The BFQ I/O scheduler aims at distributing the bandwidth
107 -+ as desired, independently of the disk parameters and with
108 -+ any workload. It also tries to guarantee low latency to
109 -+ interactive and soft real-time applications.
110 -+
111 - config DEFAULT_NOOP
112 - bool "No-op"
113 -
114 -@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
115 - string
116 - default "deadline" if DEFAULT_DEADLINE
117 - default "cfq" if DEFAULT_CFQ
118 -+ default "bfq" if DEFAULT_BFQ
119 - default "noop" if DEFAULT_NOOP
120 -
121 - endmenu
122 -diff --git a/block/Makefile b/block/Makefile
123 -index 20645e8..cbd83fb 100644
124 ---- a/block/Makefile
125 -+++ b/block/Makefile
126 -@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
127 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
128 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
129 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
130 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
131 -
132 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
133 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
134 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
135 -index b613ffd..43c5dc9 100644
136 ---- a/include/linux/cgroup_subsys.h
137 -+++ b/include/linux/cgroup_subsys.h
138 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
139 - SUBSYS(blkio)
140 - #endif
141 -
142 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
143 -+SUBSYS(bfqio)
144 -+#endif
145 -+
146 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
147 - SUBSYS(perf)
148 - #endif
149 ---
150 -1.8.5.2
151 -
152
153 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
154 ===================================================================
155 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-03-26 23:50:52 UTC (rev 2715)
156 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-03-31 12:03:14 UTC (rev 2716)
157 @@ -1,104 +0,0 @@
158 -From ae1b820a5286601aa9d5426459f8f3de658342b4 Mon Sep 17 00:00:00 2001
159 -From: Paolo Valente <paolo.valente@×××××××.it>
160 -Date: Tue, 3 Sep 2013 16:50:42 +0200
161 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r1-3.13
162 -
163 -Update Kconfig.iosched and do the related Makefile changes to include
164 -kernel configuration options for BFQ. Also add the bfqio controller
165 -to the cgroups subsystem.
166 -
167 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
168 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
169 ----
170 - block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
171 - block/Makefile | 1 +
172 - include/linux/cgroup_subsys.h | 4 ++++
173 - 3 files changed, 37 insertions(+)
174 -
175 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
176 -index 421bef9..8f552ba 100644
177 ---- a/block/Kconfig.iosched
178 -+++ b/block/Kconfig.iosched
179 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
180 - ---help---
181 - Enable group IO scheduling in CFQ.
182 -
183 -+config IOSCHED_BFQ
184 -+ tristate "BFQ I/O scheduler"
185 -+ default n
186 -+ ---help---
187 -+ The BFQ I/O scheduler tries to distribute bandwidth among
188 -+ all processes according to their weights.
189 -+ It aims at distributing the bandwidth as desired, independently of
190 -+ the disk parameters and with any workload. It also tries to
191 -+ guarantee low latency to interactive and soft real-time
192 -+ applications. If compiled built-in (saying Y here), BFQ can
193 -+ be configured to support hierarchical scheduling.
194 -+
195 -+config CGROUP_BFQIO
196 -+ bool "BFQ hierarchical scheduling support"
197 -+ depends on CGROUPS && IOSCHED_BFQ=y
198 -+ default n
199 -+ ---help---
200 -+ Enable hierarchical scheduling in BFQ, using the cgroups
201 -+ filesystem interface. The name of the subsystem will be
202 -+ bfqio.
203 -+
204 - choice
205 - prompt "Default I/O scheduler"
206 - default DEFAULT_CFQ
207 -@@ -52,6 +73,16 @@ choice
208 - config DEFAULT_CFQ
209 - bool "CFQ" if IOSCHED_CFQ=y
210 -
211 -+ config DEFAULT_BFQ
212 -+ bool "BFQ" if IOSCHED_BFQ=y
213 -+ help
214 -+ Selects BFQ as the default I/O scheduler which will be
215 -+ used by default for all block devices.
216 -+ The BFQ I/O scheduler aims at distributing the bandwidth
217 -+ as desired, independently of the disk parameters and with
218 -+ any workload. It also tries to guarantee low latency to
219 -+ interactive and soft real-time applications.
220 -+
221 - config DEFAULT_NOOP
222 - bool "No-op"
223 -
224 -@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
225 - string
226 - default "deadline" if DEFAULT_DEADLINE
227 - default "cfq" if DEFAULT_CFQ
228 -+ default "bfq" if DEFAULT_BFQ
229 - default "noop" if DEFAULT_NOOP
230 -
231 - endmenu
232 -diff --git a/block/Makefile b/block/Makefile
233 -index 20645e8..cbd83fb 100644
234 ---- a/block/Makefile
235 -+++ b/block/Makefile
236 -@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
237 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
238 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
239 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
240 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
241 -
242 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
243 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
244 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
245 -index b613ffd..43c5dc9 100644
246 ---- a/include/linux/cgroup_subsys.h
247 -+++ b/include/linux/cgroup_subsys.h
248 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
249 - SUBSYS(blkio)
250 - #endif
251 -
252 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
253 -+SUBSYS(bfqio)
254 -+#endif
255 -+
256 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
257 - SUBSYS(perf)
258 - #endif
259 ---
260 -1.8.5.2
261 -
262
263 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
264 ===================================================================
265 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-03-26 23:50:52 UTC (rev 2715)
266 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-03-31 12:03:14 UTC (rev 2716)
267 @@ -1,6008 +0,0 @@
268 -From 3747f129106ce58fbad1b8f05cc836a6addd8588 Mon Sep 17 00:00:00 2001
269 -From: Paolo Valente <paolo.valente@×××××××.it>
270 -Date: Thu, 9 May 2013 19:10:02 +0200
271 -Subject: [PATCH 2/3] block: introduce the BFQ-v7 I/O sched for 3.13
272 -
273 -Add the BFQ-v7 I/O scheduler to 3.13.
274 -The general structure is borrowed from CFQ, as much of the code for
275 -handling I/O contexts. Over time, several useful features have been
276 -ported from CFQ as well (details in the changelog in README.BFQ). A
277 -(bfq_)queue is associated to each task doing I/O on a device, and each
278 -time a scheduling decision has to be made a queue is selected and served
279 -until it expires.
280 -
281 - - Slices are given in the service domain: tasks are assigned
282 - budgets, measured in number of sectors. Once granted the disk, a task
283 - must however consume its assigned budget within a configurable
284 - maximum time (by default, the maximum possible value of the
285 - budgets is automatically computed to comply with this timeout).
286 - This allows the desired latency vs "throughput boosting" tradeoff
287 - to be set.
288 -
289 - - Budgets are scheduled according to a variant of WF2Q+, implemented
290 - using an augmented rb-tree to take eligibility into account while
291 - preserving an O(log N) overall complexity.
292 -
293 - - A low-latency tunable is provided; if enabled, both interactive
294 - and soft real-time applications are guaranteed a very low latency.
295 -
296 - - Latency guarantees are preserved also in the presence of NCQ.
297 -
298 - - Also with flash-based devices, a high throughput is achieved
299 - while still preserving latency guarantees.
300 -
301 - - BFQ features Early Queue Merge (EQM), a sort of fusion of the
302 - cooperating-queue-merging and the preemption mechanisms present
303 - in CFQ. EQM is in fact a unified mechanism that tries to get a
304 - sequential read pattern, and hence a high throughput, with any
305 - set of processes performing interleaved I/O over a contiguous
306 - sequence of sectors.
307 -
308 - - BFQ supports full hierarchical scheduling, exporting a cgroups
309 - interface. Since each node has a full scheduler, each group can
310 - be assigned its own weight.
311 -
312 - - If the cgroups interface is not used, only I/O priorities can be
313 - assigned to processes, with ioprio values mapped to weights
314 - with the relation weight = IOPRIO_BE_NR - ioprio.
315 -
316 - - ioprio classes are served in strict priority order, i.e., lower
317 - priority queues are not served as long as there are higher
318 - priority queues. Among queues in the same class the bandwidth is
319 - distributed in proportion to the weight of each queue. A very
320 - thin extra bandwidth is however guaranteed to the Idle class, to
321 - prevent it from starving.
322 -
323 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
324 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
325 ----
326 - block/bfq-cgroup.c | 910 ++++++++++++++
327 - block/bfq-ioc.c | 36 +
328 - block/bfq-iosched.c | 3268 +++++++++++++++++++++++++++++++++++++++++++++++++++
329 - block/bfq-sched.c | 1077 +++++++++++++++++
330 - block/bfq.h | 614 ++++++++++
331 - 5 files changed, 5905 insertions(+)
332 - create mode 100644 block/bfq-cgroup.c
333 - create mode 100644 block/bfq-ioc.c
334 - create mode 100644 block/bfq-iosched.c
335 - create mode 100644 block/bfq-sched.c
336 - create mode 100644 block/bfq.h
337 -
338 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
339 -new file mode 100644
340 -index 0000000..b889acf
341 ---- /dev/null
342 -+++ b/block/bfq-cgroup.c
343 -@@ -0,0 +1,910 @@
344 -+/*
345 -+ * BFQ: CGROUPS support.
346 -+ *
347 -+ * Based on ideas and code from CFQ:
348 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
349 -+ *
350 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
351 -+ * Paolo Valente <paolo.valente@×××××××.it>
352 -+ *
353 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
354 -+ *
355 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
356 -+ */
357 -+
358 -+#ifdef CONFIG_CGROUP_BFQIO
359 -+
360 -+static DEFINE_MUTEX(bfqio_mutex);
361 -+
362 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
363 -+{
364 -+ return bgrp ? !bgrp->online : false;
365 -+}
366 -+
367 -+static struct bfqio_cgroup bfqio_root_cgroup = {
368 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
369 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
370 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
371 -+};
372 -+
373 -+static inline void bfq_init_entity(struct bfq_entity *entity,
374 -+ struct bfq_group *bfqg)
375 -+{
376 -+ entity->weight = entity->new_weight;
377 -+ entity->orig_weight = entity->new_weight;
378 -+ entity->ioprio = entity->new_ioprio;
379 -+ entity->ioprio_class = entity->new_ioprio_class;
380 -+ entity->parent = bfqg->my_entity;
381 -+ entity->sched_data = &bfqg->sched_data;
382 -+}
383 -+
384 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
385 -+{
386 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
387 -+}
388 -+
389 -+/*
390 -+ * Search the bfq_group for bfqd in the hash table (for now only a list)
391 -+ * of bgrp. Must be called under rcu_read_lock().
392 -+ */
393 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
394 -+ struct bfq_data *bfqd)
395 -+{
396 -+ struct bfq_group *bfqg;
397 -+ void *key;
398 -+
399 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
400 -+ key = rcu_dereference(bfqg->bfqd);
401 -+ if (key == bfqd)
402 -+ return bfqg;
403 -+ }
404 -+
405 -+ return NULL;
406 -+}
407 -+
408 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
409 -+ struct bfq_group *bfqg)
410 -+{
411 -+ struct bfq_entity *entity = &bfqg->entity;
412 -+
413 -+ /*
414 -+ * If the weight of the entity has never been set via the sysfs
415 -+ * interface, then bgrp->weight == 0. In this case we initialize
416 -+ * the weight from the current ioprio value. Otherwise, the group
417 -+ * weight, if set, has priority over the ioprio value.
418 -+ */
419 -+ if (bgrp->weight == 0) {
420 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
421 -+ entity->new_ioprio = bgrp->ioprio;
422 -+ } else {
423 -+ entity->new_weight = bgrp->weight;
424 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
425 -+ }
426 -+ entity->orig_weight = entity->weight = entity->new_weight;
427 -+ entity->ioprio = entity->new_ioprio;
428 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
429 -+ entity->my_sched_data = &bfqg->sched_data;
430 -+}
431 -+
432 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
433 -+ struct bfq_group *parent)
434 -+{
435 -+ struct bfq_entity *entity;
436 -+
437 -+ BUG_ON(parent == NULL);
438 -+ BUG_ON(bfqg == NULL);
439 -+
440 -+ entity = &bfqg->entity;
441 -+ entity->parent = parent->my_entity;
442 -+ entity->sched_data = &parent->sched_data;
443 -+}
444 -+
445 -+/**
446 -+ * bfq_group_chain_alloc - allocate a chain of groups.
447 -+ * @bfqd: queue descriptor.
448 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
449 -+ *
450 -+ * Allocate a chain of groups starting from the one belonging to
451 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
452 -+ * to the root has already an allocated group on @bfqd.
453 -+ */
454 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
455 -+ struct cgroup_subsys_state *css)
456 -+{
457 -+ struct bfqio_cgroup *bgrp;
458 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
459 -+
460 -+ for (; css != NULL; css = css->parent) {
461 -+ bgrp = css_to_bfqio(css);
462 -+
463 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
464 -+ if (bfqg != NULL) {
465 -+ /*
466 -+ * All the cgroups in the path from there to the
467 -+ * root must have a bfq_group for bfqd, so we don't
468 -+ * need any more allocations.
469 -+ */
470 -+ break;
471 -+ }
472 -+
473 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
474 -+ if (bfqg == NULL)
475 -+ goto cleanup;
476 -+
477 -+ bfq_group_init_entity(bgrp, bfqg);
478 -+ bfqg->my_entity = &bfqg->entity;
479 -+
480 -+ if (leaf == NULL) {
481 -+ leaf = bfqg;
482 -+ prev = leaf;
483 -+ } else {
484 -+ bfq_group_set_parent(prev, bfqg);
485 -+ /*
486 -+ * Build a list of allocated nodes using the bfqd
487 -+ * field, which is still unused and will be initialized
488 -+ * only after the node is connected.
489 -+ */
490 -+ prev->bfqd = bfqg;
491 -+ prev = bfqg;
492 -+ }
493 -+ }
494 -+
495 -+ return leaf;
496 -+
497 -+cleanup:
498 -+ while (leaf != NULL) {
499 -+ prev = leaf;
500 -+ leaf = leaf->bfqd;
501 -+ kfree(prev);
502 -+ }
503 -+
504 -+ return NULL;
505 -+}
506 -+
507 -+/**
508 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
509 -+ * @bfqd: the queue descriptor.
510 -+ * @css: the leaf cgroup_subsys_state to start from.
511 -+ * @leaf: the leaf group (to be associated to @cgroup).
512 -+ *
513 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
514 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
515 -+ * hierarchy that already has a group associated to @bfqd all the nodes
516 -+ * in the path to the root cgroup have one too.
517 -+ *
518 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
519 -+ * per device) while the bfqio_cgroup lock protects the list of groups
520 -+ * belonging to the same cgroup.
521 -+ */
522 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
523 -+ struct cgroup_subsys_state *css,
524 -+ struct bfq_group *leaf)
525 -+{
526 -+ struct bfqio_cgroup *bgrp;
527 -+ struct bfq_group *bfqg, *next, *prev = NULL;
528 -+ unsigned long flags;
529 -+
530 -+ assert_spin_locked(bfqd->queue->queue_lock);
531 -+
532 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
533 -+ bgrp = css_to_bfqio(css);
534 -+ next = leaf->bfqd;
535 -+
536 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
537 -+ BUG_ON(bfqg != NULL);
538 -+
539 -+ spin_lock_irqsave(&bgrp->lock, flags);
540 -+
541 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
542 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
543 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
544 -+
545 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
546 -+
547 -+ prev = leaf;
548 -+ leaf = next;
549 -+ }
550 -+
551 -+ BUG_ON(css == NULL && leaf != NULL);
552 -+ if (css != NULL && prev != NULL) {
553 -+ bgrp = css_to_bfqio(css);
554 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
555 -+ bfq_group_set_parent(prev, bfqg);
556 -+ }
557 -+}
558 -+
559 -+/**
560 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
561 -+ * @bfqd: queue descriptor.
562 -+ * @cgroup: cgroup being searched for.
563 -+ *
564 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
565 -+ * necessary. When a group is returned all the cgroups in the path
566 -+ * to the root have a group associated to @bfqd.
567 -+ *
568 -+ * If the allocation fails, return the root group: this breaks guarantees
569 -+ * but is a safe fallback. If this loss becomes a problem it can be
570 -+ * mitigated using the equivalent weight (given by the product of the
571 -+ * weights of the groups in the path from @group to the root) in the
572 -+ * root scheduler.
573 -+ *
574 -+ * We allocate all the missing nodes in the path from the leaf cgroup
575 -+ * to the root and we connect the nodes only after all the allocations
576 -+ * have been successful.
577 -+ */
578 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
579 -+ struct cgroup_subsys_state *css)
580 -+{
581 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
582 -+ struct bfq_group *bfqg;
583 -+
584 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
585 -+ if (bfqg != NULL)
586 -+ return bfqg;
587 -+
588 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
589 -+ if (bfqg != NULL)
590 -+ bfq_group_chain_link(bfqd, css, bfqg);
591 -+ else
592 -+ bfqg = bfqd->root_group;
593 -+
594 -+ return bfqg;
595 -+}
596 -+
597 -+/**
598 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
599 -+ * @bfqd: queue descriptor.
600 -+ * @bfqq: the queue to move.
601 -+ * @entity: @bfqq's entity.
602 -+ * @bfqg: the group to move to.
603 -+ *
604 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
605 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
606 -+ *
607 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
608 -+ * not disappear (for now this just means that we are called under
609 -+ * rcu_read_lock()).
610 -+ */
611 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
612 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
613 -+{
614 -+ int busy, resume;
615 -+
616 -+ busy = bfq_bfqq_busy(bfqq);
617 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
618 -+
619 -+ BUG_ON(resume && !entity->on_st);
620 -+ BUG_ON(busy && !resume && entity->on_st &&
621 -+ bfqq != bfqd->in_service_queue);
622 -+
623 -+ if (busy) {
624 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
625 -+
626 -+ if (!resume)
627 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
628 -+ else
629 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
630 -+ } else if (entity->on_st)
631 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
632 -+
633 -+ /*
634 -+ * Here we use a reference to bfqg. We don't need a refcounter
635 -+ * as the cgroup reference will not be dropped, so that its
636 -+ * destroy() callback will not be invoked.
637 -+ */
638 -+ entity->parent = bfqg->my_entity;
639 -+ entity->sched_data = &bfqg->sched_data;
640 -+
641 -+ if (busy && resume)
642 -+ bfq_activate_bfqq(bfqd, bfqq);
643 -+
644 -+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
645 -+ bfq_schedule_dispatch(bfqd);
646 -+}
647 -+
648 -+/**
649 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
650 -+ * @bfqd: the queue descriptor.
651 -+ * @bic: the bic to move.
652 -+ * @cgroup: the cgroup to move to.
653 -+ *
654 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
655 -+ * has to make sure that the reference to cgroup is valid across the call.
656 -+ *
657 -+ * NOTE: an alternative approach might have been to store the current
658 -+ * cgroup in bfqq and getting a reference to it, reducing the lookup
659 -+ * time here, at the price of slightly more complex code.
660 -+ */
661 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
662 -+ struct bfq_io_cq *bic,
663 -+ struct cgroup_subsys_state *css)
664 -+{
665 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
666 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
667 -+ struct bfq_entity *entity;
668 -+ struct bfq_group *bfqg;
669 -+ struct bfqio_cgroup *bgrp;
670 -+
671 -+ bgrp = css_to_bfqio(css);
672 -+
673 -+ bfqg = bfq_find_alloc_group(bfqd, css);
674 -+ if (async_bfqq != NULL) {
675 -+ entity = &async_bfqq->entity;
676 -+
677 -+ if (entity->sched_data != &bfqg->sched_data) {
678 -+ bic_set_bfqq(bic, NULL, 0);
679 -+ bfq_log_bfqq(bfqd, async_bfqq,
680 -+ "bic_change_group: %p %d",
681 -+ async_bfqq, atomic_read(&async_bfqq->ref));
682 -+ bfq_put_queue(async_bfqq);
683 -+ }
684 -+ }
685 -+
686 -+ if (sync_bfqq != NULL) {
687 -+ entity = &sync_bfqq->entity;
688 -+ if (entity->sched_data != &bfqg->sched_data)
689 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
690 -+ }
691 -+
692 -+ return bfqg;
693 -+}
694 -+
695 -+/**
696 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
697 -+ * @bic: the bic being migrated.
698 -+ * @cgroup: the destination cgroup.
699 -+ *
700 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
701 -+ * moved into its new parent group.
702 -+ */
703 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
704 -+ struct cgroup_subsys_state *css)
705 -+{
706 -+ struct bfq_data *bfqd;
707 -+ unsigned long uninitialized_var(flags);
708 -+
709 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
710 -+ &flags);
711 -+ if (bfqd != NULL) {
712 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
713 -+ bfq_put_bfqd_unlock(bfqd, &flags);
714 -+ }
715 -+}
716 -+
717 -+/**
718 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
719 -+ * @bic: the @bic to update.
720 -+ *
721 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
722 -+ * We need this in addition to moving bics during the cgroup attach
723 -+ * phase because the task owning @bic could be at its first disk
724 -+ * access or we may end up in the root cgroup as the result of a
725 -+ * memory allocation failure and here we try to move to the right
726 -+ * group.
727 -+ *
728 -+ * Must be called under the queue lock. It is safe to use the returned
729 -+ * value even after the rcu_read_unlock() as the migration/destruction
730 -+ * paths act under the queue lock too. IOW it is impossible to race with
731 -+ * group migration/destruction and end up with an invalid group as:
732 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
733 -+ * has started execution, as current holds a reference to it,
734 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
735 -+ * migrated to a different cgroup] its attach() callback will have
736 -+ * taken care of remove all the references to the old cgroup data.
737 -+ */
738 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
739 -+{
740 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
741 -+ struct bfq_group *bfqg;
742 -+ struct cgroup_subsys_state *css;
743 -+
744 -+ BUG_ON(bfqd == NULL);
745 -+
746 -+ rcu_read_lock();
747 -+ css = task_css(current, bfqio_subsys_id);
748 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
749 -+ rcu_read_unlock();
750 -+
751 -+ return bfqg;
752 -+}
753 -+
754 -+/**
755 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
756 -+ * @st: the service tree being flushed.
757 -+ */
758 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
759 -+{
760 -+ struct bfq_entity *entity = st->first_idle;
761 -+
762 -+ for (; entity != NULL; entity = st->first_idle)
763 -+ __bfq_deactivate_entity(entity, 0);
764 -+}
765 -+
766 -+/**
767 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
768 -+ * @bfqd: the device data structure with the root group.
769 -+ * @entity: the entity to move.
770 -+ */
771 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
772 -+ struct bfq_entity *entity)
773 -+{
774 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
775 -+
776 -+ BUG_ON(bfqq == NULL);
777 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
778 -+ return;
779 -+}
780 -+
781 -+/**
782 -+ * bfq_reparent_active_entities - move to the root group all active entities.
783 -+ * @bfqd: the device data structure with the root group.
784 -+ * @bfqg: the group to move from.
785 -+ * @st: the service tree with the entities.
786 -+ *
787 -+ * Needs queue_lock to be taken and reference to be valid over the call.
788 -+ */
789 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
790 -+ struct bfq_group *bfqg,
791 -+ struct bfq_service_tree *st)
792 -+{
793 -+ struct rb_root *active = &st->active;
794 -+ struct bfq_entity *entity = NULL;
795 -+
796 -+ if (!RB_EMPTY_ROOT(&st->active))
797 -+ entity = bfq_entity_of(rb_first(active));
798 -+
799 -+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
800 -+ bfq_reparent_leaf_entity(bfqd, entity);
801 -+
802 -+ if (bfqg->sched_data.active_entity != NULL)
803 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
804 -+
805 -+ return;
806 -+}
807 -+
808 -+/**
809 -+ * bfq_destroy_group - destroy @bfqg.
810 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
811 -+ * @bfqg: the group being destroyed.
812 -+ *
813 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
814 -+ */
815 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
816 -+{
817 -+ struct bfq_data *bfqd;
818 -+ struct bfq_service_tree *st;
819 -+ struct bfq_entity *entity = bfqg->my_entity;
820 -+ unsigned long uninitialized_var(flags);
821 -+ int i;
822 -+
823 -+ hlist_del(&bfqg->group_node);
824 -+
825 -+ /*
826 -+ * Empty all service_trees belonging to this group before deactivating
827 -+ * the group itself.
828 -+ */
829 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
830 -+ st = bfqg->sched_data.service_tree + i;
831 -+
832 -+ /*
833 -+ * The idle tree may still contain bfq_queues belonging
834 -+ * to exited tasks because they never migrated to a different
835 -+ * cgroup from the one being destroyed now. No one else
836 -+ * can access them so it's safe to act without any lock.
837 -+ */
838 -+ bfq_flush_idle_tree(st);
839 -+
840 -+ /*
841 -+ * It may happen that some queues are still active
842 -+ * (busy) upon group destruction (if the corresponding
843 -+ * processes have been forced to terminate). We move
844 -+ * all the leaf entities corresponding to these queues
845 -+ * to the root_group.
846 -+ * Also, it may happen that the group has an entity
847 -+ * under service, which is disconnected from the active
848 -+ * tree: it must be moved, too.
849 -+ * There is no need to put the sync queues, as the
850 -+ * scheduler has taken no reference.
851 -+ */
852 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
853 -+ if (bfqd != NULL) {
854 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
855 -+ bfq_put_bfqd_unlock(bfqd, &flags);
856 -+ }
857 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
858 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
859 -+ }
860 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
861 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
862 -+
863 -+ /*
864 -+ * We may race with device destruction, take extra care when
865 -+ * dereferencing bfqg->bfqd.
866 -+ */
867 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
868 -+ if (bfqd != NULL) {
869 -+ hlist_del(&bfqg->bfqd_node);
870 -+ __bfq_deactivate_entity(entity, 0);
871 -+ bfq_put_async_queues(bfqd, bfqg);
872 -+ bfq_put_bfqd_unlock(bfqd, &flags);
873 -+ }
874 -+ BUG_ON(entity->tree != NULL);
875 -+
876 -+ /*
877 -+ * No need to defer the kfree() to the end of the RCU grace
878 -+ * period: we are called from the destroy() callback of our
879 -+ * cgroup, so we can be sure that no one is a) still using
880 -+ * this cgroup or b) doing lookups in it.
881 -+ */
882 -+ kfree(bfqg);
883 -+}
884 -+
885 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
886 -+{
887 -+ struct hlist_node *tmp;
888 -+ struct bfq_group *bfqg;
889 -+
890 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
891 -+ bfq_end_raising_async_queues(bfqd, bfqg);
892 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
893 -+}
894 -+
895 -+/**
896 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
897 -+ * @bfqd: the device descriptor being exited.
898 -+ *
899 -+ * When the device exits we just make sure that no lookup can return
900 -+ * the now unused group structures. They will be deallocated on cgroup
901 -+ * destruction.
902 -+ */
903 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
904 -+{
905 -+ struct hlist_node *tmp;
906 -+ struct bfq_group *bfqg;
907 -+
908 -+ bfq_log(bfqd, "disconnect_groups beginning");
909 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
910 -+ hlist_del(&bfqg->bfqd_node);
911 -+
912 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
913 -+
914 -+ /*
915 -+ * Don't remove from the group hash, just set an
916 -+ * invalid key. No lookups can race with the
917 -+ * assignment as bfqd is being destroyed; this
918 -+ * implies also that new elements cannot be added
919 -+ * to the list.
920 -+ */
921 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
922 -+
923 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
924 -+ bfqg);
925 -+ bfq_put_async_queues(bfqd, bfqg);
926 -+ }
927 -+}
928 -+
929 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
930 -+{
931 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
932 -+ struct bfq_group *bfqg = bfqd->root_group;
933 -+
934 -+ bfq_put_async_queues(bfqd, bfqg);
935 -+
936 -+ spin_lock_irq(&bgrp->lock);
937 -+ hlist_del_rcu(&bfqg->group_node);
938 -+ spin_unlock_irq(&bgrp->lock);
939 -+
940 -+ /*
941 -+ * No need to synchronize_rcu() here: since the device is gone
942 -+ * there cannot be any read-side access to its root_group.
943 -+ */
944 -+ kfree(bfqg);
945 -+}
946 -+
947 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
948 -+{
949 -+ struct bfq_group *bfqg;
950 -+ struct bfqio_cgroup *bgrp;
951 -+ int i;
952 -+
953 -+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
954 -+ if (bfqg == NULL)
955 -+ return NULL;
956 -+
957 -+ bfqg->entity.parent = NULL;
958 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
959 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
960 -+
961 -+ bgrp = &bfqio_root_cgroup;
962 -+ spin_lock_irq(&bgrp->lock);
963 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
964 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
965 -+ spin_unlock_irq(&bgrp->lock);
966 -+
967 -+ return bfqg;
968 -+}
969 -+
970 -+#define SHOW_FUNCTION(__VAR) \
971 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
972 -+ struct cftype *cftype) \
973 -+{ \
974 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
975 -+ u64 ret = -ENODEV; \
976 -+ \
977 -+ mutex_lock(&bfqio_mutex); \
978 -+ if (bfqio_is_removed(bgrp)) \
979 -+ goto out_unlock; \
980 -+ \
981 -+ spin_lock_irq(&bgrp->lock); \
982 -+ ret = bgrp->__VAR; \
983 -+ spin_unlock_irq(&bgrp->lock); \
984 -+ \
985 -+out_unlock: \
986 -+ mutex_unlock(&bfqio_mutex); \
987 -+ return ret; \
988 -+}
989 -+
990 -+SHOW_FUNCTION(weight);
991 -+SHOW_FUNCTION(ioprio);
992 -+SHOW_FUNCTION(ioprio_class);
993 -+#undef SHOW_FUNCTION
994 -+
995 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
996 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
997 -+ struct cftype *cftype, \
998 -+ u64 val) \
999 -+{ \
1000 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
1001 -+ struct bfq_group *bfqg; \
1002 -+ int ret = -EINVAL; \
1003 -+ \
1004 -+ if (val < (__MIN) || val > (__MAX)) \
1005 -+ return ret; \
1006 -+ \
1007 -+ ret = -ENODEV; \
1008 -+ mutex_lock(&bfqio_mutex); \
1009 -+ if (bfqio_is_removed(bgrp)) \
1010 -+ goto out_unlock; \
1011 -+ ret = 0; \
1012 -+ \
1013 -+ spin_lock_irq(&bgrp->lock); \
1014 -+ bgrp->__VAR = (unsigned short)val; \
1015 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
1016 -+ /* \
1017 -+ * Setting the ioprio_changed flag of the entity \
1018 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
1019 -+ * the value of the weight to its ioprio mapping. \
1020 -+ * Set the flag only if necessary. \
1021 -+ */ \
1022 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
1023 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
1024 -+ smp_wmb(); \
1025 -+ bfqg->entity.ioprio_changed = 1; \
1026 -+ } \
1027 -+ } \
1028 -+ spin_unlock_irq(&bgrp->lock); \
1029 -+ \
1030 -+out_unlock: \
1031 -+ mutex_unlock(&bfqio_mutex); \
1032 -+ return ret; \
1033 -+}
1034 -+
1035 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
1036 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
1037 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
1038 -+#undef STORE_FUNCTION
1039 -+
1040 -+static struct cftype bfqio_files[] = {
1041 -+ {
1042 -+ .name = "weight",
1043 -+ .read_u64 = bfqio_cgroup_weight_read,
1044 -+ .write_u64 = bfqio_cgroup_weight_write,
1045 -+ },
1046 -+ {
1047 -+ .name = "ioprio",
1048 -+ .read_u64 = bfqio_cgroup_ioprio_read,
1049 -+ .write_u64 = bfqio_cgroup_ioprio_write,
1050 -+ },
1051 -+ {
1052 -+ .name = "ioprio_class",
1053 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
1054 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
1055 -+ },
1056 -+ { }, /* terminate */
1057 -+};
1058 -+
1059 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
1060 -+ *parent_css)
1061 -+{
1062 -+ struct bfqio_cgroup *bgrp;
1063 -+
1064 -+ if (parent_css != NULL) {
1065 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
1066 -+ if (bgrp == NULL)
1067 -+ return ERR_PTR(-ENOMEM);
1068 -+ } else
1069 -+ bgrp = &bfqio_root_cgroup;
1070 -+
1071 -+ spin_lock_init(&bgrp->lock);
1072 -+ INIT_HLIST_HEAD(&bgrp->group_data);
1073 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
1074 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
1075 -+
1076 -+ return &bgrp->css;
1077 -+}
1078 -+
1079 -+/*
1080 -+ * We cannot support shared io contexts, as we have no means to support
1081 -+ * two tasks with the same ioc in two different groups without major rework
1082 -+ * of the main bic/bfqq data structures. For now we allow a task to change
1083 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
1084 -+ * behavior is that a group containing a task that forked using CLONE_IO
1085 -+ * will not be destroyed until the tasks sharing the ioc die.
1086 -+ */
1087 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
1088 -+ struct cgroup_taskset *tset)
1089 -+{
1090 -+ struct task_struct *task;
1091 -+ struct io_context *ioc;
1092 -+ int ret = 0;
1093 -+
1094 -+ cgroup_taskset_for_each(task, css, tset) {
1095 -+ /*
1096 -+ * task_lock() is needed to avoid races with
1097 -+ * exit_io_context()
1098 -+ */
1099 -+ task_lock(task);
1100 -+ ioc = task->io_context;
1101 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1102 -+ /*
1103 -+ * ioc == NULL means that the task is either too young
1104 -+ * or exiting: if it has still no ioc the ioc can't be
1105 -+ * shared, if the task is exiting the attach will fail
1106 -+ * anyway, no matter what we return here.
1107 -+ */
1108 -+ ret = -EINVAL;
1109 -+ task_unlock(task);
1110 -+ if (ret)
1111 -+ break;
1112 -+ }
1113 -+
1114 -+ return ret;
1115 -+}
1116 -+
1117 -+static void bfqio_attach(struct cgroup_subsys_state *css,
1118 -+ struct cgroup_taskset *tset)
1119 -+{
1120 -+ struct task_struct *task;
1121 -+ struct io_context *ioc;
1122 -+ struct io_cq *icq;
1123 -+
1124 -+ /*
1125 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
1126 -+ * new group has not yet been tested.
1127 -+ */
1128 -+ cgroup_taskset_for_each(task, css, tset) {
1129 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1130 -+ if (ioc) {
1131 -+ /*
1132 -+ * Handle cgroup change here.
1133 -+ */
1134 -+ rcu_read_lock();
1135 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1136 -+ if (!strncmp(
1137 -+ icq->q->elevator->type->elevator_name,
1138 -+ "bfq", ELV_NAME_MAX))
1139 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
1140 -+ css);
1141 -+ rcu_read_unlock();
1142 -+ put_io_context(ioc);
1143 -+ }
1144 -+ }
1145 -+}
1146 -+
1147 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
1148 -+{
1149 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1150 -+ struct hlist_node *tmp;
1151 -+ struct bfq_group *bfqg;
1152 -+
1153 -+ /*
1154 -+ * Since we are destroying the cgroup, there are no more tasks
1155 -+ * referencing it, and all the RCU grace periods that may have
1156 -+ * referenced it are ended (as the destruction of the parent
1157 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1158 -+ * anything else and we don't need any synchronization.
1159 -+ */
1160 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1161 -+ bfq_destroy_group(bgrp, bfqg);
1162 -+
1163 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
1164 -+
1165 -+ kfree(bgrp);
1166 -+}
1167 -+
1168 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
1169 -+{
1170 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1171 -+
1172 -+ mutex_lock(&bfqio_mutex);
1173 -+ bgrp->online = true;
1174 -+ mutex_unlock(&bfqio_mutex);
1175 -+
1176 -+ return 0;
1177 -+}
1178 -+
1179 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
1180 -+{
1181 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1182 -+
1183 -+ mutex_lock(&bfqio_mutex);
1184 -+ bgrp->online = false;
1185 -+ mutex_unlock(&bfqio_mutex);
1186 -+}
1187 -+
1188 -+struct cgroup_subsys bfqio_subsys = {
1189 -+ .name = "bfqio",
1190 -+ .css_alloc = bfqio_create,
1191 -+ .css_online = bfqio_css_online,
1192 -+ .css_offline = bfqio_css_offline,
1193 -+ .can_attach = bfqio_can_attach,
1194 -+ .attach = bfqio_attach,
1195 -+ .css_free = bfqio_destroy,
1196 -+ .subsys_id = bfqio_subsys_id,
1197 -+ .base_cftypes = bfqio_files,
1198 -+};
1199 -+#else
1200 -+static inline void bfq_init_entity(struct bfq_entity *entity,
1201 -+ struct bfq_group *bfqg)
1202 -+{
1203 -+ entity->weight = entity->new_weight;
1204 -+ entity->orig_weight = entity->new_weight;
1205 -+ entity->ioprio = entity->new_ioprio;
1206 -+ entity->ioprio_class = entity->new_ioprio_class;
1207 -+ entity->sched_data = &bfqg->sched_data;
1208 -+}
1209 -+
1210 -+static inline struct bfq_group *
1211 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1212 -+{
1213 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
1214 -+ return bfqd->root_group;
1215 -+}
1216 -+
1217 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1218 -+ struct bfq_queue *bfqq,
1219 -+ struct bfq_entity *entity,
1220 -+ struct bfq_group *bfqg)
1221 -+{
1222 -+}
1223 -+
1224 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
1225 -+{
1226 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1227 -+}
1228 -+
1229 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1230 -+{
1231 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
1232 -+}
1233 -+
1234 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
1235 -+{
1236 -+ kfree(bfqd->root_group);
1237 -+}
1238 -+
1239 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1240 -+{
1241 -+ struct bfq_group *bfqg;
1242 -+ int i;
1243 -+
1244 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1245 -+ if (bfqg == NULL)
1246 -+ return NULL;
1247 -+
1248 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1249 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1250 -+
1251 -+ return bfqg;
1252 -+}
1253 -+#endif
1254 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1255 -new file mode 100644
1256 -index 0000000..7f6b000
1257 ---- /dev/null
1258 -+++ b/block/bfq-ioc.c
1259 -@@ -0,0 +1,36 @@
1260 -+/*
1261 -+ * BFQ: I/O context handling.
1262 -+ *
1263 -+ * Based on ideas and code from CFQ:
1264 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1265 -+ *
1266 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1267 -+ * Paolo Valente <paolo.valente@×××××××.it>
1268 -+ *
1269 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1270 -+ */
1271 -+
1272 -+/**
1273 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1274 -+ * @icq: the iocontext queue.
1275 -+ */
1276 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1277 -+{
1278 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
1279 -+ return container_of(icq, struct bfq_io_cq, icq);
1280 -+}
1281 -+
1282 -+/**
1283 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1284 -+ * @bfqd: the lookup key.
1285 -+ * @ioc: the io_context of the process doing I/O.
1286 -+ *
1287 -+ * Queue lock must be held.
1288 -+ */
1289 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1290 -+ struct io_context *ioc)
1291 -+{
1292 -+ if (ioc)
1293 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1294 -+ return NULL;
1295 -+}
1296 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1297 -new file mode 100644
1298 -index 0000000..7670400
1299 ---- /dev/null
1300 -+++ b/block/bfq-iosched.c
1301 -@@ -0,0 +1,3268 @@
1302 -+/*
1303 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
1304 -+ *
1305 -+ * Based on ideas and code from CFQ:
1306 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1307 -+ *
1308 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1309 -+ * Paolo Valente <paolo.valente@×××××××.it>
1310 -+ *
1311 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1312 -+ *
1313 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1314 -+ *
1315 -+ * BFQ is a proportional share disk scheduling algorithm based on the
1316 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
1317 -+ * number of sectors, to tasks instead of time slices. The disk is not granted
1318 -+ * to the in-service task for a given time slice, but until it has exhausted
1319 -+ * its assigned budget. This change from the time to the service domain allows
1320 -+ * BFQ to distribute the disk bandwidth among tasks as desired, without any
1321 -+ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
1322 -+ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
1323 -+ * their budgets (more precisely BFQ schedules queues associated to tasks).
1324 -+ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
1325 -+ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
1326 -+ * latencies to interactive and soft real-time applications.
1327 -+ *
1328 -+ * BFQ is described in [1], where also a reference to the initial, more
1329 -+ * theoretical paper on BFQ can be found. The interested reader can find in
1330 -+ * the latter paper full details on the main algorithm as well as formulas of
1331 -+ * the guarantees, plus formal proofs of all the properties. With respect to
1332 -+ * the version of BFQ presented in these papers, this implementation adds a
1333 -+ * few more heuristics, such as the one that guarantees a low latency to soft
1334 -+ * real-time applications, and a hierarchical extension based on H-WF2Q+.
1335 -+ *
1336 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1337 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1338 -+ * complexity derives from the one introduced with EEVDF in [3].
1339 -+ *
1340 -+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1341 -+ * with the BFQ Disk I/O Scheduler'',
1342 -+ * Proceedings of the 5th Annual International Systems and Storage
1343 -+ * Conference (SYSTOR '12), June 2012.
1344 -+ *
1345 -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1346 -+ *
1347 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1348 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1349 -+ * Oct 1997.
1350 -+ *
1351 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1352 -+ *
1353 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1354 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
1355 -+ * Resource Allocation,'' technical report.
1356 -+ *
1357 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1358 -+ */
1359 -+#include <linux/module.h>
1360 -+#include <linux/slab.h>
1361 -+#include <linux/blkdev.h>
1362 -+#include <linux/cgroup.h>
1363 -+#include <linux/elevator.h>
1364 -+#include <linux/jiffies.h>
1365 -+#include <linux/rbtree.h>
1366 -+#include <linux/ioprio.h>
1367 -+#include "bfq.h"
1368 -+#include "blk.h"
1369 -+
1370 -+/* Max number of dispatches in one round of service. */
1371 -+static const int bfq_quantum = 4;
1372 -+
1373 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1374 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1375 -+
1376 -+/* Maximum backwards seek, in KiB. */
1377 -+static const int bfq_back_max = 16 * 1024;
1378 -+
1379 -+/* Penalty of a backwards seek, in number of sectors. */
1380 -+static const int bfq_back_penalty = 2;
1381 -+
1382 -+/* Idling period duration, in jiffies. */
1383 -+static int bfq_slice_idle = HZ / 125;
1384 -+
1385 -+/* Default maximum budget values, in sectors and number of requests. */
1386 -+static const int bfq_default_max_budget = 16 * 1024;
1387 -+static const int bfq_max_budget_async_rq = 4;
1388 -+
1389 -+/*
1390 -+ * Async to sync throughput distribution is controlled as follows:
1391 -+ * when an async request is served, the entity is charged the number
1392 -+ * of sectors of the request, multiplied by the factor below
1393 -+ */
1394 -+static const int bfq_async_charge_factor = 10;
1395 -+
1396 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
1397 -+static const int bfq_timeout_sync = HZ / 8;
1398 -+static int bfq_timeout_async = HZ / 25;
1399 -+
1400 -+struct kmem_cache *bfq_pool;
1401 -+
1402 -+/* Below this threshold (in ms), we consider thinktime immediate. */
1403 -+#define BFQ_MIN_TT 2
1404 -+
1405 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
1406 -+#define BFQ_HW_QUEUE_THRESHOLD 4
1407 -+#define BFQ_HW_QUEUE_SAMPLES 32
1408 -+
1409 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1410 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1411 -+
1412 -+/* Min samples used for peak rate estimation (for autotuning). */
1413 -+#define BFQ_PEAK_RATE_SAMPLES 32
1414 -+
1415 -+/* Shift used for peak rate fixed precision calculations. */
1416 -+#define BFQ_RATE_SHIFT 16
1417 -+
1418 -+/*
1419 -+ * The duration of the weight raising for interactive applications is
1420 -+ * computed automatically (as default behaviour), using the following
1421 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
1422 -+ * disk, and R and T are two reference parameters. In particular, R is
1423 -+ * the peak rate of a reference disk, and T is about the maximum time
1424 -+ * for starting popular large applications on that disk, under BFQ and
1425 -+ * while reading two files in parallel. Finally, BFQ uses two
1426 -+ * different pairs (R, T) depending on whether the disk is rotational
1427 -+ * or non-rotational.
1428 -+ */
1429 -+#define T_rot (msecs_to_jiffies(5500))
1430 -+#define T_nonrot (msecs_to_jiffies(2000))
1431 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1432 -+#define R_rot 17415
1433 -+#define R_nonrot 34791
1434 -+
1435 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1436 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1437 -+
1438 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1439 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1440 -+
1441 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1442 -+
1443 -+#include "bfq-ioc.c"
1444 -+#include "bfq-sched.c"
1445 -+#include "bfq-cgroup.c"
1446 -+
1447 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1448 -+ IOPRIO_CLASS_IDLE)
1449 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1450 -+ IOPRIO_CLASS_RT)
1451 -+
1452 -+#define bfq_sample_valid(samples) ((samples) > 80)
1453 -+
1454 -+/*
1455 -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1456 -+ * set (in which case it could also be a direct WRITE).
1457 -+ */
1458 -+static inline int bfq_bio_sync(struct bio *bio)
1459 -+{
1460 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1461 -+ return 1;
1462 -+
1463 -+ return 0;
1464 -+}
1465 -+
1466 -+/*
1467 -+ * Scheduler run of queue, if there are requests pending and no one in the
1468 -+ * driver that will restart queueing.
1469 -+ */
1470 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1471 -+{
1472 -+ if (bfqd->queued != 0) {
1473 -+ bfq_log(bfqd, "schedule dispatch");
1474 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1475 -+ }
1476 -+}
1477 -+
1478 -+/*
1479 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1480 -+ * We choose the request that is closest to the head right now. Distance
1481 -+ * behind the head is penalized and only allowed to a certain extent.
1482 -+ */
1483 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
1484 -+ struct request *rq1,
1485 -+ struct request *rq2,
1486 -+ sector_t last)
1487 -+{
1488 -+ sector_t s1, s2, d1 = 0, d2 = 0;
1489 -+ unsigned long back_max;
1490 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1491 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1492 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1493 -+
1494 -+ if (rq1 == NULL || rq1 == rq2)
1495 -+ return rq2;
1496 -+ if (rq2 == NULL)
1497 -+ return rq1;
1498 -+
1499 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1500 -+ return rq1;
1501 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1502 -+ return rq2;
1503 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1504 -+ return rq1;
1505 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1506 -+ return rq2;
1507 -+
1508 -+ s1 = blk_rq_pos(rq1);
1509 -+ s2 = blk_rq_pos(rq2);
1510 -+
1511 -+ /*
1512 -+ * By definition, 1KiB is 2 sectors.
1513 -+ */
1514 -+ back_max = bfqd->bfq_back_max * 2;
1515 -+
1516 -+ /*
1517 -+ * Strict one way elevator _except_ in the case where we allow
1518 -+ * short backward seeks which are biased as twice the cost of a
1519 -+ * similar forward seek.
1520 -+ */
1521 -+ if (s1 >= last)
1522 -+ d1 = s1 - last;
1523 -+ else if (s1 + back_max >= last)
1524 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
1525 -+ else
1526 -+ wrap |= BFQ_RQ1_WRAP;
1527 -+
1528 -+ if (s2 >= last)
1529 -+ d2 = s2 - last;
1530 -+ else if (s2 + back_max >= last)
1531 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
1532 -+ else
1533 -+ wrap |= BFQ_RQ2_WRAP;
1534 -+
1535 -+ /* Found required data */
1536 -+
1537 -+ /*
1538 -+ * By doing switch() on the bit mask "wrap" we avoid having to
1539 -+ * check two variables for all permutations: --> faster!
1540 -+ */
1541 -+ switch (wrap) {
1542 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1543 -+ if (d1 < d2)
1544 -+ return rq1;
1545 -+ else if (d2 < d1)
1546 -+ return rq2;
1547 -+ else {
1548 -+ if (s1 >= s2)
1549 -+ return rq1;
1550 -+ else
1551 -+ return rq2;
1552 -+ }
1553 -+
1554 -+ case BFQ_RQ2_WRAP:
1555 -+ return rq1;
1556 -+ case BFQ_RQ1_WRAP:
1557 -+ return rq2;
1558 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1559 -+ default:
1560 -+ /*
1561 -+ * Since both rqs are wrapped,
1562 -+ * start with the one that's further behind head
1563 -+ * (--> only *one* back seek required),
1564 -+ * since back seek takes more time than forward.
1565 -+ */
1566 -+ if (s1 <= s2)
1567 -+ return rq1;
1568 -+ else
1569 -+ return rq2;
1570 -+ }
1571 -+}
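
The seek-distance rule implemented by bfq_choose_req() above can be reproduced as a small standalone C program; the head position, sector numbers, back_max and back_penalty below are invented example values, not values taken from the patch:

/*
 * Minimal userspace sketch of the distance logic used by bfq_choose_req():
 * forward distances count as-is, short backward seeks (within back_max)
 * are charged back_penalty times their length, and anything farther
 * behind the head is treated as a wrap.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t effective_dist(sector_t s, sector_t last,
                               sector_t back_max,
                               unsigned long back_penalty,
                               int *wraps)
{
        *wraps = 0;
        if (s >= last)                     /* forward seek */
                return s - last;
        if (s + back_max >= last)          /* short backward seek */
                return (last - s) * back_penalty;
        *wraps = 1;                        /* too far behind the head */
        return 0;
}

int main(void)
{
        sector_t last = 10000;             /* current head position */
        sector_t back_max = 16 * 1024 * 2; /* 16 MiB expressed in sectors */
        unsigned long back_penalty = 2;    /* example penalty factor */
        int w1, w2;

        sector_t d1 = effective_dist(10100, last, back_max, back_penalty, &w1);
        sector_t d2 = effective_dist(9990, last, back_max, back_penalty, &w2);

        printf("rq1: dist %llu wrap %d\n", d1, w1);
        printf("rq2: dist %llu wrap %d\n", d2, w2);
        /* rq2 is only 10 sectors behind but is charged 20, still less than
         * rq1's 100, so the backward request would be chosen here. */
        return 0;
}
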
1572 -+
1573 -+static struct bfq_queue *
1574 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1575 -+ sector_t sector, struct rb_node **ret_parent,
1576 -+ struct rb_node ***rb_link)
1577 -+{
1578 -+ struct rb_node **p, *parent;
1579 -+ struct bfq_queue *bfqq = NULL;
1580 -+
1581 -+ parent = NULL;
1582 -+ p = &root->rb_node;
1583 -+ while (*p) {
1584 -+ struct rb_node **n;
1585 -+
1586 -+ parent = *p;
1587 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1588 -+
1589 -+ /*
1590 -+ * Sort strictly based on sector. Smallest to the left,
1591 -+ * largest to the right.
1592 -+ */
1593 -+ if (sector > blk_rq_pos(bfqq->next_rq))
1594 -+ n = &(*p)->rb_right;
1595 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
1596 -+ n = &(*p)->rb_left;
1597 -+ else
1598 -+ break;
1599 -+ p = n;
1600 -+ bfqq = NULL;
1601 -+ }
1602 -+
1603 -+ *ret_parent = parent;
1604 -+ if (rb_link)
1605 -+ *rb_link = p;
1606 -+
1607 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1608 -+ (long long unsigned)sector,
1609 -+ bfqq != NULL ? bfqq->pid : 0);
1610 -+
1611 -+ return bfqq;
1612 -+}
1613 -+
1614 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1615 -+{
1616 -+ struct rb_node **p, *parent;
1617 -+ struct bfq_queue *__bfqq;
1618 -+
1619 -+ if (bfqq->pos_root != NULL) {
1620 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1621 -+ bfqq->pos_root = NULL;
1622 -+ }
1623 -+
1624 -+ if (bfq_class_idle(bfqq))
1625 -+ return;
1626 -+ if (!bfqq->next_rq)
1627 -+ return;
1628 -+
1629 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
1630 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1631 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
1632 -+ if (__bfqq == NULL) {
1633 -+ rb_link_node(&bfqq->pos_node, parent, p);
1634 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1635 -+ } else
1636 -+ bfqq->pos_root = NULL;
1637 -+}
1638 -+
1639 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1640 -+ struct bfq_queue *bfqq,
1641 -+ struct request *last)
1642 -+{
1643 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
1644 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
1645 -+ struct request *next = NULL, *prev = NULL;
1646 -+
1647 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1648 -+
1649 -+ if (rbprev != NULL)
1650 -+ prev = rb_entry_rq(rbprev);
1651 -+
1652 -+ if (rbnext != NULL)
1653 -+ next = rb_entry_rq(rbnext);
1654 -+ else {
1655 -+ rbnext = rb_first(&bfqq->sort_list);
1656 -+ if (rbnext && rbnext != &last->rb_node)
1657 -+ next = rb_entry_rq(rbnext);
1658 -+ }
1659 -+
1660 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1661 -+}
1662 -+
1663 -+static void bfq_del_rq_rb(struct request *rq)
1664 -+{
1665 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1666 -+ struct bfq_data *bfqd = bfqq->bfqd;
1667 -+ const int sync = rq_is_sync(rq);
1668 -+
1669 -+ BUG_ON(bfqq->queued[sync] == 0);
1670 -+ bfqq->queued[sync]--;
1671 -+ bfqd->queued--;
1672 -+
1673 -+ elv_rb_del(&bfqq->sort_list, rq);
1674 -+
1675 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1676 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1677 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1678 -+ /*
1679 -+ * Remove queue from request-position tree as it is empty.
1680 -+ */
1681 -+ if (bfqq->pos_root != NULL) {
1682 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1683 -+ bfqq->pos_root = NULL;
1684 -+ }
1685 -+ }
1686 -+}
1687 -+
1688 -+/* see the definition of bfq_async_charge_factor for details */
1689 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
1690 -+ struct bfq_queue *bfqq)
1691 -+{
1692 -+ return blk_rq_sectors(rq) *
1693 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1694 -+ bfq_async_charge_factor));
1695 -+}
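
As a rough standalone illustration of bfq_serv_to_charge() above: sync requests (and requests of weight-raised queues) are charged their actual size in sectors, while async requests of non-raised queues are charged (1 + charge_factor) times as much. The charge factor used below is only an example value:

/* Userspace sketch of the charging rule; charge_factor is illustrative. */
#include <stdio.h>

static unsigned long serv_to_charge(unsigned long sectors, int sync,
                                    unsigned long raising_coeff,
                                    unsigned long charge_factor)
{
        return sectors * (1 + ((!sync) * (raising_coeff == 1) * charge_factor));
}

int main(void)
{
        printf("sync rq,  8 sectors        -> charge %lu\n",
               serv_to_charge(8, 1, 1, 10));
        printf("async rq, 8 sectors        -> charge %lu\n",
               serv_to_charge(8, 0, 1, 10));
        printf("async rq, raised queue     -> charge %lu\n",
               serv_to_charge(8, 0, 2, 10));
        return 0;
}
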
1696 -+
1697 -+/**
1698 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
1699 -+ * @bfqd: the device data the queue belongs to.
1700 -+ * @bfqq: the queue to update.
1701 -+ *
1702 -+ * If the first request of a queue changes we make sure that the queue
1703 -+ * has enough budget to serve at least its first request (if the
1704 -+ * request has grown). We do this because, if the queue does not have enough
1705 -+ * budget for its first request, it has to go through two dispatch
1706 -+ * rounds to actually get it dispatched.
1707 -+ */
1708 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
1709 -+ struct bfq_queue *bfqq)
1710 -+{
1711 -+ struct bfq_entity *entity = &bfqq->entity;
1712 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1713 -+ struct request *next_rq = bfqq->next_rq;
1714 -+ unsigned long new_budget;
1715 -+
1716 -+ if (next_rq == NULL)
1717 -+ return;
1718 -+
1719 -+ if (bfqq == bfqd->in_service_queue)
1720 -+ /*
1721 -+ * In order not to break guarantees, budgets cannot be
1722 -+ * changed after an entity has been selected.
1723 -+ */
1724 -+ return;
1725 -+
1726 -+ BUG_ON(entity->tree != &st->active);
1727 -+ BUG_ON(entity == entity->sched_data->active_entity);
1728 -+
1729 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
1730 -+ bfq_serv_to_charge(next_rq, bfqq));
1731 -+ entity->budget = new_budget;
1732 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1733 -+ bfq_activate_bfqq(bfqd, bfqq);
1734 -+}
1735 -+
1736 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1737 -+{
1738 -+ u64 dur;
1739 -+
1740 -+ if (bfqd->bfq_raising_max_time > 0)
1741 -+ return bfqd->bfq_raising_max_time;
1742 -+
1743 -+ dur = bfqd->RT_prod;
1744 -+ do_div(dur, bfqd->peak_rate);
1745 -+
1746 -+ return dur;
1747 -+}
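
The weight-raising duration above is simply RT_prod divided by the estimated peak rate, so slower devices keep queues boosted for longer. A plain userspace rendition with invented numbers (the units and constants here are illustrative only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t rt_prod   = 5000ULL * 1000; /* example reference product */
        uint64_t fast_rate = 1000;           /* example peak rates        */
        uint64_t slow_rate = 250;

        /* dur = RT_prod / peak_rate, as in bfq_wrais_duration() */
        printf("fast device: duration %llu\n",
               (unsigned long long)(rt_prod / fast_rate));
        printf("slow device: duration %llu\n",
               (unsigned long long)(rt_prod / slow_rate));
        return 0;
}
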
1748 -+
1749 -+static void bfq_add_rq_rb(struct request *rq)
1750 -+{
1751 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1752 -+ struct bfq_entity *entity = &bfqq->entity;
1753 -+ struct bfq_data *bfqd = bfqq->bfqd;
1754 -+ struct request *next_rq, *prev;
1755 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
1756 -+ int idle_for_long_time = 0;
1757 -+
1758 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1759 -+ bfqq->queued[rq_is_sync(rq)]++;
1760 -+ bfqd->queued++;
1761 -+
1762 -+ elv_rb_add(&bfqq->sort_list, rq);
1763 -+
1764 -+ /*
1765 -+ * Check if this request is a better next-serve candidate.
1766 -+ */
1767 -+ prev = bfqq->next_rq;
1768 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1769 -+ BUG_ON(next_rq == NULL);
1770 -+ bfqq->next_rq = next_rq;
1771 -+
1772 -+ /*
1773 -+ * Adjust priority tree position, if next_rq changes.
1774 -+ */
1775 -+ if (prev != bfqq->next_rq)
1776 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
1777 -+
1778 -+ if (!bfq_bfqq_busy(bfqq)) {
1779 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1780 -+ time_is_before_jiffies(bfqq->soft_rt_next_start);
1781 -+ idle_for_long_time = time_is_before_jiffies(
1782 -+ bfqq->budget_timeout +
1783 -+ bfqd->bfq_raising_min_idle_time);
1784 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
1785 -+ bfq_serv_to_charge(next_rq, bfqq));
1786 -+
1787 -+ if (!bfqd->low_latency)
1788 -+ goto add_bfqq_busy;
1789 -+
1790 -+ /*
1791 -+ * If the queue is not being boosted and has been idle
1792 -+ * for enough time, start a weight-raising period
1793 -+ */
1794 -+ if (old_raising_coeff == 1 &&
1795 -+ (idle_for_long_time || soft_rt)) {
1796 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1797 -+ if (idle_for_long_time)
1798 -+ bfqq->raising_cur_max_time =
1799 -+ bfq_wrais_duration(bfqd);
1800 -+ else
1801 -+ bfqq->raising_cur_max_time =
1802 -+ bfqd->bfq_raising_rt_max_time;
1803 -+ bfq_log_bfqq(bfqd, bfqq,
1804 -+ "wrais starting at %llu msec,"
1805 -+ "rais_max_time %u",
1806 -+ bfqq->last_rais_start_finish,
1807 -+ jiffies_to_msecs(bfqq->
1808 -+ raising_cur_max_time));
1809 -+ } else if (old_raising_coeff > 1) {
1810 -+ if (idle_for_long_time)
1811 -+ bfqq->raising_cur_max_time =
1812 -+ bfq_wrais_duration(bfqd);
1813 -+ else if (bfqq->raising_cur_max_time ==
1814 -+ bfqd->bfq_raising_rt_max_time &&
1815 -+ !soft_rt) {
1816 -+ bfqq->raising_coeff = 1;
1817 -+ bfq_log_bfqq(bfqd, bfqq,
1818 -+ "wrais ending at %llu msec,"
1819 -+ "rais_max_time %u",
1820 -+ bfqq->last_rais_start_finish,
1821 -+ jiffies_to_msecs(bfqq->
1822 -+ raising_cur_max_time));
1823 -+ } else if ((bfqq->last_rais_start_finish +
1824 -+ bfqq->raising_cur_max_time <
1825 -+ jiffies + bfqd->bfq_raising_rt_max_time) &&
1826 -+ soft_rt) {
1827 -+ /*
1828 -+ *
1829 -+ * The remaining weight-raising time is lower
1830 -+ * than bfqd->bfq_raising_rt_max_time, which
1831 -+ * means that the application is enjoying
1832 -+ * weight raising either because deemed soft rt
1833 -+ * in the near past, or because deemed
1834 -+ * interactive long ago. In both cases,
1835 -+ * resetting now the current remaining weight-
1836 -+ * raising time for the application to the
1837 -+ * weight-raising duration for soft rt
1838 -+ * applications would not cause any latency
1839 -+ * increase for the application (as the new
1840 -+ * duration would be higher than the remaining
1841 -+ * time).
1842 -+ *
1843 -+ * In addition, the application is now meeting
1844 -+ * the requirements for being deemed soft rt.
1845 -+ * In the end we can correctly and safely
1846 -+ * (re)charge the weight-raising duration for
1847 -+ * the application with the weight-raising
1848 -+ * duration for soft rt applications.
1849 -+ *
1850 -+ * In particular, doing this recharge now, i.e.,
1851 -+ * before the weight-raising period for the
1852 -+ * application finishes, reduces the probability
1853 -+ * of the following negative scenario:
1854 -+ * 1) the weight of a soft rt application is
1855 -+ * raised at startup (as for any newly
1856 -+ * created application),
1857 -+ * 2) since the application is not interactive,
1858 -+ * at a certain time weight-raising is
1859 -+ * stopped for the application,
1860 -+ * 3) at that time the application happens to
1861 -+ * still have pending requests, and hence
1862 -+ * is destined to not have a chance to be
1863 -+ * deemed soft rt before these requests are
1864 -+ * completed (see the comments to the
1865 -+ * function bfq_bfqq_softrt_next_start()
1866 -+ * for details on soft rt detection),
1867 -+ * 4) these pending requests experience a high
1868 -+ * latency because the application is not
1869 -+ * weight-raised while they are pending.
1870 -+ */
1871 -+ bfqq->last_rais_start_finish = jiffies;
1872 -+ bfqq->raising_cur_max_time =
1873 -+ bfqd->bfq_raising_rt_max_time;
1874 -+ }
1875 -+ }
1876 -+ if (old_raising_coeff != bfqq->raising_coeff)
1877 -+ entity->ioprio_changed = 1;
1878 -+add_bfqq_busy:
1879 -+ bfqq->last_idle_bklogged = jiffies;
1880 -+ bfqq->service_from_backlogged = 0;
1881 -+ bfq_clear_bfqq_softrt_update(bfqq);
1882 -+ bfq_add_bfqq_busy(bfqd, bfqq);
1883 -+ } else {
1884 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
1885 -+ !rq_is_sync(rq) &&
1886 -+ bfqq->last_rais_start_finish +
1887 -+ time_is_before_jiffies(
1888 -+ bfqd->bfq_raising_min_inter_arr_async)) {
1889 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1890 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1891 -+
1892 -+ bfqd->raised_busy_queues++;
1893 -+ entity->ioprio_changed = 1;
1894 -+ bfq_log_bfqq(bfqd, bfqq,
1895 -+ "non-idle wrais starting at %llu msec,"
1896 -+ "rais_max_time %u",
1897 -+ bfqq->last_rais_start_finish,
1898 -+ jiffies_to_msecs(bfqq->
1899 -+ raising_cur_max_time));
1900 -+ }
1901 -+ bfq_updated_next_req(bfqd, bfqq);
1902 -+ }
1903 -+
1904 -+ if (bfqd->low_latency &&
1905 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1906 -+ idle_for_long_time))
1907 -+ bfqq->last_rais_start_finish = jiffies;
1908 -+}
1909 -+
1910 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1911 -+{
1912 -+ elv_rb_del(&bfqq->sort_list, rq);
1913 -+ bfqq->queued[rq_is_sync(rq)]--;
1914 -+ bfqq->bfqd->queued--;
1915 -+ bfq_add_rq_rb(rq);
1916 -+}
1917 -+
1918 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1919 -+ struct bio *bio)
1920 -+{
1921 -+ struct task_struct *tsk = current;
1922 -+ struct bfq_io_cq *bic;
1923 -+ struct bfq_queue *bfqq;
1924 -+
1925 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1926 -+ if (bic == NULL)
1927 -+ return NULL;
1928 -+
1929 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1930 -+ if (bfqq != NULL)
1931 -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
1932 -+
1933 -+ return NULL;
1934 -+}
1935 -+
1936 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
1937 -+{
1938 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1939 -+
1940 -+ bfqd->rq_in_driver++;
1941 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1942 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1943 -+ (long long unsigned)bfqd->last_position);
1944 -+}
1945 -+
1946 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1947 -+{
1948 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1949 -+
1950 -+ WARN_ON(bfqd->rq_in_driver == 0);
1951 -+ bfqd->rq_in_driver--;
1952 -+}
1953 -+
1954 -+static void bfq_remove_request(struct request *rq)
1955 -+{
1956 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1957 -+ struct bfq_data *bfqd = bfqq->bfqd;
1958 -+
1959 -+ if (bfqq->next_rq == rq) {
1960 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1961 -+ bfq_updated_next_req(bfqd, bfqq);
1962 -+ }
1963 -+
1964 -+ list_del_init(&rq->queuelist);
1965 -+ bfq_del_rq_rb(rq);
1966 -+
1967 -+ if (rq->cmd_flags & REQ_META) {
1968 -+ WARN_ON(bfqq->meta_pending == 0);
1969 -+ bfqq->meta_pending--;
1970 -+ }
1971 -+}
1972 -+
1973 -+static int bfq_merge(struct request_queue *q, struct request **req,
1974 -+ struct bio *bio)
1975 -+{
1976 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1977 -+ struct request *__rq;
1978 -+
1979 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
1980 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1981 -+ *req = __rq;
1982 -+ return ELEVATOR_FRONT_MERGE;
1983 -+ }
1984 -+
1985 -+ return ELEVATOR_NO_MERGE;
1986 -+}
1987 -+
1988 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
1989 -+ int type)
1990 -+{
1991 -+ if (type == ELEVATOR_FRONT_MERGE) {
1992 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
1993 -+
1994 -+ bfq_reposition_rq_rb(bfqq, req);
1995 -+ }
1996 -+}
1997 -+
1998 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1999 -+ struct request *next)
2000 -+{
2001 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2002 -+
2003 -+ /*
2004 -+ * Reposition in fifo if next is older than rq.
2005 -+ */
2006 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2007 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
2008 -+ list_move(&rq->queuelist, &next->queuelist);
2009 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
2010 -+ }
2011 -+
2012 -+ if (bfqq->next_rq == next)
2013 -+ bfqq->next_rq = rq;
2014 -+
2015 -+ bfq_remove_request(next);
2016 -+}
2017 -+
2018 -+/* Must be called with bfqq != NULL */
2019 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
2020 -+{
2021 -+ BUG_ON(bfqq == NULL);
2022 -+ if (bfq_bfqq_busy(bfqq))
2023 -+ bfqq->bfqd->raised_busy_queues--;
2024 -+ bfqq->raising_coeff = 1;
2025 -+ bfqq->raising_cur_max_time = 0;
2026 -+ /* Trigger a weight change on the next activation of the queue */
2027 -+ bfqq->entity.ioprio_changed = 1;
2028 -+}
2029 -+
2030 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
2031 -+ struct bfq_group *bfqg)
2032 -+{
2033 -+ int i, j;
2034 -+
2035 -+ for (i = 0; i < 2; i++)
2036 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
2037 -+ if (bfqg->async_bfqq[i][j] != NULL)
2038 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
2039 -+ if (bfqg->async_idle_bfqq != NULL)
2040 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
2041 -+}
2042 -+
2043 -+static void bfq_end_raising(struct bfq_data *bfqd)
2044 -+{
2045 -+ struct bfq_queue *bfqq;
2046 -+
2047 -+ spin_lock_irq(bfqd->queue->queue_lock);
2048 -+
2049 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
2050 -+ bfq_bfqq_end_raising(bfqq);
2051 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2052 -+ bfq_bfqq_end_raising(bfqq);
2053 -+ bfq_end_raising_async(bfqd);
2054 -+
2055 -+ spin_unlock_irq(bfqd->queue->queue_lock);
2056 -+}
2057 -+
2058 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
2059 -+ struct bio *bio)
2060 -+{
2061 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2062 -+ struct bfq_io_cq *bic;
2063 -+ struct bfq_queue *bfqq;
2064 -+
2065 -+ /*
2066 -+ * Disallow merge of a sync bio into an async request.
2067 -+ */
2068 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
2069 -+ return 0;
2070 -+
2071 -+ /*
2072 -+ * Lookup the bfqq that this bio will be queued with. Allow
2073 -+ * merge only if rq is queued there.
2074 -+ * Queue lock is held here.
2075 -+ */
2076 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
2077 -+ if (bic == NULL)
2078 -+ return 0;
2079 -+
2080 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2081 -+ return bfqq == RQ_BFQQ(rq);
2082 -+}
2083 -+
2084 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2085 -+ struct bfq_queue *bfqq)
2086 -+{
2087 -+ if (bfqq != NULL) {
2088 -+ bfq_mark_bfqq_must_alloc(bfqq);
2089 -+ bfq_mark_bfqq_budget_new(bfqq);
2090 -+ bfq_clear_bfqq_fifo_expire(bfqq);
2091 -+
2092 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
2093 -+
2094 -+ bfq_log_bfqq(bfqd, bfqq,
2095 -+ "set_in_service_queue, cur-budget = %lu",
2096 -+ bfqq->entity.budget);
2097 -+ }
2098 -+
2099 -+ bfqd->in_service_queue = bfqq;
2100 -+}
2101 -+
2102 -+/*
2103 -+ * Get and set a new queue for service.
2104 -+ */
2105 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
2106 -+ struct bfq_queue *bfqq)
2107 -+{
2108 -+ if (!bfqq)
2109 -+ bfqq = bfq_get_next_queue(bfqd);
2110 -+ else
2111 -+ bfq_get_next_queue_forced(bfqd, bfqq);
2112 -+
2113 -+ __bfq_set_in_service_queue(bfqd, bfqq);
2114 -+ return bfqq;
2115 -+}
2116 -+
2117 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2118 -+ struct request *rq)
2119 -+{
2120 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
2121 -+ return blk_rq_pos(rq) - bfqd->last_position;
2122 -+ else
2123 -+ return bfqd->last_position - blk_rq_pos(rq);
2124 -+}
2125 -+
2126 -+/*
2127 -+ * Return true if bfqq has no request pending and rq is close enough to
2128 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
2129 -+ * bfqq->next_rq
2130 -+ */
2131 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2132 -+{
2133 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2134 -+}
2135 -+
2136 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2137 -+{
2138 -+ struct rb_root *root = &bfqd->rq_pos_tree;
2139 -+ struct rb_node *parent, *node;
2140 -+ struct bfq_queue *__bfqq;
2141 -+ sector_t sector = bfqd->last_position;
2142 -+
2143 -+ if (RB_EMPTY_ROOT(root))
2144 -+ return NULL;
2145 -+
2146 -+ /*
2147 -+ * First, if we find a request starting at the end of the last
2148 -+ * request, choose it.
2149 -+ */
2150 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2151 -+ if (__bfqq != NULL)
2152 -+ return __bfqq;
2153 -+
2154 -+ /*
2155 -+ * If the exact sector wasn't found, the parent of the NULL leaf
2156 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
2157 -+ * position).
2158 -+ */
2159 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2160 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2161 -+ return __bfqq;
2162 -+
2163 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
2164 -+ node = rb_next(&__bfqq->pos_node);
2165 -+ else
2166 -+ node = rb_prev(&__bfqq->pos_node);
2167 -+ if (node == NULL)
2168 -+ return NULL;
2169 -+
2170 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2171 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2172 -+ return __bfqq;
2173 -+
2174 -+ return NULL;
2175 -+}
2176 -+
2177 -+/*
2178 -+ * bfqd - obvious
2179 -+ * cur_bfqq - passed in so that we don't decide that the current queue
2180 -+ * is closely cooperating with itself.
2181 -+ *
2182 -+ * We are assuming that cur_bfqq has dispatched at least one request,
2183 -+ * and that bfqd->last_position reflects a position on the disk associated
2184 -+ * with the I/O issued by cur_bfqq.
2185 -+ */
2186 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2187 -+ struct bfq_queue *cur_bfqq)
2188 -+{
2189 -+ struct bfq_queue *bfqq;
2190 -+
2191 -+ if (bfq_class_idle(cur_bfqq))
2192 -+ return NULL;
2193 -+ if (!bfq_bfqq_sync(cur_bfqq))
2194 -+ return NULL;
2195 -+ if (BFQQ_SEEKY(cur_bfqq))
2196 -+ return NULL;
2197 -+
2198 -+ /* If device has only one backlogged bfq_queue, don't search. */
2199 -+ if (bfqd->busy_queues == 1)
2200 -+ return NULL;
2201 -+
2202 -+ /*
2203 -+ * We should notice if some of the queues are cooperating, e.g.
2204 -+ * working closely on the same area of the disk. In that case,
2205 -+ * we can group them together and not waste time idling.
2206 -+ */
2207 -+ bfqq = bfqq_close(bfqd);
2208 -+ if (bfqq == NULL || bfqq == cur_bfqq)
2209 -+ return NULL;
2210 -+
2211 -+ /*
2212 -+ * Do not merge queues from different bfq_groups.
2213 -+ */
2214 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2215 -+ return NULL;
2216 -+
2217 -+ /*
2218 -+ * It only makes sense to merge sync queues.
2219 -+ */
2220 -+ if (!bfq_bfqq_sync(bfqq))
2221 -+ return NULL;
2222 -+ if (BFQQ_SEEKY(bfqq))
2223 -+ return NULL;
2224 -+
2225 -+ /*
2226 -+ * Do not merge queues of different priority classes.
2227 -+ */
2228 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2229 -+ return NULL;
2230 -+
2231 -+ return bfqq;
2232 -+}
2233 -+
2234 -+/*
2235 -+ * If enough samples have been computed, return the current max budget
2236 -+ * stored in bfqd, which is dynamically updated according to the
2237 -+ * estimated disk peak rate; otherwise return the default max budget
2238 -+ */
2239 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2240 -+{
2241 -+ if (bfqd->budgets_assigned < 194)
2242 -+ return bfq_default_max_budget;
2243 -+ else
2244 -+ return bfqd->bfq_max_budget;
2245 -+}
2246 -+
2247 -+/*
2248 -+ * Return min budget, which is a fraction of the current or default
2249 -+ * max budget (trying with 1/32)
2250 -+ */
2251 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2252 -+{
2253 -+ if (bfqd->budgets_assigned < 194)
2254 -+ return bfq_default_max_budget / 32;
2255 -+ else
2256 -+ return bfqd->bfq_max_budget / 32;
2257 -+}
2258 -+
2259 -+/*
2260 -+ * Decides whether idling should be done for given device and
2261 -+ * given in-service queue.
2262 -+ */
2263 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2264 -+ struct bfq_queue *in_service_bfqq)
2265 -+{
2266 -+ if (in_service_bfqq == NULL)
2267 -+ return false;
2268 -+ /*
2269 -+ * If the device is an SSD it has no seek penalty; disable idling, but
2270 -+ * do so only if:
2271 -+ * - the device supports queuing, otherwise we would still have
2272 -+ * a problem with sync vs async workloads;
2273 -+ * - the queue is not weight-raised, to preserve guarantees.
2274 -+ */
2275 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2276 -+ in_service_bfqq->raising_coeff == 1);
2277 -+}
2278 -+
2279 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2280 -+{
2281 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2282 -+ struct bfq_io_cq *bic;
2283 -+ unsigned long sl;
2284 -+
2285 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2286 -+
2287 -+ /* Tasks have exited, don't wait. */
2288 -+ bic = bfqd->in_service_bic;
2289 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2290 -+ return;
2291 -+
2292 -+ bfq_mark_bfqq_wait_request(bfqq);
2293 -+
2294 -+ /*
2295 -+ * We don't want to idle for seeks, but we do want to allow
2296 -+ * fair distribution of slice time for a process doing back-to-back
2297 -+ * seeks. So allow a little bit of time for it to submit a new rq.
2298 -+ *
2299 -+ * To prevent processes with (partly) seeky workloads from
2300 -+ * being too ill-treated, grant them a small fraction of the
2301 -+ * assigned budget before reducing the waiting time to
2302 -+ * BFQ_MIN_TT. This happened to help reduce latency.
2303 -+ */
2304 -+ sl = bfqd->bfq_slice_idle;
2305 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2306 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2307 -+ bfqq->raising_coeff == 1)
2308 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2309 -+ else if (bfqq->raising_coeff > 1)
2310 -+ sl = sl * 3;
2311 -+ bfqd->last_idling_start = ktime_get();
2312 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2313 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
2314 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2315 -+}
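
A compact sketch of the idle-slice selection performed above: start from slice_idle, clamp it down to a small think-time value for seeky queues that have already consumed a fair share of their budget, and stretch it for weight-raised queues. The function below works in plain milliseconds and uses made-up constants rather than the patch's jiffies-based values:

#include <stdio.h>

static unsigned idle_slice_ms(unsigned slice_idle_ms, unsigned min_tt_ms,
                              int seeky, int used_fair_share,
                              unsigned raising_coeff)
{
        unsigned sl = slice_idle_ms;

        if (seeky && used_fair_share && raising_coeff == 1)
                sl = sl < min_tt_ms ? sl : min_tt_ms;  /* min(sl, BFQ_MIN_TT) */
        else if (raising_coeff > 1)
                sl = sl * 3;                           /* stretched for raised queues */
        return sl;
}

int main(void)
{
        printf("plain sync queue:    %u ms\n", idle_slice_ms(8, 2, 0, 0, 1));
        printf("seeky, budget used:  %u ms\n", idle_slice_ms(8, 2, 1, 1, 1));
        printf("weight-raised queue: %u ms\n", idle_slice_ms(8, 2, 0, 0, 3));
        return 0;
}
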
2316 -+
2317 -+/*
2318 -+ * Set the maximum time for the in-service queue to consume its
2319 -+ * budget. This prevents seeky processes from lowering the disk
2320 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
2321 -+ */
2322 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2323 -+{
2324 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2325 -+ unsigned int timeout_coeff;
2326 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2327 -+ timeout_coeff = 1;
2328 -+ else
2329 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2330 -+
2331 -+ bfqd->last_budget_start = ktime_get();
2332 -+
2333 -+ bfq_clear_bfqq_budget_new(bfqq);
2334 -+ bfqq->budget_timeout = jiffies +
2335 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2336 -+
2337 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2338 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2339 -+ timeout_coeff));
2340 -+}
2341 -+
2342 -+/*
2343 -+ * Move request from internal lists to the request queue dispatch list.
2344 -+ */
2345 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2346 -+{
2347 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2348 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2349 -+
2350 -+ bfq_remove_request(rq);
2351 -+ bfqq->dispatched++;
2352 -+ elv_dispatch_sort(q, rq);
2353 -+
2354 -+ if (bfq_bfqq_sync(bfqq))
2355 -+ bfqd->sync_flight++;
2356 -+}
2357 -+
2358 -+/*
2359 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
2360 -+ */
2361 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2362 -+{
2363 -+ struct request *rq = NULL;
2364 -+
2365 -+ if (bfq_bfqq_fifo_expire(bfqq))
2366 -+ return NULL;
2367 -+
2368 -+ bfq_mark_bfqq_fifo_expire(bfqq);
2369 -+
2370 -+ if (list_empty(&bfqq->fifo))
2371 -+ return NULL;
2372 -+
2373 -+ rq = rq_entry_fifo(bfqq->fifo.next);
2374 -+
2375 -+ if (time_before(jiffies, rq_fifo_time(rq)))
2376 -+ return NULL;
2377 -+
2378 -+ return rq;
2379 -+}
2380 -+
2381 -+/*
2382 -+ * Must be called with the queue_lock held.
2383 -+ */
2384 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
2385 -+{
2386 -+ int process_refs, io_refs;
2387 -+
2388 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2389 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2390 -+ BUG_ON(process_refs < 0);
2391 -+ return process_refs;
2392 -+}
2393 -+
2394 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2395 -+{
2396 -+ int process_refs, new_process_refs;
2397 -+ struct bfq_queue *__bfqq;
2398 -+
2399 -+ /*
2400 -+ * If there are no process references on the new_bfqq, then it is
2401 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2402 -+ * may have dropped their last reference (not just their last process
2403 -+ * reference).
2404 -+ */
2405 -+ if (!bfqq_process_refs(new_bfqq))
2406 -+ return;
2407 -+
2408 -+ /* Avoid a circular list and skip interim queue merges. */
2409 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
2410 -+ if (__bfqq == bfqq)
2411 -+ return;
2412 -+ new_bfqq = __bfqq;
2413 -+ }
2414 -+
2415 -+ process_refs = bfqq_process_refs(bfqq);
2416 -+ new_process_refs = bfqq_process_refs(new_bfqq);
2417 -+ /*
2418 -+ * If the process for the bfqq has gone away, there is no
2419 -+ * sense in merging the queues.
2420 -+ */
2421 -+ if (process_refs == 0 || new_process_refs == 0)
2422 -+ return;
2423 -+
2424 -+ /*
2425 -+ * Merge in the direction of the lesser amount of work.
2426 -+ */
2427 -+ if (new_process_refs >= process_refs) {
2428 -+ bfqq->new_bfqq = new_bfqq;
2429 -+ atomic_add(process_refs, &new_bfqq->ref);
2430 -+ } else {
2431 -+ new_bfqq->new_bfqq = bfqq;
2432 -+ atomic_add(new_process_refs, &bfqq->ref);
2433 -+ }
2434 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2435 -+ new_bfqq->pid);
2436 -+}
2437 -+
2438 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2439 -+{
2440 -+ struct bfq_entity *entity = &bfqq->entity;
2441 -+ return entity->budget - entity->service;
2442 -+}
2443 -+
2444 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2445 -+{
2446 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2447 -+
2448 -+ __bfq_bfqd_reset_in_service(bfqd);
2449 -+
2450 -+ /*
2451 -+ * If this bfqq is shared between multiple processes, check
2452 -+ * to make sure that those processes are still issuing I/Os
2453 -+ * within the mean seek distance. If not, it may be time to
2454 -+ * break the queues apart again.
2455 -+ */
2456 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2457 -+ bfq_mark_bfqq_split_coop(bfqq);
2458 -+
2459 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2460 -+ /*
2461 -+ * overloading the budget_timeout field to store the time
2462 -+ * at which the queue was left with no backlog; used by
2463 -+ * the weight-raising mechanism
2464 -+ */
2465 -+ bfqq->budget_timeout = jiffies;
2466 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2467 -+ } else {
2468 -+ bfq_activate_bfqq(bfqd, bfqq);
2469 -+ /*
2470 -+ * Resort priority tree of potential close cooperators.
2471 -+ */
2472 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
2473 -+ }
2474 -+}
2475 -+
2476 -+/**
2477 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2478 -+ * @bfqd: device data.
2479 -+ * @bfqq: queue to update.
2480 -+ * @reason: reason for expiration.
2481 -+ *
2482 -+ * Handle the feedback on @bfqq budget. See the body for detailed
2483 -+ * comments.
2484 -+ */
2485 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2486 -+ struct bfq_queue *bfqq,
2487 -+ enum bfqq_expiration reason)
2488 -+{
2489 -+ struct request *next_rq;
2490 -+ unsigned long budget, min_budget;
2491 -+
2492 -+ budget = bfqq->max_budget;
2493 -+ min_budget = bfq_min_budget(bfqd);
2494 -+
2495 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2496 -+
2497 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2498 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2499 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2500 -+ budget, bfq_min_budget(bfqd));
2501 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2502 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2503 -+
2504 -+ if (bfq_bfqq_sync(bfqq)) {
2505 -+ switch (reason) {
2506 -+ /*
2507 -+ * Caveat: in all the following cases we trade latency
2508 -+ * for throughput.
2509 -+ */
2510 -+ case BFQ_BFQQ_TOO_IDLE:
2511 -+ /*
2512 -+ * This is the only case where we may reduce
2513 -+ * the budget: if there are no requests of the
2514 -+ * process still waiting for completion, then
2515 -+ * we assume (tentatively) that the timer has
2516 -+ * expired because the batch of requests of
2517 -+ * the process could have been served with a
2518 -+ * smaller budget. Hence, betting that
2519 -+ * process will behave in the same way when it
2520 -+ * becomes backlogged again, we reduce its
2521 -+ * next budget. As long as we guess right,
2522 -+ * this budget cut reduces the latency
2523 -+ * experienced by the process.
2524 -+ *
2525 -+ * However, if there are still outstanding
2526 -+ * requests, then the process may have not yet
2527 -+ * issued its next request just because it is
2528 -+ * still waiting for the completion of some of
2529 -+ * the still outstanding ones. So in this
2530 -+ * subcase we do not reduce its budget, on the
2531 -+ * contrary we increase it to possibly boost
2532 -+ * the throughput, as discussed in the
2533 -+ * comments to the BUDGET_TIMEOUT case.
2534 -+ */
2535 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
2536 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2537 -+ else {
2538 -+ if (budget > 5 * min_budget)
2539 -+ budget -= 4 * min_budget;
2540 -+ else
2541 -+ budget = min_budget;
2542 -+ }
2543 -+ break;
2544 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
2545 -+ /*
2546 -+ * We double the budget here because: 1) it
2547 -+ * gives the chance to boost the throughput if
2548 -+ * this is not a seeky process (which may have
2549 -+ * bumped into this timeout because of, e.g.,
2550 -+ * ZBR), 2) together with charge_full_budget
2551 -+ * it helps give seeky processes higher
2552 -+ * timestamps, and hence be served less
2553 -+ * frequently.
2554 -+ */
2555 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2556 -+ break;
2557 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2558 -+ /*
2559 -+ * The process still has backlog, and did not
2560 -+ * let either the budget timeout or the disk
2561 -+ * idling timeout expire. Hence it is not
2562 -+ * seeky, has a short thinktime and may be
2563 -+ * happy with a higher budget too. So
2564 -+ * definitely increase the budget of this good
2565 -+ * candidate to boost the disk throughput.
2566 -+ */
2567 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
2568 -+ break;
2569 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
2570 -+ /*
2571 -+ * Leave the budget unchanged.
2572 -+ */
2573 -+ default:
2574 -+ return;
2575 -+ }
2576 -+ } else /* async queue */
2577 -+ /* async queues always get the maximum possible budget
2578 -+ * (their ability to dispatch is limited by
2579 -+ * @bfqd->bfq_max_budget_async_rq).
2580 -+ */
2581 -+ budget = bfqd->bfq_max_budget;
2582 -+
2583 -+ bfqq->max_budget = budget;
2584 -+
2585 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2586 -+ bfqq->max_budget > bfqd->bfq_max_budget)
2587 -+ bfqq->max_budget = bfqd->bfq_max_budget;
2588 -+
2589 -+ /*
2590 -+ * Make sure that we have enough budget for the next request.
2591 -+ * Since the finish time of the bfqq must be kept in sync with
2592 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
2593 -+ * update.
2594 -+ */
2595 -+ next_rq = bfqq->next_rq;
2596 -+ if (next_rq != NULL)
2597 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2598 -+ bfq_serv_to_charge(next_rq, bfqq));
2599 -+ else
2600 -+ bfqq->entity.budget = bfqq->max_budget;
2601 -+
2602 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2603 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2604 -+ bfqq->entity.budget);
2605 -+}
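
The budget feedback applied above to sync queues can be summarised in a few lines; the max/min budgets and the starting budget below are invented, and the BFQ_BFQQ_NO_MORE_REQUESTS case (budget left unchanged) is omitted for brevity:

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED };

static unsigned long recalc(unsigned long budget, unsigned long max_budget,
                            unsigned long min_budget, int dispatched,
                            enum reason r)
{
        switch (r) {
        case TOO_IDLE:
                if (dispatched > 0)             /* still outstanding reqs */
                        return budget * 2 < max_budget ? budget * 2 : max_budget;
                return budget > 5 * min_budget ? budget - 4 * min_budget
                                               : min_budget;
        case BUDGET_TIMEOUT:
                return budget * 2 < max_budget ? budget * 2 : max_budget;
        case BUDGET_EXHAUSTED:
                return budget * 4 < max_budget ? budget * 4 : max_budget;
        }
        return budget;
}

int main(void)
{
        unsigned long max = 16384, min = max / 32;

        printf("too idle, no reqs in flight: %lu -> %lu\n", 4096UL,
               recalc(4096, max, min, 0, TOO_IDLE));
        printf("budget timeout:              %lu -> %lu\n", 4096UL,
               recalc(4096, max, min, 0, BUDGET_TIMEOUT));
        printf("budget exhausted:            %lu -> %lu\n", 4096UL,
               recalc(4096, max, min, 0, BUDGET_EXHAUSTED));
        return 0;
}
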
2606 -+
2607 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2608 -+{
2609 -+ unsigned long max_budget;
2610 -+
2611 -+ /*
2612 -+ * The max_budget calculated when autotuning is equal to the
2613 -+ * number of sectors transferred in timeout_sync at the
2614 -+ * estimated peak rate.
2615 -+ */
2616 -+ max_budget = (unsigned long)(peak_rate * 1000 *
2617 -+ timeout >> BFQ_RATE_SHIFT);
2618 -+
2619 -+ return max_budget;
2620 -+}
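
A worked example of the autotuned budget above, i.e. the number of sectors transferable in timeout_sync at the estimated peak rate. The fixed-point shift and all numbers below are assumptions chosen for illustration, not the constants defined elsewhere in the patch:

#include <stdio.h>
#include <stdint.h>

#define EX_RATE_SHIFT 16   /* assumed fixed-point shift */

int main(void)
{
        /* peak rate stored as sectors/usec << EX_RATE_SHIFT (~100 MB/s here) */
        uint64_t peak_rate  = (uint64_t)(0.2 * (1 << EX_RATE_SHIFT));
        uint64_t timeout_ms = 125;   /* assumed sync budget timeout */

        uint64_t max_budget = (peak_rate * 1000 * timeout_ms) >> EX_RATE_SHIFT;
        printf("autotuned max budget: %llu sectors (~%llu KiB)\n",
               (unsigned long long)max_budget,
               (unsigned long long)(max_budget / 2));
        return 0;
}
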
2621 -+
2622 -+/*
2623 -+ * In addition to updating the peak rate, checks whether the process
2624 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
2625 -+ * to the budget timeout, to reduce the amount of service provided to
2626 -+ * seeky processes, and hence reduce their chances to lower the
2627 -+ * throughput. See the code for more details.
2628 -+ */
2629 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2630 -+ int compensate, enum bfqq_expiration reason)
2631 -+{
2632 -+ u64 bw, usecs, expected, timeout;
2633 -+ ktime_t delta;
2634 -+ int update = 0;
2635 -+
2636 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2637 -+ return 0;
2638 -+
2639 -+ if (compensate)
2640 -+ delta = bfqd->last_idling_start;
2641 -+ else
2642 -+ delta = ktime_get();
2643 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
2644 -+ usecs = ktime_to_us(delta);
2645 -+
2646 -+ /* Don't trust short/unrealistic values. */
2647 -+ if (usecs < 100 || usecs >= LONG_MAX)
2648 -+ return 0;
2649 -+
2650 -+ /*
2651 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
2652 -+ * value to store the peak rate, in sectors per usec in fixed
2653 -+ * point math. We do so to have enough precision in the estimate
2654 -+ * and to avoid overflows.
2655 -+ */
2656 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2657 -+ do_div(bw, (unsigned long)usecs);
2658 -+
2659 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2660 -+
2661 -+ /*
2662 -+ * Use only long (> 20ms) intervals to filter out spikes for
2663 -+ * the peak rate estimation.
2664 -+ */
2665 -+ if (usecs > 20000) {
2666 -+ if (bw > bfqd->peak_rate ||
2667 -+ (!BFQQ_SEEKY(bfqq) &&
2668 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2669 -+ bfq_log(bfqd, "measured bw =%llu", bw);
2670 -+ /*
2671 -+ * To smooth oscillations use a low-pass filter with
2672 -+ * alpha=7/8, i.e.,
2673 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
2674 -+ */
2675 -+ do_div(bw, 8);
2676 -+ if (bw == 0)
2677 -+ return 0;
2678 -+ bfqd->peak_rate *= 7;
2679 -+ do_div(bfqd->peak_rate, 8);
2680 -+ bfqd->peak_rate += bw;
2681 -+ update = 1;
2682 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2683 -+ }
2684 -+
2685 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2686 -+
2687 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2688 -+ bfqd->peak_rate_samples++;
2689 -+
2690 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2691 -+ update && bfqd->bfq_user_max_budget == 0) {
2692 -+ bfqd->bfq_max_budget =
2693 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2694 -+ bfq_log(bfqd, "new max_budget=%lu",
2695 -+ bfqd->bfq_max_budget);
2696 -+ }
2697 -+ }
2698 -+
2699 -+ /*
2700 -+ * If the process has been served for too short a time
2701 -+ * interval to let its possible sequential accesses prevail over
2702 -+ * the initial seek time needed to move the disk head to the
2703 -+ * first sector it requested, then give the process a chance
2704 -+ * and for the moment return false.
2705 -+ */
2706 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2707 -+ return 0;
2708 -+
2709 -+ /*
2710 -+ * A process is considered ``slow'' (i.e., seeky, so that we
2711 -+ * cannot treat it fairly in the service domain, as it would
2712 -+ * slow down the other processes too much) if, when a slice
2713 -+ * ends for whatever reason, it has received service at a
2714 -+ * rate that would not be high enough to complete the budget
2715 -+ * before the budget timeout expiration.
2716 -+ */
2717 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2718 -+
2719 -+ /*
2720 -+ * Caveat: processes doing IO in the slower disk zones will
2721 -+ * tend to be slow(er) even if not seeky. And the estimated
2722 -+ * peak rate will actually be an average over the disk
2723 -+ * surface. Hence, to not be too harsh with unlucky processes,
2724 -+ * we keep a budget/3 margin of safety before declaring a
2725 -+ * process slow.
2726 -+ */
2727 -+ return expected > (4 * bfqq->entity.budget) / 3;
2728 -+}
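
The low-pass filter mentioned in the comments above (new_rate = 7/8 * old_rate + 1/8 * bw) can be checked in isolation; both rates below are invented fixed-point sample values, and the rest of bfq_update_peak_rate() is not modelled:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t peak_rate = 20000;   /* current estimate  */
        uint64_t bw        = 28000;   /* newly measured bw */

        /* new_rate = (7/8) * old_rate + (1/8) * bw */
        bw /= 8;
        peak_rate *= 7;
        peak_rate /= 8;
        peak_rate += bw;

        printf("smoothed peak rate: %llu\n", (unsigned long long)peak_rate);
        return 0;
}
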
2729 -+
2730 -+/*
2731 -+ * To be deemed as soft real-time, an application must meet two requirements.
2732 -+ * The first is that the application must not require an average bandwidth
2733 -+ * higher than the approximate bandwidth required to playback or record a
2734 -+ * compressed high-definition video.
2735 -+ * The next function is invoked on the completion of the last request of a
2736 -+ * batch, to compute the next-start time instant, soft_rt_next_start, such
2737 -+ * that, if the next request of the application does not arrive before
2738 -+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
2739 -+ *
2740 -+ * The second requirement is that the request pattern of the application is
2741 -+ * isochronous, i.e., that, after issuing a request or a batch of requests, the
2742 -+ * application stops for a while, then issues a new batch, and so on. For this
2743 -+ * reason the next function is invoked to compute soft_rt_next_start only for
2744 -+ * applications that meet this requirement, whereas soft_rt_next_start is set
2745 -+ * to infinity for applications that do not.
2746 -+ *
2747 -+ * Unfortunately, even a greedy application may happen to behave in an
2748 -+ * isochronous way if several processes are competing for the CPUs. In fact,
2749 -+ * in this scenario the application stops issuing requests while the CPUs are
2750 -+ * busy serving other processes, then restarts, then stops again for a while,
2751 -+ * and so on. In addition, if the disk achieves a low enough throughput with
2752 -+ * the request pattern issued by the application, then the above bandwidth
2753 -+ * requirement may happen to be met too. To prevent such a greedy application
2754 -+ * from being deemed soft real-time, a further rule is used in the computation
2755 -+ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
2756 -+ * time plus the maximum time for which the arrival of a request is waited
2757 -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
2758 -+ * filters out greedy applications, as the latter issue instead their next
2759 -+ * request as soon as possible after the last one has been completed (in
2760 -+ * contrast, when a batch of requests is completed, a soft real-time
2761 -+ * application spends some time processing data).
2762 -+ *
2763 -+ * Actually, the last filter may easily generate false positives if: only
2764 -+ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
2765 -+ * both of the following two cases occur:
2766 -+ * 1) HZ is so low that the duration of a jiffie is comparable to or higher
2767 -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2768 -+ * HZ=100.
2769 -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2770 -+ * for a while, then suddenly 'jump' by several units to recover the lost
2771 -+ * increments. This seems to happen, e.g., inside virtual machines.
2772 -+ * To address this issue, we do not use as a reference time interval just
2773 -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2774 -+ * particular we add the minimum number of jiffies for which the filter seems
2775 -+ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
2776 -+ */
2777 -+static inline u64 bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2778 -+ struct bfq_queue *bfqq)
2779 -+{
2780 -+ return max(bfqq->last_idle_bklogged +
2781 -+ HZ * bfqq->service_from_backlogged /
2782 -+ bfqd->bfq_raising_max_softrt_rate,
2783 -+ (u64)jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2784 -+}
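
A standalone rendition of the soft real-time gate above: the next request must not arrive before the backlogged service would fit under bfq_raising_max_softrt_rate, nor before a few jiffies past the idle slice. HZ and all the values below are assumptions chosen for illustration:

#include <stdio.h>
#include <stdint.h>

#define EX_HZ 250   /* assumed tick rate */

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

int main(void)
{
        uint64_t jiffies_now        = 100000;  /* pretend current time        */
        uint64_t last_idle_bklogged = 99900;   /* when the backlog started    */
        uint64_t service            = 2048;    /* sectors served since then   */
        uint64_t max_softrt_rate    = 7000;    /* sectors/sec threshold       */
        uint64_t slice_idle         = 2;       /* jiffies                     */

        uint64_t next_start = max_u64(
                last_idle_bklogged + EX_HZ * service / max_softrt_rate,
                jiffies_now + slice_idle + 4);

        printf("soft_rt_next_start = %llu (now = %llu)\n",
               (unsigned long long)next_start,
               (unsigned long long)jiffies_now);
        return 0;
}
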
2785 -+
2786 -+/**
2787 -+ * bfq_bfqq_expire - expire a queue.
2788 -+ * @bfqd: device owning the queue.
2789 -+ * @bfqq: the queue to expire.
2790 -+ * @compensate: if true, compensate for the time spent idling.
2791 -+ * @reason: the reason causing the expiration.
2792 -+ *
2793 -+ *
2794 -+ * If the process associated to the queue is slow (i.e., seeky), or in
2795 -+ * case of budget timeout, or, finally, if it is async, we
2796 -+ * artificially charge it an entire budget (independently of the
2797 -+ * actual service it received). As a consequence, the queue will get
2798 -+ * higher timestamps than the correct ones upon reactivation, and
2799 -+ * hence it will be rescheduled as if it had received more service
2800 -+ * than what it actually received. In the end, this class of processes
2801 -+ * will receive less service in proportion to how slowly they consume
2802 -+ * their budgets (and hence how seriously they tend to lower the
2803 -+ * throughput).
2804 -+ *
2805 -+ * In contrast, when a queue expires because it has been idling for
2806 -+ * too much or because it exhausted its budget, we do not touch the
2807 -+ * amount of service it has received. Hence when the queue will be
2808 -+ * reactivated and its timestamps updated, the latter will be in sync
2809 -+ * with the actual service received by the queue until expiration.
2810 -+ *
2811 -+ * Charging a full budget to the first type of queues and the exact
2812 -+ * service to the others has the effect of using the WF2Q+ policy to
2813 -+ * schedule the former on a timeslice basis, without violating the
2814 -+ * service domain guarantees of the latter.
2815 -+ */
2816 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
2817 -+ struct bfq_queue *bfqq,
2818 -+ int compensate,
2819 -+ enum bfqq_expiration reason)
2820 -+{
2821 -+ int slow;
2822 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2823 -+
2824 -+ /* Update disk peak rate for autotuning and check whether the
2825 -+ * process is slow (see bfq_update_peak_rate).
2826 -+ */
2827 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2828 -+
2829 -+ /*
2830 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
2831 -+ * and async queues, to favor sequential sync workloads.
2832 -+ *
2833 -+ * Processes doing IO in the slower disk zones will tend to be
2834 -+ * slow(er) even if not seeky. Hence, since the estimated peak
2835 -+ * rate is actually an average over the disk surface, these
2836 -+ * processes may timeout just for bad luck. To avoid punishing
2837 -+ * them we do not charge a full budget to a process that
2838 -+ * succeeded in consuming at least 2/3 of its budget.
2839 -+ */
2840 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2841 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2842 -+ bfq_bfqq_charge_full_budget(bfqq);
2843 -+
2844 -+ bfqq->service_from_backlogged += bfqq->entity.service;
2845 -+
2846 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2847 -+ bfqq->last_rais_start_finish = jiffies;
2848 -+
2849 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2850 -+ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
2851 -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) {
2852 -+ /*
2853 -+ * If we get here, then the request pattern is
2854 -+ * isochronous (see the comments to the function
2855 -+ * bfq_bfqq_softrt_next_start()). However, if the
2856 -+ * queue still has in-flight requests, then it is
2857 -+ * better to postpone the computation of next_start
2858 -+ * to the next request completion. In fact, if we
2859 -+ * computed it now, then the application might pass
2860 -+ * the greedy-application filter improperly, because
2861 -+ * the arrival time of its next request may happen to be
2862 -+ * later than (jiffies + bfqq->bfqd->bfq_slice_idle)
2863 -+ * not because the application is truly soft real-
2864 -+ * time, but just because the application is currently
2865 -+ * waiting for the completion of some request before
2866 -+ * issuing, as quickly as possible, its next request.
2867 -+ */
2868 -+ if (bfqq->dispatched > 0) {
2869 -+ bfqq->soft_rt_next_start = -1;
2870 -+ bfq_mark_bfqq_softrt_update(bfqq);
2871 -+ } else
2872 -+ bfqq->soft_rt_next_start =
2873 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
2874 -+ } else
2875 -+ bfqq->soft_rt_next_start = -1; /* infinity */
2876 -+ }
2877 -+
2878 -+ bfq_log_bfqq(bfqd, bfqq,
2879 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2880 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2881 -+
2882 -+ /* Increase, decrease or leave budget unchanged according to reason */
2883 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2884 -+ __bfq_bfqq_expire(bfqd, bfqq);
2885 -+}
2886 -+
2887 -+/*
2888 -+ * Budget timeout is not implemented through a dedicated timer, but
2889 -+ * just checked on request arrivals and completions, as well as on
2890 -+ * idle timer expirations.
2891 -+ */
2892 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2893 -+{
2894 -+ if (bfq_bfqq_budget_new(bfqq))
2895 -+ return 0;
2896 -+
2897 -+ if (time_before(jiffies, bfqq->budget_timeout))
2898 -+ return 0;
2899 -+
2900 -+ return 1;
2901 -+}
2902 -+
2903 -+/*
2904 -+ * If we expire a queue that is waiting for the arrival of a new
2905 -+ * request, we may prevent the fictitious timestamp backshifting that
2906 -+ * allows the guarantees of the queue to be preserved (see [1] for
2907 -+ * this tricky aspect). Hence we return true only if this condition
2908 -+ * does not hold, or if the queue is slow enough to deserve only to be
2909 -+ * kicked off for preserving a high throughput.
2910 -+*/
2911 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2912 -+{
2913 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
2914 -+ "may_budget_timeout: wr %d left %d timeout %d",
2915 -+ bfq_bfqq_wait_request(bfqq),
2916 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2917 -+ bfq_bfqq_budget_timeout(bfqq));
2918 -+
2919 -+ return (!bfq_bfqq_wait_request(bfqq) ||
2920 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2921 -+ &&
2922 -+ bfq_bfqq_budget_timeout(bfqq);
2923 -+}
2924 -+
2925 -+/*
2926 -+ * For weight-raised queues issuing sync requests, idling is always performed,
2927 -+ * as this is instrumental in guaranteeing a high fraction of the throughput
2928 -+ * to these queues, and hence in guaranteeing a lower latency for their
2929 -+ * requests. See [1] for details.
2930 -+ *
2931 -+ * For non-weight-raised queues, idling is instead disabled if the device is
2932 -+ * NCQ-enabled and non-rotational, as this boosts the throughput on such
2933 -+ * devices.
2934 -+ */
2935 -+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2936 -+{
2937 -+ struct bfq_data *bfqd = bfqq->bfqd;
2938 -+
2939 -+ return bfq_bfqq_sync(bfqq) && (
2940 -+ bfqq->raising_coeff > 1 ||
2941 -+ (bfq_bfqq_idle_window(bfqq) &&
2942 -+ !(bfqd->hw_tag &&
2943 -+ (blk_queue_nonrot(bfqd->queue) ||
2944 -+ /*
2945 -+ * If there are weight-raised busy queues, then do not idle
2946 -+ * the disk for a sync non-weight-raised queue, and hence
2947 -+ * expire the queue immediately if empty. Combined with the
2948 -+ * timestamping rules of BFQ (see [1] for details), this
2949 -+ * causes sync non-weight-raised queues to get a lower
2950 -+ * fraction of the disk throughput, and hence reduces the rate
2951 -+ * at which the processes associated to these queues ask for
2952 -+ * requests from the request pool.
2953 -+ *
2954 -+ * This is beneficial for weight-raised processes, when the
2955 -+ * system operates in request-pool saturation conditions
2956 -+ * (e.g., in the presence of write hogs). In fact, if
2957 -+ * non-weight-raised processes ask for requests at a lower
2958 -+ * rate, then weight-raised processes have a higher
2959 -+ * probability to get a request from the pool immediately
2960 -+ * (or at least soon) when they need one. Hence they have a
2961 -+ * higher probability to actually get a fraction of the disk
2962 -+ * throughput proportional to their high weight. This is
2963 -+ * especially true with NCQ-enabled drives, which enqueue
2964 -+ * several requests in advance and further reorder
2965 -+ * internally-queued requests.
2966 -+ *
2967 -+ * Mistreating non-weight-raised queues in the above-described
2968 -+ * way, when there are busy weight-raised queues, seems to
2969 -+ * mitigate starvation problems in the presence of heavy write
2970 -+ * workloads and NCQ, and hence to guarantee a higher
2971 -+ * application and system responsiveness in these hostile
2972 -+ * scenarios.
2973 -+ */
2974 -+ bfqd->raised_busy_queues > 0)
2975 -+ )
2976 -+ )
2977 -+ );
2978 -+}
2979 -+
2980 -+/*
2981 -+ * If the in-service queue is empty, but it is sync and either of the following
2982 -+ * conditions holds, then: 1) the queue must remain in service and cannot be
2983 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
2984 -+ * of a new request for the queue. The conditions are:
2985 -+ * - the device is rotational and not performing NCQ, and the queue has its
2986 -+ * idle window set (in this case, waiting for a new request for the queue
2987 -+ * is likely to boost the disk throughput);
2988 -+ * - the queue is weight-raised (waiting for the request is necessary to
2989 -+ * provide the queue with fairness and latency guarantees, see [1] for
2990 -+ * details).
2991 -+ */
2992 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
2993 -+{
2994 -+ struct bfq_data *bfqd = bfqq->bfqd;
2995 -+
2996 -+ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
2997 -+ bfq_bfqq_must_not_expire(bfqq) &&
2998 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
2999 -+}
3000 -+
3001 -+/*
3002 -+ * Select a queue for service. If we have a current queue in service,
3003 -+ * check whether to continue servicing it, or retrieve and set a new one.
3004 -+ */
3005 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3006 -+{
3007 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3008 -+ struct request *next_rq;
3009 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3010 -+
3011 -+ bfqq = bfqd->in_service_queue;
3012 -+ if (bfqq == NULL)
3013 -+ goto new_queue;
3014 -+
3015 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3016 -+
3017 -+ /*
3018 -+ * If another queue has a request waiting within our mean seek
3019 -+ * distance, let it run. The expire code will check for close
3020 -+ * cooperators and put the close queue at the front of the
3021 -+ * service tree. If possible, merge the expiring queue with the
3022 -+ * new bfqq.
3023 -+ */
3024 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
3025 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
3026 -+ bfq_setup_merge(bfqq, new_bfqq);
3027 -+
3028 -+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
3029 -+ !timer_pending(&bfqd->idle_slice_timer) &&
3030 -+ !bfq_bfqq_must_idle(bfqq))
3031 -+ goto expire;
3032 -+
3033 -+ next_rq = bfqq->next_rq;
3034 -+ /*
3035 -+ * If bfqq has requests queued and it has enough budget left to
3036 -+ * serve them, keep the queue, otherwise expire it.
3037 -+ */
3038 -+ if (next_rq != NULL) {
3039 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
3040 -+ bfq_bfqq_budget_left(bfqq)) {
3041 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
3042 -+ goto expire;
3043 -+ } else {
3044 -+ /*
3045 -+ * The idle timer may be pending because we may not
3046 -+ * disable disk idling even when a new request arrives
3047 -+ */
3048 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
3049 -+ /*
3050 -+ * If we get here: 1) at least one new request
3051 -+ * has arrived but we have not disabled the
3052 -+ * timer because the request was too small,
3053 -+ * 2) then the block layer has unplugged the
3054 -+ * device, causing the dispatch to be invoked.
3055 -+ *
3056 -+ * Since the device is unplugged, now the
3057 -+ * requests are probably large enough to
3058 -+ * provide a reasonable throughput.
3059 -+ * So we disable idling.
3060 -+ */
3061 -+ bfq_clear_bfqq_wait_request(bfqq);
3062 -+ del_timer(&bfqd->idle_slice_timer);
3063 -+ }
3064 -+ if (new_bfqq == NULL)
3065 -+ goto keep_queue;
3066 -+ else
3067 -+ goto expire;
3068 -+ }
3069 -+ }
3070 -+
3071 -+ /*
3072 -+ * No requests pending. If the in-service queue has no cooperator and
3073 -+ * still has requests in flight (possibly waiting for a completion)
3074 -+ * or is idling for a new request, then keep it.
3075 -+ */
3076 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
3077 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
3078 -+ bfqq = NULL;
3079 -+ goto keep_queue;
3080 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
3081 -+ /*
3082 -+ * Expiring the queue because there is a close cooperator,
3083 -+ * cancel timer.
3084 -+ */
3085 -+ bfq_clear_bfqq_wait_request(bfqq);
3086 -+ del_timer(&bfqd->idle_slice_timer);
3087 -+ }
3088 -+
3089 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
3090 -+expire:
3091 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
3092 -+new_queue:
3093 -+ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
3094 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
3095 -+ bfqq != NULL ? bfqq->pid : 0);
3096 -+keep_queue:
3097 -+ return bfqq;
3098 -+}
3099 -+
3100 -+static void bfq_update_raising_data(struct bfq_data *bfqd,
3101 -+ struct bfq_queue *bfqq)
3102 -+{
3103 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
3104 -+ struct bfq_entity *entity = &bfqq->entity;
3105 -+
3106 -+ bfq_log_bfqq(bfqd, bfqq,
3107 -+ "raising period dur %u/%u msec, "
3108 -+ "old raising coeff %u, w %d(%d)",
3109 -+ jiffies_to_msecs(jiffies -
3110 -+ bfqq->last_rais_start_finish),
3111 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
3112 -+ bfqq->raising_coeff,
3113 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
3114 -+
3115 -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
3116 -+ entity->orig_weight * bfqq->raising_coeff);
3117 -+ if (entity->ioprio_changed)
3118 -+ bfq_log_bfqq(bfqd, bfqq,
3119 -+ "WARN: pending prio change");
3120 -+ /*
3121 -+ * If too much time has elapsed from the beginning
3122 -+ * of this weight-raising, stop it.
3123 -+ */
3124 -+ if (jiffies - bfqq->last_rais_start_finish >
3125 -+ bfqq->raising_cur_max_time) {
3126 -+ bfqq->last_rais_start_finish = jiffies;
3127 -+ bfq_log_bfqq(bfqd, bfqq,
3128 -+ "wrais ending at %llu msec, "
3129 -+ "rais_max_time %u",
3130 -+ bfqq->last_rais_start_finish,
3131 -+ jiffies_to_msecs(bfqq->
3132 -+ raising_cur_max_time));
3133 -+ bfq_bfqq_end_raising(bfqq);
3134 -+ __bfq_entity_update_weight_prio(
3135 -+ bfq_entity_service_tree(entity),
3136 -+ entity);
3137 -+ }
3138 -+ }
3139 -+}
3140 -+
3141 -+/*
3142 -+ * Dispatch one request from bfqq, moving it to the request queue
3143 -+ * dispatch list.
3144 -+ */
3145 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
3146 -+ struct bfq_queue *bfqq)
3147 -+{
3148 -+ int dispatched = 0;
3149 -+ struct request *rq;
3150 -+ unsigned long service_to_charge;
3151 -+
3152 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
3153 -+
3154 -+ /* Follow expired path, else get first next available. */
3155 -+ rq = bfq_check_fifo(bfqq);
3156 -+ if (rq == NULL)
3157 -+ rq = bfqq->next_rq;
3158 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
3159 -+
3160 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
3161 -+ /*
3162 -+ * This may happen if the next rq is chosen
3163 -+ * in fifo order instead of sector order.
3164 -+ * The budget is properly dimensioned
3165 -+ * to be always sufficient to serve the next request
3166 -+ * only if it is chosen in sector order. The reason is
3167 -+ * that it would be quite inefficient and of little use
3168 -+ * to always make sure that the budget is large enough
3169 -+ * to serve even the possible next rq in fifo order.
3170 -+ * In fact, requests are seldom served in fifo order.
3171 -+ *
3172 -+ * Expire the queue for budget exhaustion, and
3173 -+ * make sure that the next act_budget is enough
3174 -+ * to serve the next request, even if it comes
3175 -+ * from the fifo expired path.
3176 -+ */
3177 -+ bfqq->next_rq = rq;
3178 -+ /*
3179 -+ * Since this dispatch failed, make sure that
3180 -+ * a new one will be performed
3181 -+ */
3182 -+ if (!bfqd->rq_in_driver)
3183 -+ bfq_schedule_dispatch(bfqd);
3184 -+ goto expire;
3185 -+ }
3186 -+
3187 -+ /* Finally, insert request into driver dispatch list. */
3188 -+ bfq_bfqq_served(bfqq, service_to_charge);
3189 -+ bfq_dispatch_insert(bfqd->queue, rq);
3190 -+
3191 -+ bfq_update_raising_data(bfqd, bfqq);
3192 -+
3193 -+ bfq_log_bfqq(bfqd, bfqq,
3194 -+ "dispatched %u sec req (%llu), budg left %lu",
3195 -+ blk_rq_sectors(rq),
3196 -+ (long long unsigned)blk_rq_pos(rq),
3197 -+ bfq_bfqq_budget_left(bfqq));
3198 -+
3199 -+ dispatched++;
3200 -+
3201 -+ if (bfqd->in_service_bic == NULL) {
3202 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3203 -+ bfqd->in_service_bic = RQ_BIC(rq);
3204 -+ }
3205 -+
3206 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3207 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3208 -+ bfq_class_idle(bfqq)))
3209 -+ goto expire;
3210 -+
3211 -+ return dispatched;
3212 -+
3213 -+expire:
3214 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3215 -+ return dispatched;
3216 -+}
3217 -+
3218 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3219 -+{
3220 -+ int dispatched = 0;
3221 -+
3222 -+ while (bfqq->next_rq != NULL) {
3223 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3224 -+ dispatched++;
3225 -+ }
3226 -+
3227 -+ BUG_ON(!list_empty(&bfqq->fifo));
3228 -+ return dispatched;
3229 -+}
3230 -+
3231 -+/*
3232 -+ * Drain our current requests. Used for barriers and when switching
3233 -+ * io schedulers on-the-fly.
3234 -+ */
3235 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
3236 -+{
3237 -+ struct bfq_queue *bfqq, *n;
3238 -+ struct bfq_service_tree *st;
3239 -+ int dispatched = 0;
3240 -+
3241 -+ bfqq = bfqd->in_service_queue;
3242 -+ if (bfqq != NULL)
3243 -+ __bfq_bfqq_expire(bfqd, bfqq);
3244 -+
3245 -+ /*
3246 -+ * Loop through classes, and be careful to leave the scheduler
3247 -+ * in a consistent state, as feedback mechanisms and vtime
3248 -+ * updates cannot be disabled during the process.
3249 -+ */
3250 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3251 -+ st = bfq_entity_service_tree(&bfqq->entity);
3252 -+
3253 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3254 -+ bfqq->max_budget = bfq_max_budget(bfqd);
3255 -+
3256 -+ bfq_forget_idle(st);
3257 -+ }
3258 -+
3259 -+ BUG_ON(bfqd->busy_queues != 0);
3260 -+
3261 -+ return dispatched;
3262 -+}
3263 -+
3264 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
3265 -+{
3266 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3267 -+ struct bfq_queue *bfqq;
3268 -+ int max_dispatch;
3269 -+
3270 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3271 -+ if (bfqd->busy_queues == 0)
3272 -+ return 0;
3273 -+
3274 -+ if (unlikely(force))
3275 -+ return bfq_forced_dispatch(bfqd);
3276 -+
3277 -+ bfqq = bfq_select_queue(bfqd);
3278 -+ if (bfqq == NULL)
3279 -+ return 0;
3280 -+
3281 -+ max_dispatch = bfqd->bfq_quantum;
3282 -+ if (bfq_class_idle(bfqq))
3283 -+ max_dispatch = 1;
3284 -+
3285 -+ if (!bfq_bfqq_sync(bfqq))
3286 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
3287 -+
3288 -+ if (bfqq->dispatched >= max_dispatch) {
3289 -+ if (bfqd->busy_queues > 1)
3290 -+ return 0;
3291 -+ if (bfqq->dispatched >= 4 * max_dispatch)
3292 -+ return 0;
3293 -+ }
3294 -+
3295 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3296 -+ return 0;
3297 -+
3298 -+ bfq_clear_bfqq_wait_request(bfqq);
3299 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3300 -+
3301 -+ if (!bfq_dispatch_request(bfqd, bfqq))
3302 -+ return 0;
3303 -+
3304 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
3305 -+ bfqq->pid, max_dispatch);
3306 -+
3307 -+ return 1;
3308 -+}
3309 -+
3310 -+/*
3311 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
3312 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
3313 -+ *
3314 -+ * Queue lock must be held here.
3315 -+ */
3316 -+static void bfq_put_queue(struct bfq_queue *bfqq)
3317 -+{
3318 -+ struct bfq_data *bfqd = bfqq->bfqd;
3319 -+
3320 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3321 -+
3322 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3323 -+ atomic_read(&bfqq->ref));
3324 -+ if (!atomic_dec_and_test(&bfqq->ref))
3325 -+ return;
3326 -+
3327 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3328 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3329 -+ BUG_ON(bfqq->entity.tree != NULL);
3330 -+ BUG_ON(bfq_bfqq_busy(bfqq));
3331 -+ BUG_ON(bfqd->in_service_queue == bfqq);
3332 -+
3333 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3334 -+
3335 -+ kmem_cache_free(bfq_pool, bfqq);
3336 -+}
3337 -+
3338 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
3339 -+{
3340 -+ struct bfq_queue *__bfqq, *next;
3341 -+
3342 -+ /*
3343 -+ * If this queue was scheduled to merge with another queue, be
3344 -+ * sure to drop the reference taken on that queue (and others in
3345 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3346 -+ */
3347 -+ __bfqq = bfqq->new_bfqq;
3348 -+ while (__bfqq) {
3349 -+ if (__bfqq == bfqq) {
3350 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
3351 -+ break;
3352 -+ }
3353 -+ next = __bfqq->new_bfqq;
3354 -+ bfq_put_queue(__bfqq);
3355 -+ __bfqq = next;
3356 -+ }
3357 -+}
3358 -+
3359 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3360 -+{
3361 -+ if (bfqq == bfqd->in_service_queue) {
3362 -+ __bfq_bfqq_expire(bfqd, bfqq);
3363 -+ bfq_schedule_dispatch(bfqd);
3364 -+ }
3365 -+
3366 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3367 -+ atomic_read(&bfqq->ref));
3368 -+
3369 -+ bfq_put_cooperator(bfqq);
3370 -+
3371 -+ bfq_put_queue(bfqq);
3372 -+}
3373 -+
3374 -+static void bfq_init_icq(struct io_cq *icq)
3375 -+{
3376 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3377 -+
3378 -+ bic->ttime.last_end_request = jiffies;
3379 -+}
3380 -+
3381 -+static void bfq_exit_icq(struct io_cq *icq)
3382 -+{
3383 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3384 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
3385 -+
3386 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
3387 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3388 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
3389 -+ }
3390 -+
3391 -+ if (bic->bfqq[BLK_RW_SYNC]) {
3392 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3393 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
3394 -+ }
3395 -+}
3396 -+
3397 -+/*
3398 -+ * Update the entity prio values; note that the new values will not
3399 -+ * be used until the next (re)activation.
3400 -+ */
3401 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3402 -+{
3403 -+ struct task_struct *tsk = current;
3404 -+ int ioprio_class;
3405 -+
3406 -+ if (!bfq_bfqq_prio_changed(bfqq))
3407 -+ return;
3408 -+
3409 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3410 -+ switch (ioprio_class) {
3411 -+ default:
3412 -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3413 -+ "bfq: bad prio %x\n", ioprio_class);
3414 -+ case IOPRIO_CLASS_NONE:
3415 -+ /*
3416 -+ * No prio set, inherit CPU scheduling settings.
3417 -+ */
3418 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3419 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3420 -+ break;
3421 -+ case IOPRIO_CLASS_RT:
3422 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3423 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3424 -+ break;
3425 -+ case IOPRIO_CLASS_BE:
3426 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3427 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3428 -+ break;
3429 -+ case IOPRIO_CLASS_IDLE:
3430 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3431 -+ bfqq->entity.new_ioprio = 7;
3432 -+ bfq_clear_bfqq_idle_window(bfqq);
3433 -+ break;
3434 -+ }
3435 -+
3436 -+ bfqq->entity.ioprio_changed = 1;
3437 -+
3438 -+ /*
3439 -+ * Keep track of original prio settings in case we have to temporarily
3440 -+ * elevate the priority of this queue.
3441 -+ */
3442 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3443 -+ bfq_clear_bfqq_prio_changed(bfqq);
3444 -+}
3445 -+
3446 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3447 -+{
3448 -+ struct bfq_data *bfqd;
3449 -+ struct bfq_queue *bfqq, *new_bfqq;
3450 -+ struct bfq_group *bfqg;
3451 -+ unsigned long uninitialized_var(flags);
3452 -+ int ioprio = bic->icq.ioc->ioprio;
3453 -+
3454 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3455 -+ &flags);
3456 -+ /*
3457 -+ * This condition may trigger on a newly created bic; be sure to drop
3458 -+ * the lock before returning.
3459 -+ */
3460 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3461 -+ goto out;
3462 -+
3463 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
3464 -+ if (bfqq != NULL) {
3465 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3466 -+ sched_data);
3467 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3468 -+ GFP_ATOMIC);
3469 -+ if (new_bfqq != NULL) {
3470 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3471 -+ bfq_log_bfqq(bfqd, bfqq,
3472 -+ "changed_ioprio: bfqq %p %d",
3473 -+ bfqq, atomic_read(&bfqq->ref));
3474 -+ bfq_put_queue(bfqq);
3475 -+ }
3476 -+ }
3477 -+
3478 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
3479 -+ if (bfqq != NULL)
3480 -+ bfq_mark_bfqq_prio_changed(bfqq);
3481 -+
3482 -+ bic->ioprio = ioprio;
3483 -+
3484 -+out:
3485 -+ bfq_put_bfqd_unlock(bfqd, &flags);
3486 -+}
3487 -+
3488 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3489 -+ pid_t pid, int is_sync)
3490 -+{
3491 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3492 -+ INIT_LIST_HEAD(&bfqq->fifo);
3493 -+
3494 -+ atomic_set(&bfqq->ref, 0);
3495 -+ bfqq->bfqd = bfqd;
3496 -+
3497 -+ bfq_mark_bfqq_prio_changed(bfqq);
3498 -+
3499 -+ if (is_sync) {
3500 -+ if (!bfq_class_idle(bfqq))
3501 -+ bfq_mark_bfqq_idle_window(bfqq);
3502 -+ bfq_mark_bfqq_sync(bfqq);
3503 -+ }
3504 -+
3505 -+ /* Tentative initial value to trade off between thr and lat */
3506 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3507 -+ bfqq->pid = pid;
3508 -+
3509 -+ bfqq->raising_coeff = 1;
3510 -+ bfqq->last_rais_start_finish = 0;
3511 -+ bfqq->soft_rt_next_start = -1;
3512 -+}
3513 -+
3514 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3515 -+ struct bfq_group *bfqg,
3516 -+ int is_sync,
3517 -+ struct bfq_io_cq *bic,
3518 -+ gfp_t gfp_mask)
3519 -+{
3520 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3521 -+
3522 -+retry:
3523 -+ /* bic always exists here */
3524 -+ bfqq = bic_to_bfqq(bic, is_sync);
3525 -+
3526 -+ /*
3527 -+ * Always try a new alloc if we fall back to the OOM bfqq
3528 -+ * originally, since it should just be a temporary situation.
3529 -+ */
3530 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3531 -+ bfqq = NULL;
3532 -+ if (new_bfqq != NULL) {
3533 -+ bfqq = new_bfqq;
3534 -+ new_bfqq = NULL;
3535 -+ } else if (gfp_mask & __GFP_WAIT) {
3536 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3537 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3538 -+ gfp_mask | __GFP_ZERO,
3539 -+ bfqd->queue->node);
3540 -+ spin_lock_irq(bfqd->queue->queue_lock);
3541 -+ if (new_bfqq != NULL)
3542 -+ goto retry;
3543 -+ } else {
3544 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
3545 -+ gfp_mask | __GFP_ZERO,
3546 -+ bfqd->queue->node);
3547 -+ }
3548 -+
3549 -+ if (bfqq != NULL) {
3550 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3551 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
3552 -+ } else {
3553 -+ bfqq = &bfqd->oom_bfqq;
3554 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3555 -+ }
3556 -+
3557 -+ bfq_init_prio_data(bfqq, bic);
3558 -+ bfq_init_entity(&bfqq->entity, bfqg);
3559 -+ }
3560 -+
3561 -+ if (new_bfqq != NULL)
3562 -+ kmem_cache_free(bfq_pool, new_bfqq);
3563 -+
3564 -+ return bfqq;
3565 -+}
3566 -+
3567 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3568 -+ struct bfq_group *bfqg,
3569 -+ int ioprio_class, int ioprio)
3570 -+{
3571 -+ switch (ioprio_class) {
3572 -+ case IOPRIO_CLASS_RT:
3573 -+ return &bfqg->async_bfqq[0][ioprio];
3574 -+ case IOPRIO_CLASS_NONE:
3575 -+ ioprio = IOPRIO_NORM;
3576 -+ /* fall through */
3577 -+ case IOPRIO_CLASS_BE:
3578 -+ return &bfqg->async_bfqq[1][ioprio];
3579 -+ case IOPRIO_CLASS_IDLE:
3580 -+ return &bfqg->async_idle_bfqq;
3581 -+ default:
3582 -+ BUG();
3583 -+ }
3584 -+}
3585 -+
3586 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3587 -+ struct bfq_group *bfqg, int is_sync,
3588 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
3589 -+{
3590 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3591 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3592 -+ struct bfq_queue **async_bfqq = NULL;
3593 -+ struct bfq_queue *bfqq = NULL;
3594 -+
3595 -+ if (!is_sync) {
3596 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3597 -+ ioprio);
3598 -+ bfqq = *async_bfqq;
3599 -+ }
3600 -+
3601 -+ if (bfqq == NULL)
3602 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3603 -+
3604 -+ /*
3605 -+ * Pin the queue now that it's allocated; scheduler exit will prune it.
3606 -+ */
3607 -+ if (!is_sync && *async_bfqq == NULL) {
3608 -+ atomic_inc(&bfqq->ref);
3609 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3610 -+ bfqq, atomic_read(&bfqq->ref));
3611 -+ *async_bfqq = bfqq;
3612 -+ }
3613 -+
3614 -+ atomic_inc(&bfqq->ref);
3615 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3616 -+ atomic_read(&bfqq->ref));
3617 -+ return bfqq;
3618 -+}
3619 -+
3620 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3621 -+ struct bfq_io_cq *bic)
3622 -+{
3623 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3624 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3625 -+
3626 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3627 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3628 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3629 -+ bic->ttime.ttime_samples;
3630 -+}
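
The think-time update above is a fixed-point exponentially weighted moving average: the sample count and the running total are both kept scaled by 256, each update retains 7/8 of the previous value, and the mean divides the two with +128 (half of the 256 scale) added for rounding. The following user-space sketch mirrors that arithmetic; it is illustrative only, not part of the patch, and the function and variable names are invented here.

  #include <stdio.h>

  /* Illustrative mirror of the 7/8-decay, 256-scaled EWMA used above. */
  static unsigned long ttime_samples, ttime_total;

  static void ttime_sample(unsigned long ttime)
  {
          ttime_samples = (7 * ttime_samples + 256) / 8;
          ttime_total   = (7 * ttime_total + 256 * ttime) / 8;
  }

  int main(void)
  {
          unsigned long samples[] = { 4, 4, 12, 4 };
          unsigned int i;

          for (i = 0; i < 4; i++)
                  ttime_sample(samples[i]);

          /* Same rounding as the scheduler: +128 is half of the 256 scale. */
          printf("mean think time: %lu\n",
                 (ttime_total + 128) / ttime_samples);
          return 0;
  }
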
3631 -+
3632 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
3633 -+ struct bfq_queue *bfqq,
3634 -+ struct request *rq)
3635 -+{
3636 -+ sector_t sdist;
3637 -+ u64 total;
3638 -+
3639 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
3640 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3641 -+ else
3642 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3643 -+
3644 -+ /*
3645 -+ * Don't allow the seek distance to get too large from the
3646 -+ * odd fragment, pagein, etc.
3647 -+ */
3648 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3649 -+ sdist = 0;
3650 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
3651 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3652 -+ else
3653 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3654 -+
3655 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3656 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3657 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
3658 -+ do_div(total, bfqq->seek_samples);
3659 -+ bfqq->seek_mean = (sector_t)total;
3660 -+
3661 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3662 -+ (u64)bfqq->seek_mean);
3663 -+}
3664 -+
3665 -+/*
3666 -+ * Disable idle window if the process thinks too long or seeks so much that
3667 -+ * it doesn't matter.
3668 -+ */
3669 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
3670 -+ struct bfq_queue *bfqq,
3671 -+ struct bfq_io_cq *bic)
3672 -+{
3673 -+ int enable_idle;
3674 -+
3675 -+ /* Don't idle for async or idle io prio class. */
3676 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3677 -+ return;
3678 -+
3679 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
3680 -+
3681 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3682 -+ bfqd->bfq_slice_idle == 0 ||
3683 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3684 -+ bfqq->raising_coeff == 1))
3685 -+ enable_idle = 0;
3686 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3687 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3688 -+ bfqq->raising_coeff == 1)
3689 -+ enable_idle = 0;
3690 -+ else
3691 -+ enable_idle = 1;
3692 -+ }
3693 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3694 -+ enable_idle);
3695 -+
3696 -+ if (enable_idle)
3697 -+ bfq_mark_bfqq_idle_window(bfqq);
3698 -+ else
3699 -+ bfq_clear_bfqq_idle_window(bfqq);
3700 -+}
3701 -+
3702 -+/*
3703 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
3704 -+ * something we should do about it.
3705 -+ */
3706 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3707 -+ struct request *rq)
3708 -+{
3709 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
3710 -+
3711 -+ if (rq->cmd_flags & REQ_META)
3712 -+ bfqq->meta_pending++;
3713 -+
3714 -+ bfq_update_io_thinktime(bfqd, bic);
3715 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
3716 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3717 -+ !BFQQ_SEEKY(bfqq))
3718 -+ bfq_update_idle_window(bfqd, bfqq, bic);
3719 -+
3720 -+ bfq_log_bfqq(bfqd, bfqq,
3721 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3722 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3723 -+ (long long unsigned)bfqq->seek_mean);
3724 -+
3725 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3726 -+
3727 -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3728 -+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3729 -+ blk_rq_sectors(rq) < 32;
3730 -+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3731 -+
3732 -+ /*
3733 -+ * There is just this request queued: if the request
3734 -+ * is small and the queue is not to be expired, then
3735 -+ * just exit.
3736 -+ *
3737 -+ * In this way, if the disk is being idled to wait for
3738 -+ * a new request from the in-service queue, we avoid
3739 -+ * unplugging the device and committing the disk to serve
3740 -+ * just a small request. Instead, we wait for
3741 -+ * the block layer to decide when to unplug the device:
3742 -+ * hopefully, new requests will be merged to this one
3743 -+ * quickly, then the device will be unplugged and
3744 -+ * larger requests will be dispatched.
3745 -+ */
3746 -+ if (small_req && !budget_timeout)
3747 -+ return;
3748 -+
3749 -+ /*
3750 -+ * A large enough request arrived, or the queue is to
3751 -+ * be expired: in both cases disk idling is to be
3752 -+ * stopped, so clear wait_request flag and reset
3753 -+ * timer.
3754 -+ */
3755 -+ bfq_clear_bfqq_wait_request(bfqq);
3756 -+ del_timer(&bfqd->idle_slice_timer);
3757 -+
3758 -+ /*
3759 -+ * The queue is not empty, because a new request just
3760 -+ * arrived. Hence we can safely expire the queue, in
3761 -+ * case of budget timeout, without risking that the
3762 -+ * timestamps of the queue are not updated correctly.
3763 -+ * See [1] for more details.
3764 -+ */
3765 -+ if (budget_timeout)
3766 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3767 -+
3768 -+ /*
3769 -+ * Let the request rip immediately, or let a new queue be
3770 -+ * selected if bfqq has just been expired.
3771 -+ */
3772 -+ __blk_run_queue(bfqd->queue);
3773 -+ }
3774 -+}
3775 -+
3776 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
3777 -+{
3778 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3779 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3780 -+
3781 -+ assert_spin_locked(bfqd->queue->queue_lock);
3782 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3783 -+
3784 -+ bfq_add_rq_rb(rq);
3785 -+
3786 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3787 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
3788 -+
3789 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
3790 -+}
3791 -+
3792 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
3793 -+{
3794 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3795 -+ bfqd->rq_in_driver);
3796 -+
3797 -+ if (bfqd->hw_tag == 1)
3798 -+ return;
3799 -+
3800 -+ /*
3801 -+ * This sample is valid if the number of outstanding requests
3802 -+ * is large enough to allow a queueing behavior. Note that the
3803 -+ * sum is not exact, as it's not taking into account deactivated
3804 -+ * requests.
3805 -+ */
3806 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3807 -+ return;
3808 -+
3809 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3810 -+ return;
3811 -+
3812 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3813 -+ bfqd->max_rq_in_driver = 0;
3814 -+ bfqd->hw_tag_samples = 0;
3815 -+}
3816 -+
3817 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
3818 -+{
3819 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3820 -+ struct bfq_data *bfqd = bfqq->bfqd;
3821 -+ const int sync = rq_is_sync(rq);
3822 -+
3823 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3824 -+ blk_rq_sectors(rq), sync);
3825 -+
3826 -+ bfq_update_hw_tag(bfqd);
3827 -+
3828 -+ WARN_ON(!bfqd->rq_in_driver);
3829 -+ WARN_ON(!bfqq->dispatched);
3830 -+ bfqd->rq_in_driver--;
3831 -+ bfqq->dispatched--;
3832 -+
3833 -+ if (bfq_bfqq_sync(bfqq))
3834 -+ bfqd->sync_flight--;
3835 -+
3836 -+ if (sync)
3837 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3838 -+
3839 -+ /*
3840 -+ * The computation of softrt_next_start was scheduled for the next
3841 -+ * request completion: it is now time to compute it.
3842 -+ */
3843 -+ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
3844 -+ bfqq->soft_rt_next_start =
3845 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3846 -+
3847 -+ /*
3848 -+ * If this is the in-service queue, check if it needs to be expired,
3849 -+ * or if we want to idle in case it has no pending requests.
3850 -+ */
3851 -+ if (bfqd->in_service_queue == bfqq) {
3852 -+ if (bfq_bfqq_budget_new(bfqq))
3853 -+ bfq_set_budget_timeout(bfqd);
3854 -+
3855 -+ if (bfq_bfqq_must_idle(bfqq)) {
3856 -+ bfq_arm_slice_timer(bfqd);
3857 -+ goto out;
3858 -+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
3859 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3860 -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3861 -+ (bfqq->dispatched == 0 ||
3862 -+ !bfq_bfqq_must_not_expire(bfqq)))
3863 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
3864 -+ BFQ_BFQQ_NO_MORE_REQUESTS);
3865 -+ }
3866 -+
3867 -+ if (!bfqd->rq_in_driver)
3868 -+ bfq_schedule_dispatch(bfqd);
3869 -+
3870 -+out:
3871 -+ return;
3872 -+}
3873 -+
3874 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3875 -+{
3876 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3877 -+ bfq_clear_bfqq_must_alloc(bfqq);
3878 -+ return ELV_MQUEUE_MUST;
3879 -+ }
3880 -+
3881 -+ return ELV_MQUEUE_MAY;
3882 -+}
3883 -+
3884 -+static int bfq_may_queue(struct request_queue *q, int rw)
3885 -+{
3886 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3887 -+ struct task_struct *tsk = current;
3888 -+ struct bfq_io_cq *bic;
3889 -+ struct bfq_queue *bfqq;
3890 -+
3891 -+ /*
3892 -+ * Don't force setup of a queue from here, as a call to may_queue
3893 -+ * does not necessarily imply that a request actually will be queued.
3894 -+ * So just look up a possibly existing queue, or return 'may queue'
3895 -+ * if that fails.
3896 -+ */
3897 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3898 -+ if (bic == NULL)
3899 -+ return ELV_MQUEUE_MAY;
3900 -+
3901 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3902 -+ if (bfqq != NULL) {
3903 -+ bfq_init_prio_data(bfqq, bic);
3904 -+
3905 -+ return __bfq_may_queue(bfqq);
3906 -+ }
3907 -+
3908 -+ return ELV_MQUEUE_MAY;
3909 -+}
3910 -+
3911 -+/*
3912 -+ * Queue lock held here.
3913 -+ */
3914 -+static void bfq_put_request(struct request *rq)
3915 -+{
3916 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3917 -+
3918 -+ if (bfqq != NULL) {
3919 -+ const int rw = rq_data_dir(rq);
3920 -+
3921 -+ BUG_ON(!bfqq->allocated[rw]);
3922 -+ bfqq->allocated[rw]--;
3923 -+
3924 -+ rq->elv.priv[0] = NULL;
3925 -+ rq->elv.priv[1] = NULL;
3926 -+
3927 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3928 -+ bfqq, atomic_read(&bfqq->ref));
3929 -+ bfq_put_queue(bfqq);
3930 -+ }
3931 -+}
3932 -+
3933 -+static struct bfq_queue *
3934 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3935 -+ struct bfq_queue *bfqq)
3936 -+{
3937 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3938 -+ (long unsigned)bfqq->new_bfqq->pid);
3939 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3940 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3941 -+ bfq_put_queue(bfqq);
3942 -+ return bic_to_bfqq(bic, 1);
3943 -+}
3944 -+
3945 -+/*
3946 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3947 -+ * was the last process referring to said bfqq.
3948 -+ */
3949 -+static struct bfq_queue *
3950 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3951 -+{
3952 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3953 -+ if (bfqq_process_refs(bfqq) == 1) {
3954 -+ bfqq->pid = current->pid;
3955 -+ bfq_clear_bfqq_coop(bfqq);
3956 -+ bfq_clear_bfqq_split_coop(bfqq);
3957 -+ return bfqq;
3958 -+ }
3959 -+
3960 -+ bic_set_bfqq(bic, NULL, 1);
3961 -+
3962 -+ bfq_put_cooperator(bfqq);
3963 -+
3964 -+ bfq_put_queue(bfqq);
3965 -+ return NULL;
3966 -+}
3967 -+
3968 -+/*
3969 -+ * Allocate bfq data structures associated with this request.
3970 -+ */
3971 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
3972 -+ struct bio *bio, gfp_t gfp_mask)
3973 -+{
3974 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3975 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3976 -+ const int rw = rq_data_dir(rq);
3977 -+ const int is_sync = rq_is_sync(rq);
3978 -+ struct bfq_queue *bfqq;
3979 -+ struct bfq_group *bfqg;
3980 -+ unsigned long flags;
3981 -+
3982 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
3983 -+
3984 -+ bfq_changed_ioprio(bic);
3985 -+
3986 -+ spin_lock_irqsave(q->queue_lock, flags);
3987 -+
3988 -+ if (bic == NULL)
3989 -+ goto queue_fail;
3990 -+
3991 -+ bfqg = bfq_bic_update_cgroup(bic);
3992 -+
3993 -+new_queue:
3994 -+ bfqq = bic_to_bfqq(bic, is_sync);
3995 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3996 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3997 -+ bic_set_bfqq(bic, bfqq, is_sync);
3998 -+ } else {
3999 -+ /*
4000 -+ * If the queue was seeky for too long, break it apart.
4001 -+ */
4002 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4003 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4004 -+ bfqq = bfq_split_bfqq(bic, bfqq);
4005 -+ if (!bfqq)
4006 -+ goto new_queue;
4007 -+ }
4008 -+
4009 -+ /*
4010 -+ * Check to see if this queue is scheduled to merge with
4011 -+ * another closely cooperating queue. The merging of queues
4012 -+ * happens here as it must be done in process context.
4013 -+ * The reference on new_bfqq was taken in merge_bfqqs.
4014 -+ */
4015 -+ if (bfqq->new_bfqq != NULL)
4016 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
4017 -+ }
4018 -+
4019 -+ bfqq->allocated[rw]++;
4020 -+ atomic_inc(&bfqq->ref);
4021 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
4022 -+ atomic_read(&bfqq->ref));
4023 -+
4024 -+ rq->elv.priv[0] = bic;
4025 -+ rq->elv.priv[1] = bfqq;
4026 -+
4027 -+ spin_unlock_irqrestore(q->queue_lock, flags);
4028 -+
4029 -+ return 0;
4030 -+
4031 -+queue_fail:
4032 -+ bfq_schedule_dispatch(bfqd);
4033 -+ spin_unlock_irqrestore(q->queue_lock, flags);
4034 -+
4035 -+ return 1;
4036 -+}
4037 -+
4038 -+static void bfq_kick_queue(struct work_struct *work)
4039 -+{
4040 -+ struct bfq_data *bfqd =
4041 -+ container_of(work, struct bfq_data, unplug_work);
4042 -+ struct request_queue *q = bfqd->queue;
4043 -+
4044 -+ spin_lock_irq(q->queue_lock);
4045 -+ __blk_run_queue(q);
4046 -+ spin_unlock_irq(q->queue_lock);
4047 -+}
4048 -+
4049 -+/*
4050 -+ * Handler of the expiration of the timer running if the in-service queue
4051 -+ * is idling inside its time slice.
4052 -+ */
4053 -+static void bfq_idle_slice_timer(unsigned long data)
4054 -+{
4055 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
4056 -+ struct bfq_queue *bfqq;
4057 -+ unsigned long flags;
4058 -+ enum bfqq_expiration reason;
4059 -+
4060 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
4061 -+
4062 -+ bfqq = bfqd->in_service_queue;
4063 -+ /*
4064 -+ * Theoretical race here: the in-service queue can be NULL or different
4065 -+ * from the queue that was idling if the timer handler spins on
4066 -+ * the queue_lock and a new request arrives for the current
4067 -+ * queue and there is a full dispatch cycle that changes the
4068 -+ * in-service queue. This is unlikely to happen, but in the worst case
4069 -+ * we just expire a queue too early.
4070 -+ */
4071 -+ if (bfqq != NULL) {
4072 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
4073 -+ if (bfq_bfqq_budget_timeout(bfqq))
4074 -+ /*
4075 -+ * Also here the queue can be safely expired
4076 -+ * for budget timeout without wasting
4077 -+ * guarantees
4078 -+ */
4079 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
4080 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4081 -+ /*
4082 -+ * The queue may not be empty upon timer expiration,
4083 -+ * because we may not disable the timer when the first
4084 -+ * request of the in-service queue arrives during
4085 -+ * disk idling
4086 -+ */
4087 -+ reason = BFQ_BFQQ_TOO_IDLE;
4088 -+ else
4089 -+ goto schedule_dispatch;
4090 -+
4091 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
4092 -+ }
4093 -+
4094 -+schedule_dispatch:
4095 -+ bfq_schedule_dispatch(bfqd);
4096 -+
4097 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
4098 -+}
4099 -+
4100 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
4101 -+{
4102 -+ del_timer_sync(&bfqd->idle_slice_timer);
4103 -+ cancel_work_sync(&bfqd->unplug_work);
4104 -+}
4105 -+
4106 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4107 -+ struct bfq_queue **bfqq_ptr)
4108 -+{
4109 -+ struct bfq_group *root_group = bfqd->root_group;
4110 -+ struct bfq_queue *bfqq = *bfqq_ptr;
4111 -+
4112 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4113 -+ if (bfqq != NULL) {
4114 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
4115 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4116 -+ bfqq, atomic_read(&bfqq->ref));
4117 -+ bfq_put_queue(bfqq);
4118 -+ *bfqq_ptr = NULL;
4119 -+ }
4120 -+}
4121 -+
4122 -+/*
4123 -+ * Release all the bfqg references to its async queues. If we are
4124 -+ * deallocating the group, these queues may still contain requests, so
4125 -+ * we reparent them to the root cgroup (i.e., the only one that will
4126 -+ * exist for sure until all the requests on a device are gone).
4127 -+ */
4128 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4129 -+{
4130 -+ int i, j;
4131 -+
4132 -+ for (i = 0; i < 2; i++)
4133 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
4134 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4135 -+
4136 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4137 -+}
4138 -+
4139 -+static void bfq_exit_queue(struct elevator_queue *e)
4140 -+{
4141 -+ struct bfq_data *bfqd = e->elevator_data;
4142 -+ struct request_queue *q = bfqd->queue;
4143 -+ struct bfq_queue *bfqq, *n;
4144 -+
4145 -+ bfq_shutdown_timer_wq(bfqd);
4146 -+
4147 -+ spin_lock_irq(q->queue_lock);
4148 -+
4149 -+ BUG_ON(bfqd->in_service_queue != NULL);
4150 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4151 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
4152 -+
4153 -+ bfq_disconnect_groups(bfqd);
4154 -+ spin_unlock_irq(q->queue_lock);
4155 -+
4156 -+ bfq_shutdown_timer_wq(bfqd);
4157 -+
4158 -+ synchronize_rcu();
4159 -+
4160 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
4161 -+
4162 -+ bfq_free_root_group(bfqd);
4163 -+ kfree(bfqd);
4164 -+}
4165 -+
4166 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4167 -+{
4168 -+ struct bfq_group *bfqg;
4169 -+ struct bfq_data *bfqd;
4170 -+ struct elevator_queue *eq;
4171 -+
4172 -+ eq = elevator_alloc(q, e);
4173 -+ if (eq == NULL)
4174 -+ return -ENOMEM;
4175 -+
4176 -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4177 -+ if (bfqd == NULL) {
4178 -+ kobject_put(&eq->kobj);
4179 -+ return -ENOMEM;
4180 -+ }
4181 -+ eq->elevator_data = bfqd;
4182 -+
4183 -+ /*
4184 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4185 -+ * Grab a permanent reference to it, so that the normal code flow
4186 -+ * will not attempt to free it.
4187 -+ */
4188 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
4189 -+ atomic_inc(&bfqd->oom_bfqq.ref);
4190 -+
4191 -+ bfqd->queue = q;
4192 -+
4193 -+ spin_lock_irq(q->queue_lock);
4194 -+ q->elevator = eq;
4195 -+ spin_unlock_irq(q->queue_lock);
4196 -+
4197 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
4198 -+ if (bfqg == NULL) {
4199 -+ kfree(bfqd);
4200 -+ kobject_put(&eq->kobj);
4201 -+ return -ENOMEM;
4202 -+ }
4203 -+
4204 -+ bfqd->root_group = bfqg;
4205 -+
4206 -+ init_timer(&bfqd->idle_slice_timer);
4207 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4208 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4209 -+
4210 -+ bfqd->rq_pos_tree = RB_ROOT;
4211 -+
4212 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4213 -+
4214 -+ INIT_LIST_HEAD(&bfqd->active_list);
4215 -+ INIT_LIST_HEAD(&bfqd->idle_list);
4216 -+
4217 -+ bfqd->hw_tag = -1;
4218 -+
4219 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
4220 -+
4221 -+ bfqd->bfq_quantum = bfq_quantum;
4222 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4223 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4224 -+ bfqd->bfq_back_max = bfq_back_max;
4225 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
4226 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
4227 -+ bfqd->bfq_class_idle_last_service = 0;
4228 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4229 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4230 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4231 -+
4232 -+ bfqd->low_latency = true;
4233 -+
4234 -+ bfqd->bfq_raising_coeff = 20;
4235 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4236 -+ bfqd->bfq_raising_max_time = 0;
4237 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4238 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4239 -+ bfqd->bfq_raising_max_softrt_rate = 7000; /*
4240 -+ * Approximate rate required
4241 -+ * to playback or record a
4242 -+ * high-definition compressed
4243 -+ * video.
4244 -+ */
4245 -+ bfqd->raised_busy_queues = 0;
4246 -+
4247 -+ /* Initially estimate the device's peak rate as the reference rate */
4248 -+ if (blk_queue_nonrot(bfqd->queue)) {
4249 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
4250 -+ bfqd->peak_rate = R_nonrot;
4251 -+ } else {
4252 -+ bfqd->RT_prod = R_rot * T_rot;
4253 -+ bfqd->peak_rate = R_rot;
4254 -+ }
4255 -+
4256 -+ return 0;
4257 -+}
4258 -+
4259 -+static void bfq_slab_kill(void)
4260 -+{
4261 -+ if (bfq_pool != NULL)
4262 -+ kmem_cache_destroy(bfq_pool);
4263 -+}
4264 -+
4265 -+static int __init bfq_slab_setup(void)
4266 -+{
4267 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4268 -+ if (bfq_pool == NULL)
4269 -+ return -ENOMEM;
4270 -+ return 0;
4271 -+}
4272 -+
4273 -+static ssize_t bfq_var_show(unsigned int var, char *page)
4274 -+{
4275 -+ return sprintf(page, "%d\n", var);
4276 -+}
4277 -+
4278 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4279 -+{
4280 -+ unsigned long new_val;
4281 -+ int ret = kstrtoul(page, 10, &new_val);
4282 -+
4283 -+ if (ret == 0)
4284 -+ *var = new_val;
4285 -+
4286 -+ return count;
4287 -+}
4288 -+
4289 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4290 -+{
4291 -+ struct bfq_data *bfqd = e->elevator_data;
4292 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4293 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4294 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4295 -+}
4296 -+
4297 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4298 -+{
4299 -+ struct bfq_queue *bfqq;
4300 -+ struct bfq_data *bfqd = e->elevator_data;
4301 -+ ssize_t num_char = 0;
4302 -+
4303 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4304 -+ bfqd->queued);
4305 -+
4306 -+ spin_lock_irq(bfqd->queue->queue_lock);
4307 -+
4308 -+ num_char += sprintf(page + num_char, "Active:\n");
4309 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4310 -+ num_char += sprintf(page + num_char,
4311 -+ "pid%d: weight %hu, nr_queued %d %d,"
4312 -+ " dur %d/%u\n",
4313 -+ bfqq->pid,
4314 -+ bfqq->entity.weight,
4315 -+ bfqq->queued[0],
4316 -+ bfqq->queued[1],
4317 -+ jiffies_to_msecs(jiffies -
4318 -+ bfqq->last_rais_start_finish),
4319 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4320 -+ }
4321 -+
4322 -+ num_char += sprintf(page + num_char, "Idle:\n");
4323 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4324 -+ num_char += sprintf(page + num_char,
4325 -+ "pid%d: weight %hu, dur %d/%u\n",
4326 -+ bfqq->pid,
4327 -+ bfqq->entity.weight,
4328 -+ jiffies_to_msecs(jiffies -
4329 -+ bfqq->last_rais_start_finish),
4330 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4331 -+ }
4332 -+
4333 -+ spin_unlock_irq(bfqd->queue->queue_lock);
4334 -+
4335 -+ return num_char;
4336 -+}
4337 -+
4338 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4339 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4340 -+{ \
4341 -+ struct bfq_data *bfqd = e->elevator_data; \
4342 -+ unsigned int __data = __VAR; \
4343 -+ if (__CONV) \
4344 -+ __data = jiffies_to_msecs(__data); \
4345 -+ return bfq_var_show(__data, (page)); \
4346 -+}
4347 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4348 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4349 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4350 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4351 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4352 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4353 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4354 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4355 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4356 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4357 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4358 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4359 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4360 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4361 -+ 1);
4362 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4363 -+ bfqd->bfq_raising_min_inter_arr_async,
4364 -+ 1);
4365 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4366 -+ bfqd->bfq_raising_max_softrt_rate, 0);
4367 -+#undef SHOW_FUNCTION
4368 -+
4369 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4370 -+static ssize_t \
4371 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4372 -+{ \
4373 -+ struct bfq_data *bfqd = e->elevator_data; \
4374 -+ unsigned long uninitialized_var(__data); \
4375 -+ int ret = bfq_var_store(&__data, (page), count); \
4376 -+ if (__data < (MIN)) \
4377 -+ __data = (MIN); \
4378 -+ else if (__data > (MAX)) \
4379 -+ __data = (MAX); \
4380 -+ if (__CONV) \
4381 -+ *(__PTR) = msecs_to_jiffies(__data); \
4382 -+ else \
4383 -+ *(__PTR) = __data; \
4384 -+ return ret; \
4385 -+}
4386 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4387 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4388 -+ INT_MAX, 1);
4389 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4390 -+ INT_MAX, 1);
4391 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4392 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4393 -+ INT_MAX, 0);
4394 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4395 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4396 -+ 1, INT_MAX, 0);
4397 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4398 -+ INT_MAX, 1);
4399 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4400 -+ INT_MAX, 0);
4401 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4402 -+ INT_MAX, 1);
4403 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4404 -+ INT_MAX, 1);
4405 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
4406 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4407 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4408 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4409 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4410 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4411 -+#undef STORE_FUNCTION
4412 -+
4413 -+/* do nothing for the moment */
4414 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
4415 -+ const char *page, size_t count)
4416 -+{
4417 -+ return count;
4418 -+}
4419 -+
4420 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4421 -+{
4422 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4423 -+
4424 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4425 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4426 -+ else
4427 -+ return bfq_default_max_budget;
4428 -+}
4429 -+
4430 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4431 -+ const char *page, size_t count)
4432 -+{
4433 -+ struct bfq_data *bfqd = e->elevator_data;
4434 -+ unsigned long uninitialized_var(__data);
4435 -+ int ret = bfq_var_store(&__data, (page), count);
4436 -+
4437 -+ if (__data == 0)
4438 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4439 -+ else {
4440 -+ if (__data > INT_MAX)
4441 -+ __data = INT_MAX;
4442 -+ bfqd->bfq_max_budget = __data;
4443 -+ }
4444 -+
4445 -+ bfqd->bfq_user_max_budget = __data;
4446 -+
4447 -+ return ret;
4448 -+}
4449 -+
4450 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4451 -+ const char *page, size_t count)
4452 -+{
4453 -+ struct bfq_data *bfqd = e->elevator_data;
4454 -+ unsigned long uninitialized_var(__data);
4455 -+ int ret = bfq_var_store(&__data, (page), count);
4456 -+
4457 -+ if (__data < 1)
4458 -+ __data = 1;
4459 -+ else if (__data > INT_MAX)
4460 -+ __data = INT_MAX;
4461 -+
4462 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4463 -+ if (bfqd->bfq_user_max_budget == 0)
4464 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4465 -+
4466 -+ return ret;
4467 -+}
4468 -+
4469 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4470 -+ const char *page, size_t count)
4471 -+{
4472 -+ struct bfq_data *bfqd = e->elevator_data;
4473 -+ unsigned long uninitialized_var(__data);
4474 -+ int ret = bfq_var_store(&__data, (page), count);
4475 -+
4476 -+ if (__data > 1)
4477 -+ __data = 1;
4478 -+ if (__data == 0 && bfqd->low_latency != 0)
4479 -+ bfq_end_raising(bfqd);
4480 -+ bfqd->low_latency = __data;
4481 -+
4482 -+ return ret;
4483 -+}
4484 -+
4485 -+#define BFQ_ATTR(name) \
4486 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4487 -+
4488 -+static struct elv_fs_entry bfq_attrs[] = {
4489 -+ BFQ_ATTR(quantum),
4490 -+ BFQ_ATTR(fifo_expire_sync),
4491 -+ BFQ_ATTR(fifo_expire_async),
4492 -+ BFQ_ATTR(back_seek_max),
4493 -+ BFQ_ATTR(back_seek_penalty),
4494 -+ BFQ_ATTR(slice_idle),
4495 -+ BFQ_ATTR(max_budget),
4496 -+ BFQ_ATTR(max_budget_async_rq),
4497 -+ BFQ_ATTR(timeout_sync),
4498 -+ BFQ_ATTR(timeout_async),
4499 -+ BFQ_ATTR(low_latency),
4500 -+ BFQ_ATTR(raising_coeff),
4501 -+ BFQ_ATTR(raising_max_time),
4502 -+ BFQ_ATTR(raising_rt_max_time),
4503 -+ BFQ_ATTR(raising_min_idle_time),
4504 -+ BFQ_ATTR(raising_min_inter_arr_async),
4505 -+ BFQ_ATTR(raising_max_softrt_rate),
4506 -+ BFQ_ATTR(weights),
4507 -+ __ATTR_NULL
4508 -+};
4509 -+
4510 -+static struct elevator_type iosched_bfq = {
4511 -+ .ops = {
4512 -+ .elevator_merge_fn = bfq_merge,
4513 -+ .elevator_merged_fn = bfq_merged_request,
4514 -+ .elevator_merge_req_fn = bfq_merged_requests,
4515 -+ .elevator_allow_merge_fn = bfq_allow_merge,
4516 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
4517 -+ .elevator_add_req_fn = bfq_insert_request,
4518 -+ .elevator_activate_req_fn = bfq_activate_request,
4519 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
4520 -+ .elevator_completed_req_fn = bfq_completed_request,
4521 -+ .elevator_former_req_fn = elv_rb_former_request,
4522 -+ .elevator_latter_req_fn = elv_rb_latter_request,
4523 -+ .elevator_init_icq_fn = bfq_init_icq,
4524 -+ .elevator_exit_icq_fn = bfq_exit_icq,
4525 -+ .elevator_set_req_fn = bfq_set_request,
4526 -+ .elevator_put_req_fn = bfq_put_request,
4527 -+ .elevator_may_queue_fn = bfq_may_queue,
4528 -+ .elevator_init_fn = bfq_init_queue,
4529 -+ .elevator_exit_fn = bfq_exit_queue,
4530 -+ },
4531 -+ .icq_size = sizeof(struct bfq_io_cq),
4532 -+ .icq_align = __alignof__(struct bfq_io_cq),
4533 -+ .elevator_attrs = bfq_attrs,
4534 -+ .elevator_name = "bfq",
4535 -+ .elevator_owner = THIS_MODULE,
4536 -+};
4537 -+
4538 -+static int __init bfq_init(void)
4539 -+{
4540 -+ /*
4541 -+ * Can be 0 on HZ < 1000 setups.
4542 -+ */
4543 -+ if (bfq_slice_idle == 0)
4544 -+ bfq_slice_idle = 1;
4545 -+
4546 -+ if (bfq_timeout_async == 0)
4547 -+ bfq_timeout_async = 1;
4548 -+
4549 -+ if (bfq_slab_setup())
4550 -+ return -ENOMEM;
4551 -+
4552 -+ elv_register(&iosched_bfq);
4553 -+ printk(KERN_INFO "BFQ I/O-scheduler version: v7\n");
4554 -+
4555 -+ return 0;
4556 -+}
4557 -+
4558 -+static void __exit bfq_exit(void)
4559 -+{
4560 -+ elv_unregister(&iosched_bfq);
4561 -+ bfq_slab_kill();
4562 -+}
4563 -+
4564 -+module_init(bfq_init);
4565 -+module_exit(bfq_exit);
4566 -+
4567 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4568 -+MODULE_LICENSE("GPL");
4569 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4570 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4571 -new file mode 100644
4572 -index 0000000..30df81c
4573 ---- /dev/null
4574 -+++ b/block/bfq-sched.c
4575 -@@ -0,0 +1,1077 @@
4576 -+/*
4577 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
4578 -+ *
4579 -+ * Based on ideas and code from CFQ:
4580 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4581 -+ *
4582 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4583 -+ * Paolo Valente <paolo.valente@×××××××.it>
4584 -+ *
4585 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4586 -+ */
4587 -+
4588 -+#ifdef CONFIG_CGROUP_BFQIO
4589 -+#define for_each_entity(entity) \
4590 -+ for (; entity != NULL; entity = entity->parent)
4591 -+
4592 -+#define for_each_entity_safe(entity, parent) \
4593 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4594 -+
4595 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4596 -+ int extract,
4597 -+ struct bfq_data *bfqd);
4598 -+
4599 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4600 -+{
4601 -+ struct bfq_entity *bfqg_entity;
4602 -+ struct bfq_group *bfqg;
4603 -+ struct bfq_sched_data *group_sd;
4604 -+
4605 -+ BUG_ON(next_active == NULL);
4606 -+
4607 -+ group_sd = next_active->sched_data;
4608 -+
4609 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4610 -+ /*
4611 -+ * bfq_group's my_entity field is not NULL only if the group
4612 -+ * is not the root group. We must not touch the root entity
4613 -+ * as it must never become an active entity.
4614 -+ */
4615 -+ bfqg_entity = bfqg->my_entity;
4616 -+ if (bfqg_entity != NULL)
4617 -+ bfqg_entity->budget = next_active->budget;
4618 -+}
4619 -+
4620 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
4621 -+{
4622 -+ struct bfq_entity *next_active;
4623 -+
4624 -+ if (sd->active_entity != NULL)
4625 -+ /* will update/requeue at the end of service */
4626 -+ return 0;
4627 -+
4628 -+ /*
4629 -+ * NOTE: this can be improved in many ways, such as returning
4630 -+ * 1 (and thus propagating upwards the update) only when the
4631 -+ * budget changes, or caching the bfqq that will be scheduled
4632 -+ * next from this subtree. By now we worry more about
4633 -+ * next from this subtree. For now we worry more about
4634 -+ */
4635 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4636 -+ sd->next_active = next_active;
4637 -+
4638 -+ if (next_active != NULL)
4639 -+ bfq_update_budget(next_active);
4640 -+
4641 -+ return 1;
4642 -+}
4643 -+
4644 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4645 -+ struct bfq_entity *entity)
4646 -+{
4647 -+ BUG_ON(sd->next_active != entity);
4648 -+}
4649 -+#else
4650 -+#define for_each_entity(entity) \
4651 -+ for (; entity != NULL; entity = NULL)
4652 -+
4653 -+#define for_each_entity_safe(entity, parent) \
4654 -+ for (parent = NULL; entity != NULL; entity = parent)
4655 -+
4656 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4657 -+{
4658 -+ return 0;
4659 -+}
4660 -+
4661 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4662 -+ struct bfq_entity *entity)
4663 -+{
4664 -+}
4665 -+
4666 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4667 -+{
4668 -+}
4669 -+#endif
4670 -+
4671 -+/*
4672 -+ * Shift for timestamp calculations. This actually limits the maximum
4673 -+ * service allowed in one timestamp delta (small shift values increase it),
4674 -+ * the maximum total weight that can be used for the queues in the system
4675 -+ * (big shift values increase it), and the period of virtual time wraparounds.
4676 -+ */
4677 -+#define WFQ_SERVICE_SHIFT 22
4678 -+
4679 -+/**
4680 -+ * bfq_gt - compare two timestamps.
4681 -+ * @a: first ts.
4682 -+ * @b: second ts.
4683 -+ *
4684 -+ * Return @a > @b, dealing with wrapping correctly.
4685 -+ */
4686 -+static inline int bfq_gt(u64 a, u64 b)
4687 -+{
4688 -+ return (s64)(a - b) > 0;
4689 -+}
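
bfq_gt() uses the standard wrap-safe comparison: subtracting two unsigned 64-bit timestamps and interpreting the difference as signed gives the correct ordering as long as the two values are less than 2^63 apart, even across a wraparound of the virtual clock. A small standalone illustration (the values below are made up, not taken from the patch):

  #include <stdint.h>
  #include <stdio.h>

  /* Same test as bfq_gt(): true if a is later than b, wrap-safe. */
  static int ts_after(uint64_t a, uint64_t b)
  {
          return (int64_t)(a - b) > 0;
  }

  int main(void)
  {
          uint64_t before_wrap = UINT64_MAX - 5;  /* just before the clock wraps */
          uint64_t after_wrap  = 10;              /* shortly after the wrap */

          /* A naive "after_wrap > before_wrap" is false; the signed
           * difference still orders the two timestamps correctly. */
          printf("after_wrap later? %d\n", ts_after(after_wrap, before_wrap));
          return 0;
  }
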
4690 -+
4691 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4692 -+{
4693 -+ struct bfq_queue *bfqq = NULL;
4694 -+
4695 -+ BUG_ON(entity == NULL);
4696 -+
4697 -+ if (entity->my_sched_data == NULL)
4698 -+ bfqq = container_of(entity, struct bfq_queue, entity);
4699 -+
4700 -+ return bfqq;
4701 -+}
4702 -+
4703 -+
4704 -+/**
4705 -+ * bfq_delta - map service into the virtual time domain.
4706 -+ * @service: amount of service.
4707 -+ * @weight: scale factor (weight of an entity or weight sum).
4708 -+ */
4709 -+static inline u64 bfq_delta(unsigned long service,
4710 -+ unsigned long weight)
4711 -+{
4712 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4713 -+
4714 -+ do_div(d, weight);
4715 -+ return d;
4716 -+}
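
bfq_delta() above scales the service by 2^WFQ_SERVICE_SHIFT before dividing by the weight, so the integer division keeps 22 fractional bits and a heavier entity accumulates virtual time more slowly for the same amount of service. A rough user-space counterpart, with made-up numbers purely for illustration:

  #include <stdint.h>
  #include <stdio.h>

  #define WFQ_SERVICE_SHIFT 22

  /* User-space mirror of bfq_delta(); the kernel uses do_div() instead. */
  static uint64_t vtime_delta(unsigned long service, unsigned long weight)
  {
          return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
  }

  int main(void)
  {
          /* Equal service charged to two weights: the entity with twice the
           * weight sees half the virtual-time advance. */
          printf("weight 100: %llu\n",
                 (unsigned long long)vtime_delta(8192, 100));
          printf("weight 200: %llu\n",
                 (unsigned long long)vtime_delta(8192, 200));
          return 0;
  }
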
4717 -+
4718 -+/**
4719 -+ * bfq_calc_finish - assign the finish time to an entity.
4720 -+ * @entity: the entity to act upon.
4721 -+ * @service: the service to be charged to the entity.
4722 -+ */
4723 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
4724 -+ unsigned long service)
4725 -+{
4726 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4727 -+
4728 -+ BUG_ON(entity->weight == 0);
4729 -+
4730 -+ entity->finish = entity->start +
4731 -+ bfq_delta(service, entity->weight);
4732 -+
4733 -+ if (bfqq != NULL) {
4734 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4735 -+ "calc_finish: serv %lu, w %d",
4736 -+ service, entity->weight);
4737 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4738 -+ "calc_finish: start %llu, finish %llu, delta %llu",
4739 -+ entity->start, entity->finish,
4740 -+ bfq_delta(service, entity->weight));
4741 -+ }
4742 -+}
4743 -+
4744 -+/**
4745 -+ * bfq_entity_of - get an entity from a node.
4746 -+ * @node: the node field of the entity.
4747 -+ *
4748 -+ * Convert a node pointer to the relative entity. This is used only
4749 -+ * to simplify the logic of some functions and not as the generic
4750 -+ * conversion mechanism because, e.g., in the tree walking functions,
4751 -+ * the check for a %NULL value would be redundant.
4752 -+ */
4753 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4754 -+{
4755 -+ struct bfq_entity *entity = NULL;
4756 -+
4757 -+ if (node != NULL)
4758 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
4759 -+
4760 -+ return entity;
4761 -+}
4762 -+
4763 -+/**
4764 -+ * bfq_extract - remove an entity from a tree.
4765 -+ * @root: the tree root.
4766 -+ * @entity: the entity to remove.
4767 -+ */
4768 -+static inline void bfq_extract(struct rb_root *root,
4769 -+ struct bfq_entity *entity)
4770 -+{
4771 -+ BUG_ON(entity->tree != root);
4772 -+
4773 -+ entity->tree = NULL;
4774 -+ rb_erase(&entity->rb_node, root);
4775 -+}
4776 -+
4777 -+/**
4778 -+ * bfq_idle_extract - extract an entity from the idle tree.
4779 -+ * @st: the service tree of the owning @entity.
4780 -+ * @entity: the entity being removed.
4781 -+ */
4782 -+static void bfq_idle_extract(struct bfq_service_tree *st,
4783 -+ struct bfq_entity *entity)
4784 -+{
4785 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4786 -+ struct rb_node *next;
4787 -+
4788 -+ BUG_ON(entity->tree != &st->idle);
4789 -+
4790 -+ if (entity == st->first_idle) {
4791 -+ next = rb_next(&entity->rb_node);
4792 -+ st->first_idle = bfq_entity_of(next);
4793 -+ }
4794 -+
4795 -+ if (entity == st->last_idle) {
4796 -+ next = rb_prev(&entity->rb_node);
4797 -+ st->last_idle = bfq_entity_of(next);
4798 -+ }
4799 -+
4800 -+ bfq_extract(&st->idle, entity);
4801 -+
4802 -+ if (bfqq != NULL)
4803 -+ list_del(&bfqq->bfqq_list);
4804 -+}
4805 -+
4806 -+/**
4807 -+ * bfq_insert - generic tree insertion.
4808 -+ * @root: tree root.
4809 -+ * @entity: entity to insert.
4810 -+ *
4811 -+ * This is used for the idle and the active tree, since they are both
4812 -+ * ordered by finish time.
4813 -+ */
4814 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4815 -+{
4816 -+ struct bfq_entity *entry;
4817 -+ struct rb_node **node = &root->rb_node;
4818 -+ struct rb_node *parent = NULL;
4819 -+
4820 -+ BUG_ON(entity->tree != NULL);
4821 -+
4822 -+ while (*node != NULL) {
4823 -+ parent = *node;
4824 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
4825 -+
4826 -+ if (bfq_gt(entry->finish, entity->finish))
4827 -+ node = &parent->rb_left;
4828 -+ else
4829 -+ node = &parent->rb_right;
4830 -+ }
4831 -+
4832 -+ rb_link_node(&entity->rb_node, parent, node);
4833 -+ rb_insert_color(&entity->rb_node, root);
4834 -+
4835 -+ entity->tree = root;
4836 -+}
4837 -+
4838 -+/**
4839 -+ * bfq_update_min - update the min_start field of an entity.
4840 -+ * @entity: the entity to update.
4841 -+ * @node: one of its children.
4842 -+ *
4843 -+ * This function is called when @entity may store an invalid value for
4844 -+ * min_start due to updates to the active tree. The function assumes
4845 -+ * that the subtree rooted at @node (which may be its left or its right
4846 -+ * child) has a valid min_start value.
4847 -+ */
4848 -+static inline void bfq_update_min(struct bfq_entity *entity,
4849 -+ struct rb_node *node)
4850 -+{
4851 -+ struct bfq_entity *child;
4852 -+
4853 -+ if (node != NULL) {
4854 -+ child = rb_entry(node, struct bfq_entity, rb_node);
4855 -+ if (bfq_gt(entity->min_start, child->min_start))
4856 -+ entity->min_start = child->min_start;
4857 -+ }
4858 -+}
4859 -+
4860 -+/**
4861 -+ * bfq_update_active_node - recalculate min_start.
4862 -+ * @node: the node to update.
4863 -+ *
4864 -+ * @node may have changed position or one of its children may have moved;
4865 -+ * this function updates its min_start value. The left and right subtrees
4866 -+ * are assumed to hold a correct min_start value.
4867 -+ */
4868 -+static inline void bfq_update_active_node(struct rb_node *node)
4869 -+{
4870 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4871 -+
4872 -+ entity->min_start = entity->start;
4873 -+ bfq_update_min(entity, node->rb_right);
4874 -+ bfq_update_min(entity, node->rb_left);
4875 -+}
4876 -+
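For reference, the invariant maintained above can be sketched outside the rb-tree machinery roughly as follows; the node type is a simplified, hypothetical stand-in, and plain < replaces the bfq_gt() timestamp comparison used by the real code:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for a bfq_entity linked into the active rb-tree. */
struct example_node {
	uint64_t start;
	uint64_t min_start;
	struct example_node *left, *right;
};

/* min_start = min(start, left->min_start, right->min_start), as above. */
static void example_update_min_start(struct example_node *n)
{
	n->min_start = n->start;
	if (n->left && n->left->min_start < n->min_start)
		n->min_start = n->left->min_start;
	if (n->right && n->right->min_start < n->min_start)
		n->min_start = n->right->min_start;
}

int main(void)
{
	struct example_node l = { .start = 5, .min_start = 5 };
	struct example_node r = { .start = 9, .min_start = 9 };
	struct example_node root = { .start = 7, .left = &l, .right = &r };

	example_update_min_start(&root);
	printf("root min_start = %llu\n",	/* prints 5 */
	       (unsigned long long)root.min_start);
	return 0;
}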
4877 -+/**
4878 -+ * bfq_update_active_tree - update min_start for the whole active tree.
4879 -+ * @node: the starting node.
4880 -+ *
4881 -+ * @node must be the deepest modified node after an update. This function
4882 -+ * updates its min_start using the values held by its children, assuming
4883 -+ * that they did not change, and then updates all the nodes that may have
4884 -+ * changed in the path to the root. The only nodes that may have changed
4885 -+ * are the ones in the path or their siblings.
4886 -+ */
4887 -+static void bfq_update_active_tree(struct rb_node *node)
4888 -+{
4889 -+ struct rb_node *parent;
4890 -+
4891 -+up:
4892 -+ bfq_update_active_node(node);
4893 -+
4894 -+ parent = rb_parent(node);
4895 -+ if (parent == NULL)
4896 -+ return;
4897 -+
4898 -+ if (node == parent->rb_left && parent->rb_right != NULL)
4899 -+ bfq_update_active_node(parent->rb_right);
4900 -+ else if (parent->rb_left != NULL)
4901 -+ bfq_update_active_node(parent->rb_left);
4902 -+
4903 -+ node = parent;
4904 -+ goto up;
4905 -+}
4906 -+
4907 -+/**
4908 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
4909 -+ * @st: the service tree of the entity.
4910 -+ * @entity: the entity being inserted.
4911 -+ *
4912 -+ * The active tree is ordered by finish time, but an extra key is kept
4913 -+ * for each node, containing the minimum value for the start times of
4914 -+ * its children (and the node itself), so it's possible to search for
4915 -+ * the eligible node with the lowest finish time in logarithmic time.
4916 -+ */
4917 -+static void bfq_active_insert(struct bfq_service_tree *st,
4918 -+ struct bfq_entity *entity)
4919 -+{
4920 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4921 -+ struct rb_node *node = &entity->rb_node;
4922 -+
4923 -+ bfq_insert(&st->active, entity);
4924 -+
4925 -+ if (node->rb_left != NULL)
4926 -+ node = node->rb_left;
4927 -+ else if (node->rb_right != NULL)
4928 -+ node = node->rb_right;
4929 -+
4930 -+ bfq_update_active_tree(node);
4931 -+
4932 -+ if (bfqq != NULL)
4933 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4934 -+}
4935 -+
4936 -+/**
4937 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4938 -+ * @ioprio: the ioprio value to convert.
4939 -+ */
4940 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
4941 -+{
4942 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4943 -+ return IOPRIO_BE_NR - ioprio;
4944 -+}
4945 -+
4946 -+/**
4947 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4948 -+ * @weight: the weight value to convert.
4949 -+ *
4950 -+ * To preserve as much as possible the old ioprio-only user interface,
4951 -+ * 0 is used as an escape ioprio value for weights (numerically) equal to
4952 -+ * or larger than IOPRIO_BE_NR.
4953 -+ */
4954 -+static unsigned short bfq_weight_to_ioprio(int weight)
4955 -+{
4956 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4957 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4958 -+}
4959 -+
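For reference, with IOPRIO_BE_NR assumed to be 8 (its usual value in kernels of this era), the two conversions above behave as in this small userspace sketch:

#include <stdio.h>

#define EXAMPLE_IOPRIO_BE_NR 8	/* assumed to match IOPRIO_BE_NR */

static unsigned short example_ioprio_to_weight(int ioprio)
{
	return EXAMPLE_IOPRIO_BE_NR - ioprio;	/* ioprio 0..7 -> weight 8..1 */
}

static unsigned short example_weight_to_ioprio(int weight)
{
	/* Weights >= EXAMPLE_IOPRIO_BE_NR collapse to the escape ioprio 0. */
	int ioprio = EXAMPLE_IOPRIO_BE_NR - weight;

	return ioprio < 0 ? 0 : ioprio;
}

int main(void)
{
	printf("ioprio 0 -> weight %u\n", example_ioprio_to_weight(0));	/* 8 */
	printf("ioprio 7 -> weight %u\n", example_ioprio_to_weight(7));	/* 1 */
	printf("weight 100 -> ioprio %u\n", example_weight_to_ioprio(100));	/* 0 */
	return 0;
}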
4960 -+static inline void bfq_get_entity(struct bfq_entity *entity)
4961 -+{
4962 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4963 -+ struct bfq_sched_data *sd;
4964 -+
4965 -+ if (bfqq != NULL) {
4966 -+ sd = entity->sched_data;
4967 -+ atomic_inc(&bfqq->ref);
4968 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4969 -+ bfqq, atomic_read(&bfqq->ref));
4970 -+ }
4971 -+}
4972 -+
4973 -+/**
4974 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
4975 -+ * @node: the node being removed.
4976 -+ *
4977 -+ * Do the first step of an extraction in an rb tree, looking for the
4978 -+ * node that will replace @node, and returning the deepest node that
4979 -+ * the following modifications to the tree can touch. If @node is the
4980 -+ * last node in the tree return %NULL.
4981 -+ */
4982 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
4983 -+{
4984 -+ struct rb_node *deepest;
4985 -+
4986 -+ if (node->rb_right == NULL && node->rb_left == NULL)
4987 -+ deepest = rb_parent(node);
4988 -+ else if (node->rb_right == NULL)
4989 -+ deepest = node->rb_left;
4990 -+ else if (node->rb_left == NULL)
4991 -+ deepest = node->rb_right;
4992 -+ else {
4993 -+ deepest = rb_next(node);
4994 -+ if (deepest->rb_right != NULL)
4995 -+ deepest = deepest->rb_right;
4996 -+ else if (rb_parent(deepest) != node)
4997 -+ deepest = rb_parent(deepest);
4998 -+ }
4999 -+
5000 -+ return deepest;
5001 -+}
5002 -+
5003 -+/**
5004 -+ * bfq_active_extract - remove an entity from the active tree.
5005 -+ * @st: the service_tree containing the tree.
5006 -+ * @entity: the entity being removed.
5007 -+ */
5008 -+static void bfq_active_extract(struct bfq_service_tree *st,
5009 -+ struct bfq_entity *entity)
5010 -+{
5011 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5012 -+ struct rb_node *node;
5013 -+
5014 -+ node = bfq_find_deepest(&entity->rb_node);
5015 -+ bfq_extract(&st->active, entity);
5016 -+
5017 -+ if (node != NULL)
5018 -+ bfq_update_active_tree(node);
5019 -+
5020 -+ if (bfqq != NULL)
5021 -+ list_del(&bfqq->bfqq_list);
5022 -+}
5023 -+
5024 -+/**
5025 -+ * bfq_idle_insert - insert an entity into the idle tree.
5026 -+ * @st: the service tree containing the tree.
5027 -+ * @entity: the entity to insert.
5028 -+ */
5029 -+static void bfq_idle_insert(struct bfq_service_tree *st,
5030 -+ struct bfq_entity *entity)
5031 -+{
5032 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5033 -+ struct bfq_entity *first_idle = st->first_idle;
5034 -+ struct bfq_entity *last_idle = st->last_idle;
5035 -+
5036 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
5037 -+ st->first_idle = entity;
5038 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
5039 -+ st->last_idle = entity;
5040 -+
5041 -+ bfq_insert(&st->idle, entity);
5042 -+
5043 -+ if (bfqq != NULL)
5044 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
5045 -+}
5046 -+
5047 -+/**
5048 -+ * bfq_forget_entity - remove an entity from the wfq trees.
5049 -+ * @st: the service tree.
5050 -+ * @entity: the entity being removed.
5051 -+ *
5052 -+ * Update the device status and forget everything about @entity, putting
5053 -+ * the device reference to it, if it is a queue. Entities belonging to
5054 -+ * groups are not refcounted.
5055 -+ */
5056 -+static void bfq_forget_entity(struct bfq_service_tree *st,
5057 -+ struct bfq_entity *entity)
5058 -+{
5059 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5060 -+ struct bfq_sched_data *sd;
5061 -+
5062 -+ BUG_ON(!entity->on_st);
5063 -+
5064 -+ entity->on_st = 0;
5065 -+ st->wsum -= entity->weight;
5066 -+ if (bfqq != NULL) {
5067 -+ sd = entity->sched_data;
5068 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
5069 -+ bfqq, atomic_read(&bfqq->ref));
5070 -+ bfq_put_queue(bfqq);
5071 -+ }
5072 -+}
5073 -+
5074 -+/**
5075 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
5076 -+ * @st: service tree for the entity.
5077 -+ * @entity: the entity being released.
5078 -+ */
5079 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
5080 -+ struct bfq_entity *entity)
5081 -+{
5082 -+ bfq_idle_extract(st, entity);
5083 -+ bfq_forget_entity(st, entity);
5084 -+}
5085 -+
5086 -+/**
5087 -+ * bfq_forget_idle - update the idle tree if necessary.
5088 -+ * @st: the service tree to act upon.
5089 -+ *
5090 -+ * To preserve the global O(log N) complexity we only remove one entry here;
5091 -+ * as the idle tree will not grow indefinitely this can be done safely.
5092 -+ */
5093 -+static void bfq_forget_idle(struct bfq_service_tree *st)
5094 -+{
5095 -+ struct bfq_entity *first_idle = st->first_idle;
5096 -+ struct bfq_entity *last_idle = st->last_idle;
5097 -+
5098 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
5099 -+ !bfq_gt(last_idle->finish, st->vtime)) {
5100 -+ /*
5101 -+ * Forget the whole idle tree, increasing the vtime past
5102 -+ * the last finish time of idle entities.
5103 -+ */
5104 -+ st->vtime = last_idle->finish;
5105 -+ }
5106 -+
5107 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
5108 -+ bfq_put_idle_entity(st, first_idle);
5109 -+}
5110 -+
5111 -+static struct bfq_service_tree *
5112 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
5113 -+ struct bfq_entity *entity)
5114 -+{
5115 -+ struct bfq_service_tree *new_st = old_st;
5116 -+
5117 -+ if (entity->ioprio_changed) {
5118 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5119 -+
5120 -+ BUG_ON(old_st->wsum < entity->weight);
5121 -+ old_st->wsum -= entity->weight;
5122 -+
5123 -+ if (entity->new_weight != entity->orig_weight) {
5124 -+ entity->orig_weight = entity->new_weight;
5125 -+ entity->ioprio =
5126 -+ bfq_weight_to_ioprio(entity->orig_weight);
5127 -+ } else if (entity->new_ioprio != entity->ioprio) {
5128 -+ entity->ioprio = entity->new_ioprio;
5129 -+ entity->orig_weight =
5130 -+ bfq_ioprio_to_weight(entity->ioprio);
5131 -+ } else
5132 -+ entity->new_weight = entity->orig_weight =
5133 -+ bfq_ioprio_to_weight(entity->ioprio);
5134 -+
5135 -+ entity->ioprio_class = entity->new_ioprio_class;
5136 -+ entity->ioprio_changed = 0;
5137 -+
5138 -+ /*
5139 -+ * NOTE: here we may be changing the weight too early,
5140 -+ * this will cause unfairness. The correct approach
5141 -+ * would have required additional complexity to defer
5142 -+ * weight changes to the proper time instants (i.e.,
5143 -+ * when entity->finish <= old_st->vtime).
5144 -+ */
5145 -+ new_st = bfq_entity_service_tree(entity);
5146 -+ entity->weight = entity->orig_weight *
5147 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
5148 -+ new_st->wsum += entity->weight;
5149 -+
5150 -+ if (new_st != old_st)
5151 -+ entity->start = new_st->vtime;
5152 -+ }
5153 -+
5154 -+ return new_st;
5155 -+}
5156 -+
5157 -+/**
5158 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
5159 -+ * @bfqq: the queue being served.
5160 -+ * @served: bytes to transfer.
5161 -+ *
5162 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
5163 -+ * are synchronized every time a new bfqq is selected for service. For now,
5164 -+ * we keep it this way to better check consistency.
5165 -+ */
5166 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
5167 -+{
5168 -+ struct bfq_entity *entity = &bfqq->entity;
5169 -+ struct bfq_service_tree *st;
5170 -+
5171 -+ for_each_entity(entity) {
5172 -+ st = bfq_entity_service_tree(entity);
5173 -+
5174 -+ entity->service += served;
5175 -+ BUG_ON(entity->service > entity->budget);
5176 -+ BUG_ON(st->wsum == 0);
5177 -+
5178 -+ st->vtime += bfq_delta(served, st->wsum);
5179 -+ bfq_forget_idle(st);
5180 -+ }
5181 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
5182 -+}
5183 -+
5184 -+/**
5185 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
5186 -+ * @bfqq: the queue that needs a service update.
5187 -+ *
5188 -+ * When it's not possible to be fair in the service domain, because
5189 -+ * a queue is not consuming its budget fast enough (the meaning of
5190 -+ * fast depends on the timeout parameter), we charge it a full
5191 -+ * budget. In this way we should obtain a sort of time-domain
5192 -+ * fairness among all the seeky/slow queues.
5193 -+ */
5194 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5195 -+{
5196 -+ struct bfq_entity *entity = &bfqq->entity;
5197 -+
5198 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5199 -+
5200 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5201 -+}
5202 -+
5203 -+/**
5204 -+ * __bfq_activate_entity - activate an entity.
5205 -+ * @entity: the entity being activated.
5206 -+ *
5207 -+ * Called whenever an entity is activated, i.e., it is not active and one
5208 -+ * of its children receives a new request, or has to be reactivated due to
5209 -+ * budget exhaustion. It uses the current budget of the entity (and the
5210 -+ * service received if @entity is active) to calculate its
5211 -+ * timestamps.
5212 -+ */
5213 -+static void __bfq_activate_entity(struct bfq_entity *entity)
5214 -+{
5215 -+ struct bfq_sched_data *sd = entity->sched_data;
5216 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5217 -+
5218 -+ if (entity == sd->active_entity) {
5219 -+ BUG_ON(entity->tree != NULL);
5220 -+ /*
5221 -+ * If we are requeueing the current entity we have
5222 -+ * to take care not to charge it for service it has
5223 -+ * not received.
5224 -+ */
5225 -+ bfq_calc_finish(entity, entity->service);
5226 -+ entity->start = entity->finish;
5227 -+ sd->active_entity = NULL;
5228 -+ } else if (entity->tree == &st->active) {
5229 -+ /*
5230 -+ * Requeueing an entity due to a change of some
5231 -+ * next_active entity below it. We reuse the old
5232 -+ * start time.
5233 -+ */
5234 -+ bfq_active_extract(st, entity);
5235 -+ } else if (entity->tree == &st->idle) {
5236 -+ /*
5237 -+ * Must be on the idle tree, bfq_idle_extract() will
5238 -+ * check for that.
5239 -+ */
5240 -+ bfq_idle_extract(st, entity);
5241 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
5242 -+ st->vtime : entity->finish;
5243 -+ } else {
5244 -+ /*
5245 -+ * The finish time of the entity may be invalid, and
5246 -+ * it is in the past for sure, otherwise the queue
5247 -+ * would have been on the idle tree.
5248 -+ */
5249 -+ entity->start = st->vtime;
5250 -+ st->wsum += entity->weight;
5251 -+ bfq_get_entity(entity);
5252 -+
5253 -+ BUG_ON(entity->on_st);
5254 -+ entity->on_st = 1;
5255 -+ }
5256 -+
5257 -+ st = __bfq_entity_update_weight_prio(st, entity);
5258 -+ bfq_calc_finish(entity, entity->budget);
5259 -+ bfq_active_insert(st, entity);
5260 -+}
5261 -+
5262 -+/**
5263 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5264 -+ * @entity: the entity to activate.
5265 -+ *
5266 -+ * Activate @entity and all the entities on the path from it to the root.
5267 -+ */
5268 -+static void bfq_activate_entity(struct bfq_entity *entity)
5269 -+{
5270 -+ struct bfq_sched_data *sd;
5271 -+
5272 -+ for_each_entity(entity) {
5273 -+ __bfq_activate_entity(entity);
5274 -+
5275 -+ sd = entity->sched_data;
5276 -+ if (!bfq_update_next_active(sd))
5277 -+ /*
5278 -+ * No need to propagate the activation to the
5279 -+ * upper entities, as they will be updated when
5280 -+ * the active entity is rescheduled.
5281 -+ */
5282 -+ break;
5283 -+ }
5284 -+}
5285 -+
5286 -+/**
5287 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5288 -+ * @entity: the entity to deactivate.
5289 -+ * @requeue: if false, the entity will not be put into the idle tree.
5290 -+ *
5291 -+ * Deactivate an entity, independently of its previous state. If the
5292 -+ * entity was not on a service tree, just return; otherwise, if it is on
5293 -+ * any scheduler tree, extract it from that tree and, if necessary
5294 -+ * and if the caller specified @requeue, put it on the idle tree.
5295 -+ *
5296 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
5297 -+ * if the entity was under service or if it was the next_active for
5298 -+ * its sched_data; return %0 otherwise.
5299 -+ */
5300 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5301 -+{
5302 -+ struct bfq_sched_data *sd = entity->sched_data;
5303 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5304 -+ int was_active = entity == sd->active_entity;
5305 -+ int ret = 0;
5306 -+
5307 -+ if (!entity->on_st)
5308 -+ return 0;
5309 -+
5310 -+ BUG_ON(was_active && entity->tree != NULL);
5311 -+
5312 -+ if (was_active) {
5313 -+ bfq_calc_finish(entity, entity->service);
5314 -+ sd->active_entity = NULL;
5315 -+ } else if (entity->tree == &st->active)
5316 -+ bfq_active_extract(st, entity);
5317 -+ else if (entity->tree == &st->idle)
5318 -+ bfq_idle_extract(st, entity);
5319 -+ else if (entity->tree != NULL)
5320 -+ BUG();
5321 -+
5322 -+ if (was_active || sd->next_active == entity)
5323 -+ ret = bfq_update_next_active(sd);
5324 -+
5325 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5326 -+ bfq_forget_entity(st, entity);
5327 -+ else
5328 -+ bfq_idle_insert(st, entity);
5329 -+
5330 -+ BUG_ON(sd->active_entity == entity);
5331 -+ BUG_ON(sd->next_active == entity);
5332 -+
5333 -+ return ret;
5334 -+}
5335 -+
5336 -+/**
5337 -+ * bfq_deactivate_entity - deactivate an entity.
5338 -+ * @entity: the entity to deactivate.
5339 -+ * @requeue: true if the entity can be put on the idle tree
5340 -+ */
5341 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5342 -+{
5343 -+ struct bfq_sched_data *sd;
5344 -+ struct bfq_entity *parent;
5345 -+
5346 -+ for_each_entity_safe(entity, parent) {
5347 -+ sd = entity->sched_data;
5348 -+
5349 -+ if (!__bfq_deactivate_entity(entity, requeue))
5350 -+ /*
5351 -+ * The parent entity is still backlogged, and
5352 -+ * we don't need to update it as it is still
5353 -+ * under service.
5354 -+ */
5355 -+ break;
5356 -+
5357 -+ if (sd->next_active != NULL)
5358 -+ /*
5359 -+ * The parent entity is still backlogged and
5360 -+ * the budgets on the path towards the root
5361 -+ * need to be updated.
5362 -+ */
5363 -+ goto update;
5364 -+
5365 -+ /*
5366 -+ * If we reach this point, the parent is no longer backlogged and
5367 -+ * we want to propagate the dequeue upwards.
5368 -+ */
5369 -+ requeue = 1;
5370 -+ }
5371 -+
5372 -+ return;
5373 -+
5374 -+update:
5375 -+ entity = parent;
5376 -+ for_each_entity(entity) {
5377 -+ __bfq_activate_entity(entity);
5378 -+
5379 -+ sd = entity->sched_data;
5380 -+ if (!bfq_update_next_active(sd))
5381 -+ break;
5382 -+ }
5383 -+}
5384 -+
5385 -+/**
5386 -+ * bfq_update_vtime - update vtime if necessary.
5387 -+ * @st: the service tree to act upon.
5388 -+ *
5389 -+ * If necessary update the service tree vtime to have at least one
5390 -+ * eligible entity, skipping to its start time. Assumes that the
5391 -+ * active tree of the device is not empty.
5392 -+ *
5393 -+ * NOTE: this hierarchical implementation updates vtimes quite often;
5394 -+ * we may end up with reactivated tasks getting timestamps after a
5395 -+ * vtime skip done because we needed a ->first_active entity on some
5396 -+ * intermediate node.
5397 -+ */
5398 -+static void bfq_update_vtime(struct bfq_service_tree *st)
5399 -+{
5400 -+ struct bfq_entity *entry;
5401 -+ struct rb_node *node = st->active.rb_node;
5402 -+
5403 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5404 -+ if (bfq_gt(entry->min_start, st->vtime)) {
5405 -+ st->vtime = entry->min_start;
5406 -+ bfq_forget_idle(st);
5407 -+ }
5408 -+}
5409 -+
5410 -+/**
5411 -+ * bfq_first_active - find the eligible entity with the smallest finish time
5412 -+ * @st: the service tree to select from.
5413 -+ *
5414 -+ * This function searches for the first schedulable entity, starting from
5415 -+ * the root of the tree and going to the left whenever that side contains
5416 -+ * a subtree with at least one eligible (start <= vtime) entity. The path
5417 -+ * on the right is followed only if a) the left subtree contains no eligible
5418 -+ * entities and b) no eligible entity has been found yet.
5419 -+ */
5420 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5421 -+{
5422 -+ struct bfq_entity *entry, *first = NULL;
5423 -+ struct rb_node *node = st->active.rb_node;
5424 -+
5425 -+ while (node != NULL) {
5426 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5427 -+left:
5428 -+ if (!bfq_gt(entry->start, st->vtime))
5429 -+ first = entry;
5430 -+
5431 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5432 -+
5433 -+ if (node->rb_left != NULL) {
5434 -+ entry = rb_entry(node->rb_left,
5435 -+ struct bfq_entity, rb_node);
5436 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
5437 -+ node = node->rb_left;
5438 -+ goto left;
5439 -+ }
5440 -+ }
5441 -+ if (first != NULL)
5442 -+ break;
5443 -+ node = node->rb_right;
5444 -+ }
5445 -+
5446 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5447 -+ return first;
5448 -+}
5449 -+
5450 -+/**
5451 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5452 -+ * @st: the service tree.
5453 -+ *
5454 -+ * Update the virtual time in @st and return the first eligible entity
5455 -+ * it contains.
5456 -+ */
5457 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5458 -+ bool force)
5459 -+{
5460 -+ struct bfq_entity *entity, *new_next_active = NULL;
5461 -+
5462 -+ if (RB_EMPTY_ROOT(&st->active))
5463 -+ return NULL;
5464 -+
5465 -+ bfq_update_vtime(st);
5466 -+ entity = bfq_first_active_entity(st);
5467 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
5468 -+
5469 -+ /*
5470 -+ * If the chosen entity does not match the sched_data's
5471 -+ * next_active and we are forcibly serving the IDLE priority
5472 -+ * class tree, bubble up the budget update.
5473 -+ */
5474 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
5475 -+ new_next_active = entity;
5476 -+ for_each_entity(new_next_active)
5477 -+ bfq_update_budget(new_next_active);
5478 -+ }
5479 -+
5480 -+ return entity;
5481 -+}
5482 -+
5483 -+/**
5484 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5485 -+ * @sd: the sched_data.
5486 -+ * @extract: if true the returned entity will be also extracted from @sd.
5487 -+ *
5488 -+ * NOTE: since we cache the next_active entity at each level of the
5489 -+ * hierarchy, the complexity of the lookup can be decreased with
5490 -+ * absolutely no effort, by just returning the cached next_active value;
5491 -+ * we prefer to do full lookups to test the consistency of the data
5492 -+ * structures.
5493 -+ */
5494 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5495 -+ int extract,
5496 -+ struct bfq_data *bfqd)
5497 -+{
5498 -+ struct bfq_service_tree *st = sd->service_tree;
5499 -+ struct bfq_entity *entity;
5500 -+ int i = 0;
5501 -+
5502 -+ BUG_ON(sd->active_entity != NULL);
5503 -+
5504 -+ if (bfqd != NULL &&
5505 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5506 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
5507 -+ true);
5508 -+ if (entity != NULL) {
5509 -+ i = BFQ_IOPRIO_CLASSES - 1;
5510 -+ bfqd->bfq_class_idle_last_service = jiffies;
5511 -+ sd->next_active = entity;
5512 -+ }
5513 -+ }
5514 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5515 -+ entity = __bfq_lookup_next_entity(st + i, false);
5516 -+ if (entity != NULL) {
5517 -+ if (extract) {
5518 -+ bfq_check_next_active(sd, entity);
5519 -+ bfq_active_extract(st + i, entity);
5520 -+ sd->active_entity = entity;
5521 -+ sd->next_active = NULL;
5522 -+ }
5523 -+ break;
5524 -+ }
5525 -+ }
5526 -+
5527 -+ return entity;
5528 -+}
5529 -+
5530 -+/*
5531 -+ * Get next queue for service.
5532 -+ */
5533 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5534 -+{
5535 -+ struct bfq_entity *entity = NULL;
5536 -+ struct bfq_sched_data *sd;
5537 -+ struct bfq_queue *bfqq;
5538 -+
5539 -+ BUG_ON(bfqd->in_service_queue != NULL);
5540 -+
5541 -+ if (bfqd->busy_queues == 0)
5542 -+ return NULL;
5543 -+
5544 -+ sd = &bfqd->root_group->sched_data;
5545 -+ for (; sd != NULL; sd = entity->my_sched_data) {
5546 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5547 -+ BUG_ON(entity == NULL);
5548 -+ entity->service = 0;
5549 -+ }
5550 -+
5551 -+ bfqq = bfq_entity_to_bfqq(entity);
5552 -+ BUG_ON(bfqq == NULL);
5553 -+
5554 -+ return bfqq;
5555 -+}
5556 -+
5557 -+/*
5558 -+ * Forced extraction of the given queue.
5559 -+ */
5560 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5561 -+ struct bfq_queue *bfqq)
5562 -+{
5563 -+ struct bfq_entity *entity;
5564 -+ struct bfq_sched_data *sd;
5565 -+
5566 -+ BUG_ON(bfqd->in_service_queue != NULL);
5567 -+
5568 -+ entity = &bfqq->entity;
5569 -+ /*
5570 -+ * Bubble up extraction/update from the leaf to the root.
5571 -+ */
5572 -+ for_each_entity(entity) {
5573 -+ sd = entity->sched_data;
5574 -+ bfq_update_budget(entity);
5575 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
5576 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5577 -+ sd->active_entity = entity;
5578 -+ sd->next_active = NULL;
5579 -+ entity->service = 0;
5580 -+ }
5581 -+
5582 -+ return;
5583 -+}
5584 -+
5585 -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
5586 -+{
5587 -+ if (bfqd->in_service_bic != NULL) {
5588 -+ put_io_context(bfqd->in_service_bic->icq.ioc);
5589 -+ bfqd->in_service_bic = NULL;
5590 -+ }
5591 -+
5592 -+ bfqd->in_service_queue = NULL;
5593 -+ del_timer(&bfqd->idle_slice_timer);
5594 -+}
5595 -+
5596 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5597 -+ int requeue)
5598 -+{
5599 -+ struct bfq_entity *entity = &bfqq->entity;
5600 -+
5601 -+ if (bfqq == bfqd->in_service_queue)
5602 -+ __bfq_bfqd_reset_in_service(bfqd);
5603 -+
5604 -+ bfq_deactivate_entity(entity, requeue);
5605 -+}
5606 -+
5607 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5608 -+{
5609 -+ struct bfq_entity *entity = &bfqq->entity;
5610 -+
5611 -+ bfq_activate_entity(entity);
5612 -+}
5613 -+
5614 -+/*
5615 -+ * Called when the bfqq no longer has requests pending; remove it from
5616 -+ * the service tree.
5617 -+ */
5618 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5619 -+ int requeue)
5620 -+{
5621 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
5622 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5623 -+
5624 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5625 -+
5626 -+ bfq_clear_bfqq_busy(bfqq);
5627 -+
5628 -+ BUG_ON(bfqd->busy_queues == 0);
5629 -+ bfqd->busy_queues--;
5630 -+ if (bfqq->raising_coeff > 1)
5631 -+ bfqd->raised_busy_queues--;
5632 -+
5633 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5634 -+}
5635 -+
5636 -+/*
5637 -+ * Called when an inactive queue receives a new request.
5638 -+ */
5639 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5640 -+{
5641 -+ BUG_ON(bfq_bfqq_busy(bfqq));
5642 -+ BUG_ON(bfqq == bfqd->in_service_queue);
5643 -+
5644 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5645 -+
5646 -+ bfq_activate_bfqq(bfqd, bfqq);
5647 -+
5648 -+ bfq_mark_bfqq_busy(bfqq);
5649 -+ bfqd->busy_queues++;
5650 -+ if (bfqq->raising_coeff > 1)
5651 -+ bfqd->raised_busy_queues++;
5652 -+}
5653 -diff --git a/block/bfq.h b/block/bfq.h
5654 -new file mode 100644
5655 -index 0000000..68b28e3
5656 ---- /dev/null
5657 -+++ b/block/bfq.h
5658 -@@ -0,0 +1,614 @@
5659 -+/*
5660 -+ * BFQ-v7 for 3.13.0: data structures and common function prototypes.
5661 -+ *
5662 -+ * Based on ideas and code from CFQ:
5663 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5664 -+ *
5665 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5666 -+ * Paolo Valente <paolo.valente@×××××××.it>
5667 -+ *
5668 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5669 -+ */
5670 -+
5671 -+#ifndef _BFQ_H
5672 -+#define _BFQ_H
5673 -+
5674 -+#include <linux/blktrace_api.h>
5675 -+#include <linux/hrtimer.h>
5676 -+#include <linux/ioprio.h>
5677 -+#include <linux/rbtree.h>
5678 -+
5679 -+#define BFQ_IOPRIO_CLASSES 3
5680 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5681 -+
5682 -+#define BFQ_MIN_WEIGHT 1
5683 -+#define BFQ_MAX_WEIGHT 1000
5684 -+
5685 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
5686 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
5687 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5688 -+
5689 -+struct bfq_entity;
5690 -+
5691 -+/**
5692 -+ * struct bfq_service_tree - per ioprio_class service tree.
5693 -+ * @active: tree for active entities (i.e., those backlogged).
5694 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5695 -+ * @first_idle: idle entity with minimum F_i.
5696 -+ * @last_idle: idle entity with maximum F_i.
5697 -+ * @vtime: scheduler virtual time.
5698 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5699 -+ *
5700 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5701 -+ * ioprio_class has its own independent scheduler, and so its own
5702 -+ * bfq_service_tree. All the fields are protected by the queue lock
5703 -+ * of the containing bfqd.
5704 -+ */
5705 -+struct bfq_service_tree {
5706 -+ struct rb_root active;
5707 -+ struct rb_root idle;
5708 -+
5709 -+ struct bfq_entity *first_idle;
5710 -+ struct bfq_entity *last_idle;
5711 -+
5712 -+ u64 vtime;
5713 -+ unsigned long wsum;
5714 -+};
5715 -+
5716 -+/**
5717 -+ * struct bfq_sched_data - multi-class scheduler.
5718 -+ * @active_entity: entity under service.
5719 -+ * @next_active: head-of-the-line entity in the scheduler.
5720 -+ * @service_tree: array of service trees, one per ioprio_class.
5721 -+ *
5722 -+ * bfq_sched_data is the basic scheduler queue. It supports three
5723 -+ * ioprio_classes, and can be used either as a toplevel queue or as
5724 -+ * an intermediate queue on a hierarchical setup.
5725 -+ * @next_active points to the active entity of the sched_data service
5726 -+ * trees that will be scheduled next.
5727 -+ *
5728 -+ * The supported ioprio_classes are the same as in CFQ, in descending
5729 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5730 -+ * Requests from higher priority queues are served before all the
5731 -+ * requests from lower priority queues; among queues of the same
5732 -+ * class, requests are served according to B-WF2Q+.
5733 -+ * All the fields are protected by the queue lock of the containing bfqd.
5734 -+ */
5735 -+struct bfq_sched_data {
5736 -+ struct bfq_entity *active_entity;
5737 -+ struct bfq_entity *next_active;
5738 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5739 -+};
5740 -+
5741 -+/**
5742 -+ * struct bfq_entity - schedulable entity.
5743 -+ * @rb_node: service_tree member.
5744 -+ * @on_st: flag, true if the entity is on a tree (either the active or
5745 -+ * the idle one of its service_tree).
5746 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5747 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
5748 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5749 -+ * @min_start: minimum start time of the (active) subtree rooted at
5750 -+ * this entity; used for O(log N) lookups into active trees.
5751 -+ * @service: service received during the last round of service.
5752 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5753 -+ * @weight: weight of the queue
5754 -+ * @parent: parent entity, for hierarchical scheduling.
5755 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5756 -+ * associated scheduler queue, %NULL on leaf nodes.
5757 -+ * @sched_data: the scheduler queue this entity belongs to.
5758 -+ * @ioprio: the ioprio in use.
5759 -+ * @new_weight: when a weight change is requested, the new weight value.
5760 -+ * @orig_weight: original weight, used to implement weight boosting
5761 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5762 -+ * @ioprio_class: the ioprio_class in use.
5763 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
5764 -+ * ioprio_class value.
5765 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5766 -+ * ioprio_class change.
5767 -+ *
5768 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5769 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5770 -+ * entity belongs to the sched_data of the parent group in the cgroup
5771 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
5772 -+ * in @my_sched_data.
5773 -+ *
5774 -+ * Each entity stores independently its priority values; this would
5775 -+ * allow different weights on different devices, but this
5776 -+ * functionality is not yet exported to userspace. Priorities and
5777 -+ * weights are updated lazily, first storing the new values into the
5778 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
5779 -+ * there is a transition in the entity state that allows the priority
5780 -+ * update to take place the effective and the requested priority
5781 -+ * values are synchronized.
5782 -+ *
5783 -+ * Unless cgroups are used, the weight value is calculated from the
5784 -+ * ioprio to export the same interface as CFQ. When dealing with
5785 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
5786 -+ * time consuming their budget and have true sequential behavior, and
5787 -+ * when there are no external factors breaking anticipation) the
5788 -+ * relative weights at each level of the cgroups hierarchy should be
5789 -+ * guaranteed. All the fields are protected by the queue lock of the
5790 -+ * containing bfqd.
5791 -+ */
5792 -+struct bfq_entity {
5793 -+ struct rb_node rb_node;
5794 -+
5795 -+ int on_st;
5796 -+
5797 -+ u64 finish;
5798 -+ u64 start;
5799 -+
5800 -+ struct rb_root *tree;
5801 -+
5802 -+ u64 min_start;
5803 -+
5804 -+ unsigned long service, budget;
5805 -+ unsigned short weight, new_weight;
5806 -+ unsigned short orig_weight;
5807 -+
5808 -+ struct bfq_entity *parent;
5809 -+
5810 -+ struct bfq_sched_data *my_sched_data;
5811 -+ struct bfq_sched_data *sched_data;
5812 -+
5813 -+ unsigned short ioprio, new_ioprio;
5814 -+ unsigned short ioprio_class, new_ioprio_class;
5815 -+
5816 -+ int ioprio_changed;
5817 -+};
5818 -+
5819 -+struct bfq_group;
5820 -+
5821 -+/**
5822 -+ * struct bfq_queue - leaf schedulable entity.
5823 -+ * @ref: reference counter.
5824 -+ * @bfqd: parent bfq_data.
5825 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
5826 -+ * one or more other queues.
5827 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5828 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5829 -+ * @sort_list: sorted list of pending requests.
5830 -+ * @next_rq: if fifo isn't expired, next request to serve.
5831 -+ * @queued: nr of requests queued in @sort_list.
5832 -+ * @allocated: currently allocated requests.
5833 -+ * @meta_pending: pending metadata requests.
5834 -+ * @fifo: fifo list of requests in sort_list.
5835 -+ * @entity: entity representing this queue in the scheduler.
5836 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
5837 -+ * @budget_timeout: budget expiration (in jiffies).
5838 -+ * @dispatched: number of requests on the dispatch list or inside driver.
5839 -+ * @org_ioprio: saved ioprio during boosted periods.
5840 -+ * @flags: status flags.
5841 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5842 -+ * @seek_samples: number of seeks sampled
5843 -+ * @seek_total: sum of the distances of the seeks sampled
5844 -+ * @seek_mean: mean seek distance
5845 -+ * @last_request_pos: position of the last request enqueued
5846 -+ * @pid: pid of the process owning the queue, used for logging purposes.
5847 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
5848 -+ * @raising_cur_max_time: current max raising time for this queue
5849 -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
5850 -+ * idle to backlogged
5851 -+ * @service_from_backlogged: cumulative service received from the @bfq_queue
5852 -+ * since the last transition from idle to backlogged
5853 -+ *
5854 -+ * A bfq_queue is a leaf request queue; it can be associated with one io_context
5855 -+ * or more (if it is an async one). @cgroup holds a reference to the
5856 -+ * cgroup, to be sure that it does not disappear while a bfqq still
5857 -+ * references it (mostly to avoid races between request issuing and task
5858 -+ * migration followed by cgroup destruction).
5859 -+ * All the fields are protected by the queue lock of the containing bfqd.
5860 -+ */
5861 -+struct bfq_queue {
5862 -+ atomic_t ref;
5863 -+ struct bfq_data *bfqd;
5864 -+
5865 -+ /* fields for cooperating queues handling */
5866 -+ struct bfq_queue *new_bfqq;
5867 -+ struct rb_node pos_node;
5868 -+ struct rb_root *pos_root;
5869 -+
5870 -+ struct rb_root sort_list;
5871 -+ struct request *next_rq;
5872 -+ int queued[2];
5873 -+ int allocated[2];
5874 -+ int meta_pending;
5875 -+ struct list_head fifo;
5876 -+
5877 -+ struct bfq_entity entity;
5878 -+
5879 -+ unsigned long max_budget;
5880 -+ unsigned long budget_timeout;
5881 -+
5882 -+ int dispatched;
5883 -+
5884 -+ unsigned short org_ioprio;
5885 -+
5886 -+ unsigned int flags;
5887 -+
5888 -+ struct list_head bfqq_list;
5889 -+
5890 -+ unsigned int seek_samples;
5891 -+ u64 seek_total;
5892 -+ sector_t seek_mean;
5893 -+ sector_t last_request_pos;
5894 -+
5895 -+ pid_t pid;
5896 -+
5897 -+ /* weight-raising fields */
5898 -+ unsigned int raising_cur_max_time;
5899 -+ unsigned long soft_rt_next_start;
5900 -+ u64 last_rais_start_finish;
5901 -+ unsigned int raising_coeff;
5902 -+ u64 last_idle_bklogged;
5903 -+ unsigned long service_from_backlogged;
5904 -+};
5905 -+
5906 -+/**
5907 -+ * struct bfq_ttime - per process thinktime stats.
5908 -+ * @ttime_total: total process thinktime
5909 -+ * @ttime_samples: number of thinktime samples
5910 -+ * @ttime_mean: average process thinktime
5911 -+ */
5912 -+struct bfq_ttime {
5913 -+ unsigned long last_end_request;
5914 -+
5915 -+ unsigned long ttime_total;
5916 -+ unsigned long ttime_samples;
5917 -+ unsigned long ttime_mean;
5918 -+};
5919 -+
5920 -+/**
5921 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
5922 -+ * @icq: associated io_cq structure
5923 -+ * @bfqq: array of two process queues, the sync and the async
5924 -+ * @ttime: associated @bfq_ttime struct
5925 -+ */
5926 -+struct bfq_io_cq {
5927 -+ struct io_cq icq; /* must be the first member */
5928 -+ struct bfq_queue *bfqq[2];
5929 -+ struct bfq_ttime ttime;
5930 -+ int ioprio;
5931 -+};
5932 -+
5933 -+/**
5934 -+ * struct bfq_data - per device data structure.
5935 -+ * @queue: request queue for the managed device.
5936 -+ * @root_group: root bfq_group for the device.
5937 -+ * @rq_pos_tree: rbtree sorted by next_request position,
5938 -+ * used when determining if two or more queues
5939 -+ * have interleaving requests (see bfq_close_cooperator).
5940 -+ * @busy_queues: number of bfq_queues containing requests (including the
5941 -+ * queue under service, even if it is idling).
5942 -+ * @raised_busy_queues: number of weight-raised busy bfq_queues.
5943 -+ * @queued: number of queued requests.
5944 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
5945 -+ * @sync_flight: number of sync requests in the driver.
5946 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5947 -+ * completed requests.
5948 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5949 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5950 -+ * @budgets_assigned: number of budgets assigned.
5951 -+ * @idle_slice_timer: timer set when idling for the next sequential request
5952 -+ * from the queue under service.
5953 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
5954 -+ * @in_service_queue: bfq_queue under service.
5955 -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
5956 -+ * @last_position: on-disk position of the last served request.
5957 -+ * @last_budget_start: beginning of the last budget.
5958 -+ * @last_idling_start: beginning of the last idle slice.
5959 -+ * @peak_rate: peak transfer rate observed for a budget.
5960 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5961 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5962 -+ * @group_list: list of all the bfq_groups active on the device.
5963 -+ * @active_list: list of all the bfq_queues active on the device.
5964 -+ * @idle_list: list of all the bfq_queues idle on the device.
5965 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
5966 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5967 -+ * requests are served in fifo order.
5968 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5969 -+ * @bfq_back_max: maximum allowed backward seek.
5970 -+ * @bfq_slice_idle: maximum idling time.
5971 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5972 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5973 -+ * async queues.
5974 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5975 -+ * prevent seeky queues from imposing long latencies on
5976 -+ * well-behaved ones (this also implies that seeky queues cannot
5977 -+ * receive guarantees in the service domain; after a timeout
5978 -+ * they are charged for the whole allocated budget, to try
5979 -+ * to preserve a behavior reasonably fair among them, but
5980 -+ * without service-domain guarantees).
5981 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5982 -+ * queue is multiplied
5983 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5984 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5985 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5986 -+ * may be reactivated for a queue (in jiffies)
5987 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5988 -+ * after which weight-raising may be
5989 -+ * reactivated for an already busy queue
5990 -+ * (in jiffies)
5991 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5992 -+ * in sectors per second.
5993 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
5994 -+ * duration of the weight raising automatically
5995 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5996 -+ *
5997 -+ * All the fields are protected by the @queue lock.
5998 -+ */
5999 -+struct bfq_data {
6000 -+ struct request_queue *queue;
6001 -+
6002 -+ struct bfq_group *root_group;
6003 -+
6004 -+ struct rb_root rq_pos_tree;
6005 -+
6006 -+ int busy_queues;
6007 -+ int raised_busy_queues;
6008 -+ int queued;
6009 -+ int rq_in_driver;
6010 -+ int sync_flight;
6011 -+
6012 -+ int max_rq_in_driver;
6013 -+ int hw_tag_samples;
6014 -+ int hw_tag;
6015 -+
6016 -+ int budgets_assigned;
6017 -+
6018 -+ struct timer_list idle_slice_timer;
6019 -+ struct work_struct unplug_work;
6020 -+
6021 -+ struct bfq_queue *in_service_queue;
6022 -+ struct bfq_io_cq *in_service_bic;
6023 -+
6024 -+ sector_t last_position;
6025 -+
6026 -+ ktime_t last_budget_start;
6027 -+ ktime_t last_idling_start;
6028 -+ int peak_rate_samples;
6029 -+ u64 peak_rate;
6030 -+ unsigned long bfq_max_budget;
6031 -+
6032 -+ struct hlist_head group_list;
6033 -+ struct list_head active_list;
6034 -+ struct list_head idle_list;
6035 -+
6036 -+ unsigned int bfq_quantum;
6037 -+ unsigned int bfq_fifo_expire[2];
6038 -+ unsigned int bfq_back_penalty;
6039 -+ unsigned int bfq_back_max;
6040 -+ unsigned int bfq_slice_idle;
6041 -+ u64 bfq_class_idle_last_service;
6042 -+
6043 -+ unsigned int bfq_user_max_budget;
6044 -+ unsigned int bfq_max_budget_async_rq;
6045 -+ unsigned int bfq_timeout[2];
6046 -+
6047 -+ bool low_latency;
6048 -+
6049 -+ /* parameters of the low_latency heuristics */
6050 -+ unsigned int bfq_raising_coeff;
6051 -+ unsigned int bfq_raising_max_time;
6052 -+ unsigned int bfq_raising_rt_max_time;
6053 -+ unsigned int bfq_raising_min_idle_time;
6054 -+ unsigned long bfq_raising_min_inter_arr_async;
6055 -+ unsigned int bfq_raising_max_softrt_rate;
6056 -+ u64 RT_prod;
6057 -+
6058 -+ struct bfq_queue oom_bfqq;
6059 -+};
6060 -+
6061 -+enum bfqq_state_flags {
6062 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
6063 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
6064 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
6065 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
6066 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
6067 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
6068 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
6069 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
6070 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6071 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
6072 -+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
6073 -+};
6074 -+
6075 -+#define BFQ_BFQQ_FNS(name) \
6076 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
6077 -+{ \
6078 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
6079 -+} \
6080 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
6081 -+{ \
6082 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
6083 -+} \
6084 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
6085 -+{ \
6086 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
6087 -+}
6088 -+
6089 -+BFQ_BFQQ_FNS(busy);
6090 -+BFQ_BFQQ_FNS(wait_request);
6091 -+BFQ_BFQQ_FNS(must_alloc);
6092 -+BFQ_BFQQ_FNS(fifo_expire);
6093 -+BFQ_BFQQ_FNS(idle_window);
6094 -+BFQ_BFQQ_FNS(prio_changed);
6095 -+BFQ_BFQQ_FNS(sync);
6096 -+BFQ_BFQQ_FNS(budget_new);
6097 -+BFQ_BFQQ_FNS(coop);
6098 -+BFQ_BFQQ_FNS(split_coop);
6099 -+BFQ_BFQQ_FNS(softrt_update);
6100 -+#undef BFQ_BFQQ_FNS
6101 -+
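For reference, BFQ_BFQQ_FNS(busy) above expands to three inline helpers along these lines (written out by hand; the other flags follow the same pattern):

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}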
6102 -+/* Logging facilities. */
6103 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
6104 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
6105 -+
6106 -+#define bfq_log(bfqd, fmt, args...) \
6107 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
6108 -+
6109 -+/* Expiration reasons. */
6110 -+enum bfqq_expiration {
6111 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
6112 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
6113 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
6114 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
6115 -+};
6116 -+
6117 -+#ifdef CONFIG_CGROUP_BFQIO
6118 -+/**
6119 -+ * struct bfq_group - per (device, cgroup) data structure.
6120 -+ * @entity: schedulable entity to insert into the parent group sched_data.
6121 -+ * @sched_data: own sched_data, to contain child entities (they may be
6122 -+ * both bfq_queues and bfq_groups).
6123 -+ * @group_node: node to be inserted into the @group_data list of the
6124 -+ * containing cgroup's bfqio_cgroup.
6125 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
6126 -+ * of the groups active on the same device; used for cleanup.
6127 -+ * @bfqd: the bfq_data for the device this group acts upon.
6128 -+ * @async_bfqq: array of async queues for all the tasks belonging to
6129 -+ * the group, one queue per ioprio value per ioprio_class,
6130 -+ * except for the idle class that has only one queue.
6131 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
6132 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
6133 -+ * to avoid too many special cases during group creation/migration.
6134 -+ *
6135 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
6136 -+ * there is a set of bfq_groups, each one collecting the lower-level
6137 -+ * entities belonging to the group that are acting on the same device.
6138 -+ *
6139 -+ * Locking works as follows:
6140 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
6141 -+ * via RCU from its readers.
6142 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
6143 -+ * from the readers.
6144 -+ * o All the other fields are protected by the @bfqd queue lock.
6145 -+ */
6146 -+struct bfq_group {
6147 -+ struct bfq_entity entity;
6148 -+ struct bfq_sched_data sched_data;
6149 -+
6150 -+ struct hlist_node group_node;
6151 -+ struct hlist_node bfqd_node;
6152 -+
6153 -+ void *bfqd;
6154 -+
6155 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6156 -+ struct bfq_queue *async_idle_bfqq;
6157 -+
6158 -+ struct bfq_entity *my_entity;
6159 -+};
6160 -+
6161 -+/**
6162 -+ * struct bfqio_cgroup - bfq cgroup data structure.
6163 -+ * @css: subsystem state for bfq in the containing cgroup.
6164 -+ * @online: flag marked when the subsystem is inserted.
6165 -+ * @weight: cgroup weight.
6166 -+ * @ioprio: cgroup ioprio.
6167 -+ * @ioprio_class: cgroup ioprio_class.
6168 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
6169 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
6170 -+ *
6171 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
6172 -+ * @ioprio and @ioprio_class are protected by @lock.
6173 -+ */
6174 -+struct bfqio_cgroup {
6175 -+ struct cgroup_subsys_state css;
6176 -+ bool online;
6177 -+
6178 -+ unsigned short weight, ioprio, ioprio_class;
6179 -+
6180 -+ spinlock_t lock;
6181 -+ struct hlist_head group_data;
6182 -+};
6183 -+#else
6184 -+struct bfq_group {
6185 -+ struct bfq_sched_data sched_data;
6186 -+
6187 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6188 -+ struct bfq_queue *async_idle_bfqq;
6189 -+};
6190 -+#endif
6191 -+
6192 -+static inline struct bfq_service_tree *
6193 -+bfq_entity_service_tree(struct bfq_entity *entity)
6194 -+{
6195 -+ struct bfq_sched_data *sched_data = entity->sched_data;
6196 -+ unsigned int idx = entity->ioprio_class - 1;
6197 -+
6198 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
6199 -+ BUG_ON(sched_data == NULL);
6200 -+
6201 -+ return sched_data->service_tree + idx;
6202 -+}
6203 -+
6204 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
6205 -+ int is_sync)
6206 -+{
6207 -+ return bic->bfqq[!!is_sync];
6208 -+}
6209 -+
6210 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
6211 -+ struct bfq_queue *bfqq, int is_sync)
6212 -+{
6213 -+ bic->bfqq[!!is_sync] = bfqq;
6214 -+}
6215 -+
6216 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
6217 -+{
6218 -+ return bic->icq.q->elevator->elevator_data;
6219 -+}
6220 -+
6221 -+/**
6222 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using an RCU-protected pointer.
6223 -+ * @ptr: a pointer to a bfqd.
6224 -+ * @flags: storage for the flags to be saved.
6225 -+ *
6226 -+ * This function allows bfqg->bfqd to be protected by the
6227 -+ * queue lock of the bfqd it references; the pointer is dereferenced
6228 -+ * under RCU, so the storage for bfqd is assured to be safe as long
6229 -+ * as the RCU read side critical section does not end. After the
6230 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
6231 -+ * sure that no other writer accessed it. If we raced with a writer,
6232 -+ * the function returns NULL, with the queue unlocked, otherwise it
6233 -+ * returns the dereferenced pointer, with the queue locked.
6234 -+ */
6235 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
6236 -+ unsigned long *flags)
6237 -+{
6238 -+ struct bfq_data *bfqd;
6239 -+
6240 -+ rcu_read_lock();
6241 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
6242 -+
6243 -+ if (bfqd != NULL) {
6244 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
6245 -+ if (*ptr == bfqd)
6246 -+ goto out;
6247 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6248 -+ }
6249 -+
6250 -+ bfqd = NULL;
6251 -+out:
6252 -+ rcu_read_unlock();
6253 -+ return bfqd;
6254 -+}
6255 -+
6256 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6257 -+ unsigned long *flags)
6258 -+{
6259 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6260 -+}
6261 -+
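For reference, the intended calling pattern for the pair above is roughly the following sketch; the caller is hypothetical, and only the NULL-on-race and locked-on-success semantics come from the comment above:

/* Hypothetical caller illustrating the lock/unlock pairing. */
static void example_locked_access(void **bfqd_ptr)	/* e.g. &bfqg->bfqd */
{
	unsigned long flags;
	struct bfq_data *bfqd = bfq_get_bfqd_locked(bfqd_ptr, &flags);

	if (bfqd == NULL)
		return;	/* raced with a writer: the queue lock is NOT held */

	/* ... work while holding bfqd->queue->queue_lock ... */

	bfq_put_bfqd_unlock(bfqd, &flags);
}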
6262 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6263 -+static void bfq_put_queue(struct bfq_queue *bfqq);
6264 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6265 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6266 -+ struct bfq_group *bfqg, int is_sync,
6267 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
6268 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6269 -+ struct bfq_group *bfqg);
6270 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6271 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6272 -+#endif
6273 ---
6274 -1.8.5.2
6275 -
6276
6277 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
6278 ===================================================================
6279 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-03-26 23:50:52 UTC (rev 2715)
6280 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-03-31 12:03:14 UTC (rev 2716)
6281 @@ -1,6040 +0,0 @@
6282 -From be5107dc591f7ae692ca7cceecbba72e4c174c37 Mon Sep 17 00:00:00 2001
6283 -From: Paolo Valente <paolo.valente@×××××××.it>
6284 -Date: Thu, 9 May 2013 19:10:02 +0200
6285 -Subject: [PATCH 2/3] block: introduce the BFQ-v7r1 I/O sched for 3.13
6286 -
6287 -Add the BFQ-v7r1 I/O scheduler to 3.13.
6288 -The general structure is borrowed from CFQ, as much of the code for
6289 -handling I/O contexts. Over time, several useful features have been
6290 -ported from CFQ as well (details in the changelog in README.BFQ). A
6291 -(bfq_)queue is associated to each task doing I/O on a device, and each
6292 -time a scheduling decision has to be made a queue is selected and served
6293 -until it expires.
6294 -
6295 - - Slices are given in the service domain: tasks are assigned
6296 - budgets, measured in number of sectors. Once got the disk, a task
6297 - must however consume its assigned budget within a configurable
6298 - maximum time (by default, the maximum possible value of the
6299 - budgets is automatically computed to comply with this timeout).
6300 - This allows the desired latency vs "throughput boosting" tradeoff
6301 - to be set.
6302 -
6303 - - Budgets are scheduled according to a variant of WF2Q+, implemented
6304 - using an augmented rb-tree to take eligibility into account while
6305 - preserving an O(log N) overall complexity.
6306 -
6307 - - A low-latency tunable is provided; if enabled, both interactive
6308 - and soft real-time applications are guaranteed a very low latency.
6309 -
6310 - - Latency guarantees are preserved also in the presence of NCQ.
6311 -
6312 - - Also with flash-based devices, a high throughput is achieved
6313 - while still preserving latency guarantees.
6314 -
6315 - - BFQ features Early Queue Merge (EQM), a sort of fusion of the
6316 - cooperating-queue-merging and the preemption mechanisms present
6317 - in CFQ. EQM is in fact a unified mechanism that tries to get a
6318 - sequential read pattern, and hence a high throughput, with any
6319 - set of processes performing interleaved I/O over a contiguous
6320 - sequence of sectors.
6321 -
6322 - - BFQ supports full hierarchical scheduling, exporting a cgroups
6323 - interface. Since each node has a full scheduler, each group can
6324 - be assigned its own weight.
6325 -
6326 - - If the cgroups interface is not used, only I/O priorities can be
6327 - assigned to processes, with ioprio values mapped to weights
6328 - with the relation weight = IOPRIO_BE_NR - ioprio.
6329 -
6330 - - ioprio classes are served in strict priority order, i.e., lower
6331 - priority queues are not served as long as there are higher
6332 - priority queues. Among queues in the same class the bandwidth is
6333 - distributed in proportion to the weight of each queue. A very
6334 - thin extra bandwidth is however guaranteed to the Idle class, to
6335 - prevent it from starving.
6336 -
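As a minimal, stand-alone illustration of the ioprio-to-weight relation above
(the helper below is hypothetical and only mirrors the stated formula, assuming
IOPRIO_BE_NR == 8 as in the mainline kernel):

    #include <stdio.h>

    #define IOPRIO_BE_NR 8 /* number of best-effort I/O priority levels */

    /* hypothetical helper mirroring "weight = IOPRIO_BE_NR - ioprio" */
    static int ioprio_to_weight(int ioprio)
    {
            return IOPRIO_BE_NR - ioprio;
    }

    int main(void)
    {
            int ioprio;

            /* ioprio 0 (highest priority) -> weight 8, ioprio 7 -> weight 1 */
            for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                    printf("ioprio %d -> weight %d\n",
                           ioprio, ioprio_to_weight(ioprio));
            return 0;
    }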
6337 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
6338 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6339 ----
6340 - block/bfq-cgroup.c | 911 ++++++++++++++
6341 - block/bfq-ioc.c | 36 +
6342 - block/bfq-iosched.c | 3298 +++++++++++++++++++++++++++++++++++++++++++++++++++
6343 - block/bfq-sched.c | 1078 +++++++++++++++++
6344 - block/bfq.h | 614 ++++++++++
6345 - 5 files changed, 5937 insertions(+)
6346 - create mode 100644 block/bfq-cgroup.c
6347 - create mode 100644 block/bfq-ioc.c
6348 - create mode 100644 block/bfq-iosched.c
6349 - create mode 100644 block/bfq-sched.c
6350 - create mode 100644 block/bfq.h
6351 -
6352 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
6353 -new file mode 100644
6354 -index 0000000..79a288a
6355 ---- /dev/null
6356 -+++ b/block/bfq-cgroup.c
6357 -@@ -0,0 +1,911 @@
6358 -+/*
6359 -+ * BFQ: CGROUPS support.
6360 -+ *
6361 -+ * Based on ideas and code from CFQ:
6362 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6363 -+ *
6364 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6365 -+ * Paolo Valente <paolo.valente@×××××××.it>
6366 -+ *
6367 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6368 -+ *
6369 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
6370 -+ */
6371 -+
6372 -+#ifdef CONFIG_CGROUP_BFQIO
6373 -+
6374 -+static DEFINE_MUTEX(bfqio_mutex);
6375 -+
6376 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
6377 -+{
6378 -+ return bgrp ? !bgrp->online : false;
6379 -+}
6380 -+
6381 -+static struct bfqio_cgroup bfqio_root_cgroup = {
6382 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
6383 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
6384 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
6385 -+};
6386 -+
6387 -+static inline void bfq_init_entity(struct bfq_entity *entity,
6388 -+ struct bfq_group *bfqg)
6389 -+{
6390 -+ entity->weight = entity->new_weight;
6391 -+ entity->orig_weight = entity->new_weight;
6392 -+ entity->ioprio = entity->new_ioprio;
6393 -+ entity->ioprio_class = entity->new_ioprio_class;
6394 -+ entity->parent = bfqg->my_entity;
6395 -+ entity->sched_data = &bfqg->sched_data;
6396 -+}
6397 -+
6398 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
6399 -+{
6400 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
6401 -+}
6402 -+
6403 -+/*
6404 -+ * Search the hash table (for now only a list) of bgrp for the bfq_group
6405 -+ * associated with bfqd. Must be called under rcu_read_lock().
6406 -+ */
6407 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
6408 -+ struct bfq_data *bfqd)
6409 -+{
6410 -+ struct bfq_group *bfqg;
6411 -+ void *key;
6412 -+
6413 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
6414 -+ key = rcu_dereference(bfqg->bfqd);
6415 -+ if (key == bfqd)
6416 -+ return bfqg;
6417 -+ }
6418 -+
6419 -+ return NULL;
6420 -+}
6421 -+
6422 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
6423 -+ struct bfq_group *bfqg)
6424 -+{
6425 -+ struct bfq_entity *entity = &bfqg->entity;
6426 -+
6427 -+ /*
6428 -+ * If the weight of the entity has never been set via the sysfs
6429 -+ * interface, then bgrp->weight == 0. In this case we initialize
6430 -+ * the weight from the current ioprio value. Otherwise, the group
6431 -+ * weight, if set, has priority over the ioprio value.
6432 -+ */
6433 -+ if (bgrp->weight == 0) {
6434 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
6435 -+ entity->new_ioprio = bgrp->ioprio;
6436 -+ } else {
6437 -+ entity->new_weight = bgrp->weight;
6438 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
6439 -+ }
6440 -+ entity->orig_weight = entity->weight = entity->new_weight;
6441 -+ entity->ioprio = entity->new_ioprio;
6442 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
6443 -+ entity->my_sched_data = &bfqg->sched_data;
6444 -+}
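/*
 * Worked example of the rule above, assuming IOPRIO_BE_NR == 8 and that
 * bfq_ioprio_to_weight()/bfq_weight_to_ioprio() implement the linear
 * mapping "weight = IOPRIO_BE_NR - ioprio" described in the changelog:
 *
 *   bgrp->weight == 0, bgrp->ioprio == 4  ->  new_weight = 8 - 4 = 4
 *   bgrp->weight == 7                     ->  new_weight = 7,
 *                                             new_ioprio = 8 - 7 = 1
 *
 * That is, an explicitly configured group weight always overrides the
 * ioprio-derived one.
 */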
6445 -+
6446 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
6447 -+ struct bfq_group *parent)
6448 -+{
6449 -+ struct bfq_entity *entity;
6450 -+
6451 -+ BUG_ON(parent == NULL);
6452 -+ BUG_ON(bfqg == NULL);
6453 -+
6454 -+ entity = &bfqg->entity;
6455 -+ entity->parent = parent->my_entity;
6456 -+ entity->sched_data = &parent->sched_data;
6457 -+}
6458 -+
6459 -+/**
6460 -+ * bfq_group_chain_alloc - allocate a chain of groups.
6461 -+ * @bfqd: queue descriptor.
6462 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
6463 -+ *
6464 -+ * Allocate a chain of groups starting from the one belonging to
6465 -+ * @css up to the root cgroup. Stop if a cgroup on the chain
6466 -+ * to the root already has an allocated group on @bfqd.
6467 -+ */
6468 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
6469 -+ struct cgroup_subsys_state *css)
6470 -+{
6471 -+ struct bfqio_cgroup *bgrp;
6472 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
6473 -+
6474 -+ for (; css != NULL; css = css->parent) {
6475 -+ bgrp = css_to_bfqio(css);
6476 -+
6477 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6478 -+ if (bfqg != NULL) {
6479 -+ /*
6480 -+ * All the cgroups in the path from there to the
6481 -+ * root must have a bfq_group for bfqd, so we don't
6482 -+ * need any more allocations.
6483 -+ */
6484 -+ break;
6485 -+ }
6486 -+
6487 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
6488 -+ if (bfqg == NULL)
6489 -+ goto cleanup;
6490 -+
6491 -+ bfq_group_init_entity(bgrp, bfqg);
6492 -+ bfqg->my_entity = &bfqg->entity;
6493 -+
6494 -+ if (leaf == NULL) {
6495 -+ leaf = bfqg;
6496 -+ prev = leaf;
6497 -+ } else {
6498 -+ bfq_group_set_parent(prev, bfqg);
6499 -+ /*
6500 -+ * Build a list of allocated nodes using the bfqd
6501 -+ * field, which is still unused and will be initialized
6502 -+ * only after the node is connected.
6503 -+ */
6504 -+ prev->bfqd = bfqg;
6505 -+ prev = bfqg;
6506 -+ }
6507 -+ }
6508 -+
6509 -+ return leaf;
6510 -+
6511 -+cleanup:
6512 -+ while (leaf != NULL) {
6513 -+ prev = leaf;
6514 -+ leaf = leaf->bfqd;
6515 -+ kfree(prev);
6516 -+ }
6517 -+
6518 -+ return NULL;
6519 -+}
6520 -+
6521 -+/**
6522 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
6523 -+ * @bfqd: the queue descriptor.
6524 -+ * @css: the leaf cgroup_subsys_state to start from.
6525 -+ * @leaf: the leaf group (to be associated to @cgroup).
6526 -+ *
6527 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
6528 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
6529 -+ * hierarchy that already has a group associated with @bfqd, all the nodes
6530 -+ * in the path to the root cgroup have one too.
6531 -+ *
6532 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
6533 -+ * per device) while the bfqio_cgroup lock protects the list of groups
6534 -+ * belonging to the same cgroup.
6535 -+ */
6536 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
6537 -+ struct cgroup_subsys_state *css,
6538 -+ struct bfq_group *leaf)
6539 -+{
6540 -+ struct bfqio_cgroup *bgrp;
6541 -+ struct bfq_group *bfqg, *next, *prev = NULL;
6542 -+ unsigned long flags;
6543 -+
6544 -+ assert_spin_locked(bfqd->queue->queue_lock);
6545 -+
6546 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
6547 -+ bgrp = css_to_bfqio(css);
6548 -+ next = leaf->bfqd;
6549 -+
6550 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6551 -+ BUG_ON(bfqg != NULL);
6552 -+
6553 -+ spin_lock_irqsave(&bgrp->lock, flags);
6554 -+
6555 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
6556 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
6557 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
6558 -+
6559 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
6560 -+
6561 -+ prev = leaf;
6562 -+ leaf = next;
6563 -+ }
6564 -+
6565 -+ BUG_ON(css == NULL && leaf != NULL);
6566 -+ if (css != NULL && prev != NULL) {
6567 -+ bgrp = css_to_bfqio(css);
6568 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6569 -+ bfq_group_set_parent(prev, bfqg);
6570 -+ }
6571 -+}
6572 -+
6573 -+/**
6574 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
6575 -+ * @bfqd: queue descriptor.
6576 -+ * @cgroup: cgroup being searched for.
6577 -+ *
6578 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
6579 -+ * necessary. When a group is returned all the cgroups in the path
6580 -+ * to the root have a group associated to @bfqd.
6581 -+ *
6582 -+ * If the allocation fails, return the root group: this breaks guarantees
6583 -+ * but is a safe fallback. If this loss becomes a problem it can be
6584 -+ * mitigated using the equivalent weight (given by the product of the
6585 -+ * weights of the groups in the path from @group to the root) in the
6586 -+ * root scheduler.
6587 -+ *
6588 -+ * We allocate all the missing nodes in the path from the leaf cgroup
6589 -+ * to the root and we connect the nodes only after all the allocations
6590 -+ * have been successful.
6591 -+ */
6592 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
6593 -+ struct cgroup_subsys_state *css)
6594 -+{
6595 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
6596 -+ struct bfq_group *bfqg;
6597 -+
6598 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6599 -+ if (bfqg != NULL)
6600 -+ return bfqg;
6601 -+
6602 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
6603 -+ if (bfqg != NULL)
6604 -+ bfq_group_chain_link(bfqd, css, bfqg);
6605 -+ else
6606 -+ bfqg = bfqd->root_group;
6607 -+
6608 -+ return bfqg;
6609 -+}
6610 -+
6611 -+/**
6612 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
6613 -+ * @bfqd: queue descriptor.
6614 -+ * @bfqq: the queue to move.
6615 -+ * @entity: @bfqq's entity.
6616 -+ * @bfqg: the group to move to.
6617 -+ *
6618 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
6619 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
6620 -+ *
6621 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
6622 -+ * not disappear (by now this just means that we are called under
6623 -+ * rcu_read_lock()).
6624 -+ */
6625 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6626 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
6627 -+{
6628 -+ int busy, resume;
6629 -+
6630 -+ busy = bfq_bfqq_busy(bfqq);
6631 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
6632 -+
6633 -+ BUG_ON(resume && !entity->on_st);
6634 -+ BUG_ON(busy && !resume && entity->on_st &&
6635 -+ bfqq != bfqd->in_service_queue);
6636 -+
6637 -+ if (busy) {
6638 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
6639 -+
6640 -+ if (!resume)
6641 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
6642 -+ else
6643 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
6644 -+ } else if (entity->on_st)
6645 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
6646 -+
6647 -+ /*
6648 -+ * Here we use a reference to bfqg. We don't need a refcounter
6649 -+ * as the cgroup reference will not be dropped, so that its
6650 -+ * destroy() callback will not be invoked.
6651 -+ */
6652 -+ entity->parent = bfqg->my_entity;
6653 -+ entity->sched_data = &bfqg->sched_data;
6654 -+
6655 -+ if (busy && resume)
6656 -+ bfq_activate_bfqq(bfqd, bfqq);
6657 -+
6658 -+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
6659 -+ bfq_schedule_dispatch(bfqd);
6660 -+}
6661 -+
6662 -+/**
6663 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
6664 -+ * @bfqd: the queue descriptor.
6665 -+ * @bic: the bic to move.
6666 -+ * @cgroup: the cgroup to move to.
6667 -+ *
6668 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
6669 -+ * has to make sure that the reference to cgroup is valid across the call.
6670 -+ *
6671 -+ * NOTE: an alternative approach might have been to store the current
6672 -+ * cgroup in bfqq and get a reference to it, reducing the lookup
6673 -+ * time here, at the price of slightly more complex code.
6674 -+ */
6675 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
6676 -+ struct bfq_io_cq *bic,
6677 -+ struct cgroup_subsys_state *css)
6678 -+{
6679 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
6680 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
6681 -+ struct bfq_entity *entity;
6682 -+ struct bfq_group *bfqg;
6683 -+ struct bfqio_cgroup *bgrp;
6684 -+
6685 -+ bgrp = css_to_bfqio(css);
6686 -+
6687 -+ bfqg = bfq_find_alloc_group(bfqd, css);
6688 -+ if (async_bfqq != NULL) {
6689 -+ entity = &async_bfqq->entity;
6690 -+
6691 -+ if (entity->sched_data != &bfqg->sched_data) {
6692 -+ bic_set_bfqq(bic, NULL, 0);
6693 -+ bfq_log_bfqq(bfqd, async_bfqq,
6694 -+ "bic_change_group: %p %d",
6695 -+ async_bfqq, atomic_read(&async_bfqq->ref));
6696 -+ bfq_put_queue(async_bfqq);
6697 -+ }
6698 -+ }
6699 -+
6700 -+ if (sync_bfqq != NULL) {
6701 -+ entity = &sync_bfqq->entity;
6702 -+ if (entity->sched_data != &bfqg->sched_data)
6703 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
6704 -+ }
6705 -+
6706 -+ return bfqg;
6707 -+}
6708 -+
6709 -+/**
6710 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
6711 -+ * @bic: the bic being migrated.
6712 -+ * @cgroup: the destination cgroup.
6713 -+ *
6714 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
6715 -+ * moved into its new parent group.
6716 -+ */
6717 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
6718 -+ struct cgroup_subsys_state *css)
6719 -+{
6720 -+ struct bfq_data *bfqd;
6721 -+ unsigned long uninitialized_var(flags);
6722 -+
6723 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
6724 -+ &flags);
6725 -+ if (bfqd != NULL) {
6726 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
6727 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6728 -+ }
6729 -+}
6730 -+
6731 -+/**
6732 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
6733 -+ * @bic: the @bic to update.
6734 -+ *
6735 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
6736 -+ * We need this in addition to moving bics during the cgroup attach
6737 -+ * phase because the task owning @bic could be at its first disk
6738 -+ * access or we may end up in the root cgroup as the result of a
6739 -+ * memory allocation failure and here we try to move to the right
6740 -+ * group.
6741 -+ *
6742 -+ * Must be called under the queue lock. It is safe to use the returned
6743 -+ * value even after the rcu_read_unlock() as the migration/destruction
6744 -+ * paths act under the queue lock too. IOW it is impossible to race with
6745 -+ * group migration/destruction and end up with an invalid group as:
6746 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
6747 -+ * has started execution, as current holds a reference to it,
6748 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
6749 -+ * migrated to a different cgroup] its attach() callback will have
6750 -+ * taken care of removing all the references to the old cgroup data.
6751 -+ */
6752 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
6753 -+{
6754 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
6755 -+ struct bfq_group *bfqg;
6756 -+ struct cgroup_subsys_state *css;
6757 -+
6758 -+ BUG_ON(bfqd == NULL);
6759 -+
6760 -+ rcu_read_lock();
6761 -+ css = task_css(current, bfqio_subsys_id);
6762 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
6763 -+ rcu_read_unlock();
6764 -+
6765 -+ return bfqg;
6766 -+}
6767 -+
6768 -+/**
6769 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
6770 -+ * @st: the service tree being flushed.
6771 -+ */
6772 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
6773 -+{
6774 -+ struct bfq_entity *entity = st->first_idle;
6775 -+
6776 -+ for (; entity != NULL; entity = st->first_idle)
6777 -+ __bfq_deactivate_entity(entity, 0);
6778 -+}
6779 -+
6780 -+/**
6781 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
6782 -+ * @bfqd: the device data structure with the root group.
6783 -+ * @entity: the entity to move.
6784 -+ */
6785 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
6786 -+ struct bfq_entity *entity)
6787 -+{
6788 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
6789 -+
6790 -+ BUG_ON(bfqq == NULL);
6791 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
6792 -+ return;
6793 -+}
6794 -+
6795 -+/**
6796 -+ * bfq_reparent_active_entities - move to the root group all active entities.
6797 -+ * @bfqd: the device data structure with the root group.
6798 -+ * @bfqg: the group to move from.
6799 -+ * @st: the service tree with the entities.
6800 -+ *
6801 -+ * Needs queue_lock to be taken and reference to be valid over the call.
6802 -+ */
6803 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
6804 -+ struct bfq_group *bfqg,
6805 -+ struct bfq_service_tree *st)
6806 -+{
6807 -+ struct rb_root *active = &st->active;
6808 -+ struct bfq_entity *entity = NULL;
6809 -+
6810 -+ if (!RB_EMPTY_ROOT(&st->active))
6811 -+ entity = bfq_entity_of(rb_first(active));
6812 -+
6813 -+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
6814 -+ bfq_reparent_leaf_entity(bfqd, entity);
6815 -+
6816 -+ if (bfqg->sched_data.in_service_entity != NULL)
6817 -+ bfq_reparent_leaf_entity(bfqd,
6818 -+ bfqg->sched_data.in_service_entity);
6819 -+
6820 -+ return;
6821 -+}
6822 -+
6823 -+/**
6824 -+ * bfq_destroy_group - destroy @bfqg.
6825 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
6826 -+ * @bfqg: the group being destroyed.
6827 -+ *
6828 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
6829 -+ */
6830 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
6831 -+{
6832 -+ struct bfq_data *bfqd;
6833 -+ struct bfq_service_tree *st;
6834 -+ struct bfq_entity *entity = bfqg->my_entity;
6835 -+ unsigned long uninitialized_var(flags);
6836 -+ int i;
6837 -+
6838 -+ hlist_del(&bfqg->group_node);
6839 -+
6840 -+ /*
6841 -+ * Empty all service_trees belonging to this group before deactivating
6842 -+ * the group itself.
6843 -+ */
6844 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
6845 -+ st = bfqg->sched_data.service_tree + i;
6846 -+
6847 -+ /*
6848 -+ * The idle tree may still contain bfq_queues belonging
6849 -+ * to exited tasks because they never migrated to a different
6850 -+ * cgroup from the one being destroyed now. No one else
6851 -+ * can access them so it's safe to act without any lock.
6852 -+ */
6853 -+ bfq_flush_idle_tree(st);
6854 -+
6855 -+ /*
6856 -+ * It may happen that some queues are still active
6857 -+ * (busy) upon group destruction (if the corresponding
6858 -+ * processes have been forced to terminate). We move
6859 -+ * all the leaf entities corresponding to these queues
6860 -+ * to the root_group.
6861 -+ * Also, it may happen that the group has an entity
6862 -+ * under service, which is disconnected from the active
6863 -+ * tree: it must be moved, too.
6864 -+ * There is no need to put the sync queues, as the
6865 -+ * scheduler has taken no reference.
6866 -+ */
6867 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6868 -+ if (bfqd != NULL) {
6869 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
6870 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6871 -+ }
6872 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
6873 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
6874 -+ }
6875 -+ BUG_ON(bfqg->sched_data.next_in_service != NULL);
6876 -+ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
6877 -+
6878 -+ /*
6879 -+ * We may race with device destruction, take extra care when
6880 -+ * dereferencing bfqg->bfqd.
6881 -+ */
6882 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6883 -+ if (bfqd != NULL) {
6884 -+ hlist_del(&bfqg->bfqd_node);
6885 -+ __bfq_deactivate_entity(entity, 0);
6886 -+ bfq_put_async_queues(bfqd, bfqg);
6887 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6888 -+ }
6889 -+ BUG_ON(entity->tree != NULL);
6890 -+
6891 -+ /*
6892 -+ * No need to defer the kfree() to the end of the RCU grace
6893 -+ * period: we are called from the destroy() callback of our
6894 -+ * cgroup, so we can be sure that no one is a) still using
6895 -+ * this cgroup or b) doing lookups in it.
6896 -+ */
6897 -+ kfree(bfqg);
6898 -+}
6899 -+
6900 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
6901 -+{
6902 -+ struct hlist_node *tmp;
6903 -+ struct bfq_group *bfqg;
6904 -+
6905 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
6906 -+ bfq_end_raising_async_queues(bfqd, bfqg);
6907 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
6908 -+}
6909 -+
6910 -+/**
6911 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
6912 -+ * @bfqd: the device descriptor being exited.
6913 -+ *
6914 -+ * When the device exits we just make sure that no lookup can return
6915 -+ * the now unused group structures. They will be deallocated on cgroup
6916 -+ * destruction.
6917 -+ */
6918 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
6919 -+{
6920 -+ struct hlist_node *tmp;
6921 -+ struct bfq_group *bfqg;
6922 -+
6923 -+ bfq_log(bfqd, "disconnect_groups beginning");
6924 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
6925 -+ hlist_del(&bfqg->bfqd_node);
6926 -+
6927 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
6928 -+
6929 -+ /*
6930 -+ * Don't remove from the group hash, just set an
6931 -+ * invalid key. No lookups can race with the
6932 -+ * assignment as bfqd is being destroyed; this
6933 -+ * implies also that new elements cannot be added
6934 -+ * to the list.
6935 -+ */
6936 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
6937 -+
6938 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
6939 -+ bfqg);
6940 -+ bfq_put_async_queues(bfqd, bfqg);
6941 -+ }
6942 -+}
6943 -+
6944 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
6945 -+{
6946 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
6947 -+ struct bfq_group *bfqg = bfqd->root_group;
6948 -+
6949 -+ bfq_put_async_queues(bfqd, bfqg);
6950 -+
6951 -+ spin_lock_irq(&bgrp->lock);
6952 -+ hlist_del_rcu(&bfqg->group_node);
6953 -+ spin_unlock_irq(&bgrp->lock);
6954 -+
6955 -+ /*
6956 -+ * No need to synchronize_rcu() here: since the device is gone
6957 -+ * there cannot be any read-side access to its root_group.
6958 -+ */
6959 -+ kfree(bfqg);
6960 -+}
6961 -+
6962 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
6963 -+{
6964 -+ struct bfq_group *bfqg;
6965 -+ struct bfqio_cgroup *bgrp;
6966 -+ int i;
6967 -+
6968 -+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
6969 -+ if (bfqg == NULL)
6970 -+ return NULL;
6971 -+
6972 -+ bfqg->entity.parent = NULL;
6973 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6974 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6975 -+
6976 -+ bgrp = &bfqio_root_cgroup;
6977 -+ spin_lock_irq(&bgrp->lock);
6978 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
6979 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
6980 -+ spin_unlock_irq(&bgrp->lock);
6981 -+
6982 -+ return bfqg;
6983 -+}
6984 -+
6985 -+#define SHOW_FUNCTION(__VAR) \
6986 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
6987 -+ struct cftype *cftype) \
6988 -+{ \
6989 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
6990 -+ u64 ret = -ENODEV; \
6991 -+ \
6992 -+ mutex_lock(&bfqio_mutex); \
6993 -+ if (bfqio_is_removed(bgrp)) \
6994 -+ goto out_unlock; \
6995 -+ \
6996 -+ spin_lock_irq(&bgrp->lock); \
6997 -+ ret = bgrp->__VAR; \
6998 -+ spin_unlock_irq(&bgrp->lock); \
6999 -+ \
7000 -+out_unlock: \
7001 -+ mutex_unlock(&bfqio_mutex); \
7002 -+ return ret; \
7003 -+}
7004 -+
7005 -+SHOW_FUNCTION(weight);
7006 -+SHOW_FUNCTION(ioprio);
7007 -+SHOW_FUNCTION(ioprio_class);
7008 -+#undef SHOW_FUNCTION
7009 -+
7010 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
7011 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
7012 -+ struct cftype *cftype, \
7013 -+ u64 val) \
7014 -+{ \
7015 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
7016 -+ struct bfq_group *bfqg; \
7017 -+ int ret = -EINVAL; \
7018 -+ \
7019 -+ if (val < (__MIN) || val > (__MAX)) \
7020 -+ return ret; \
7021 -+ \
7022 -+ ret = -ENODEV; \
7023 -+ mutex_lock(&bfqio_mutex); \
7024 -+ if (bfqio_is_removed(bgrp)) \
7025 -+ goto out_unlock; \
7026 -+ ret = 0; \
7027 -+ \
7028 -+ spin_lock_irq(&bgrp->lock); \
7029 -+ bgrp->__VAR = (unsigned short)val; \
7030 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
7031 -+ /* \
7032 -+ * Setting the ioprio_changed flag of the entity \
7033 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
7034 -+ * the value of the weight to its ioprio mapping. \
7035 -+ * Set the flag only if necessary. \
7036 -+ */ \
7037 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
7038 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
7039 -+ smp_wmb(); \
7040 -+ bfqg->entity.ioprio_changed = 1; \
7041 -+ } \
7042 -+ } \
7043 -+ spin_unlock_irq(&bgrp->lock); \
7044 -+ \
7045 -+out_unlock: \
7046 -+ mutex_unlock(&bfqio_mutex); \
7047 -+ return ret; \
7048 -+}
7049 -+
7050 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
7051 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
7052 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
7053 -+#undef STORE_FUNCTION
7054 -+
7055 -+static struct cftype bfqio_files[] = {
7056 -+ {
7057 -+ .name = "weight",
7058 -+ .read_u64 = bfqio_cgroup_weight_read,
7059 -+ .write_u64 = bfqio_cgroup_weight_write,
7060 -+ },
7061 -+ {
7062 -+ .name = "ioprio",
7063 -+ .read_u64 = bfqio_cgroup_ioprio_read,
7064 -+ .write_u64 = bfqio_cgroup_ioprio_write,
7065 -+ },
7066 -+ {
7067 -+ .name = "ioprio_class",
7068 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
7069 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
7070 -+ },
7071 -+ { }, /* terminate */
7072 -+};
7073 -+
7074 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
7075 -+ *parent_css)
7076 -+{
7077 -+ struct bfqio_cgroup *bgrp;
7078 -+
7079 -+ if (parent_css != NULL) {
7080 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
7081 -+ if (bgrp == NULL)
7082 -+ return ERR_PTR(-ENOMEM);
7083 -+ } else
7084 -+ bgrp = &bfqio_root_cgroup;
7085 -+
7086 -+ spin_lock_init(&bgrp->lock);
7087 -+ INIT_HLIST_HEAD(&bgrp->group_data);
7088 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
7089 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
7090 -+
7091 -+ return &bgrp->css;
7092 -+}
7093 -+
7094 -+/*
7095 -+ * We cannot support shared io contexts, as we have no means to support
7096 -+ * two tasks with the same ioc in two different groups without major rework
7097 -+ * of the main bic/bfqq data structures. By now we allow a task to change
7098 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
7099 -+ * behavior is that a group containing a task that forked using CLONE_IO
7100 -+ * will not be destroyed until the tasks sharing the ioc die.
7101 -+ */
7102 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
7103 -+ struct cgroup_taskset *tset)
7104 -+{
7105 -+ struct task_struct *task;
7106 -+ struct io_context *ioc;
7107 -+ int ret = 0;
7108 -+
7109 -+ cgroup_taskset_for_each(task, css, tset) {
7110 -+ /*
7111 -+ * task_lock() is needed to avoid races with
7112 -+ * exit_io_context()
7113 -+ */
7114 -+ task_lock(task);
7115 -+ ioc = task->io_context;
7116 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
7117 -+ /*
7118 -+ * ioc == NULL means that the task is either too young
7119 -+ * or exiting: if it still has no ioc the ioc can't be
7120 -+ * shared, if the task is exiting the attach will fail
7121 -+ * anyway, no matter what we return here.
7122 -+ */
7123 -+ ret = -EINVAL;
7124 -+ task_unlock(task);
7125 -+ if (ret)
7126 -+ break;
7127 -+ }
7128 -+
7129 -+ return ret;
7130 -+}
7131 -+
7132 -+static void bfqio_attach(struct cgroup_subsys_state *css,
7133 -+ struct cgroup_taskset *tset)
7134 -+{
7135 -+ struct task_struct *task;
7136 -+ struct io_context *ioc;
7137 -+ struct io_cq *icq;
7138 -+
7139 -+ /*
7140 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
7141 -+ * new group has not yet been tested.
7142 -+ */
7143 -+ cgroup_taskset_for_each(task, css, tset) {
7144 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
7145 -+ if (ioc) {
7146 -+ /*
7147 -+ * Handle cgroup change here.
7148 -+ */
7149 -+ rcu_read_lock();
7150 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
7151 -+ if (!strncmp(
7152 -+ icq->q->elevator->type->elevator_name,
7153 -+ "bfq", ELV_NAME_MAX))
7154 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
7155 -+ css);
7156 -+ rcu_read_unlock();
7157 -+ put_io_context(ioc);
7158 -+ }
7159 -+ }
7160 -+}
7161 -+
7162 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
7163 -+{
7164 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7165 -+ struct hlist_node *tmp;
7166 -+ struct bfq_group *bfqg;
7167 -+
7168 -+ /*
7169 -+ * Since we are destroying the cgroup, there are no more tasks
7170 -+ * referencing it, and all the RCU grace periods that may have
7171 -+ * referenced it are ended (as the destruction of the parent
7172 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
7173 -+ * anything else and we don't need any synchronization.
7174 -+ */
7175 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
7176 -+ bfq_destroy_group(bgrp, bfqg);
7177 -+
7178 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
7179 -+
7180 -+ kfree(bgrp);
7181 -+}
7182 -+
7183 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
7184 -+{
7185 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7186 -+
7187 -+ mutex_lock(&bfqio_mutex);
7188 -+ bgrp->online = true;
7189 -+ mutex_unlock(&bfqio_mutex);
7190 -+
7191 -+ return 0;
7192 -+}
7193 -+
7194 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
7195 -+{
7196 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7197 -+
7198 -+ mutex_lock(&bfqio_mutex);
7199 -+ bgrp->online = false;
7200 -+ mutex_unlock(&bfqio_mutex);
7201 -+}
7202 -+
7203 -+struct cgroup_subsys bfqio_subsys = {
7204 -+ .name = "bfqio",
7205 -+ .css_alloc = bfqio_create,
7206 -+ .css_online = bfqio_css_online,
7207 -+ .css_offline = bfqio_css_offline,
7208 -+ .can_attach = bfqio_can_attach,
7209 -+ .attach = bfqio_attach,
7210 -+ .css_free = bfqio_destroy,
7211 -+ .subsys_id = bfqio_subsys_id,
7212 -+ .base_cftypes = bfqio_files,
7213 -+};
7214 -+#else
7215 -+static inline void bfq_init_entity(struct bfq_entity *entity,
7216 -+ struct bfq_group *bfqg)
7217 -+{
7218 -+ entity->weight = entity->new_weight;
7219 -+ entity->orig_weight = entity->new_weight;
7220 -+ entity->ioprio = entity->new_ioprio;
7221 -+ entity->ioprio_class = entity->new_ioprio_class;
7222 -+ entity->sched_data = &bfqg->sched_data;
7223 -+}
7224 -+
7225 -+static inline struct bfq_group *
7226 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7227 -+{
7228 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
7229 -+ return bfqd->root_group;
7230 -+}
7231 -+
7232 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
7233 -+ struct bfq_queue *bfqq,
7234 -+ struct bfq_entity *entity,
7235 -+ struct bfq_group *bfqg)
7236 -+{
7237 -+}
7238 -+
7239 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
7240 -+{
7241 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
7242 -+}
7243 -+
7244 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
7245 -+{
7246 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
7247 -+}
7248 -+
7249 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
7250 -+{
7251 -+ kfree(bfqd->root_group);
7252 -+}
7253 -+
7254 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7255 -+{
7256 -+ struct bfq_group *bfqg;
7257 -+ int i;
7258 -+
7259 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7260 -+ if (bfqg == NULL)
7261 -+ return NULL;
7262 -+
7263 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7264 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7265 -+
7266 -+ return bfqg;
7267 -+}
7268 -+#endif
7269 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
7270 -new file mode 100644
7271 -index 0000000..7f6b000
7272 ---- /dev/null
7273 -+++ b/block/bfq-ioc.c
7274 -@@ -0,0 +1,36 @@
7275 -+/*
7276 -+ * BFQ: I/O context handling.
7277 -+ *
7278 -+ * Based on ideas and code from CFQ:
7279 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7280 -+ *
7281 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7282 -+ * Paolo Valente <paolo.valente@×××××××.it>
7283 -+ *
7284 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7285 -+ */
7286 -+
7287 -+/**
7288 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
7289 -+ * @icq: the iocontext queue.
7290 -+ */
7291 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
7292 -+{
7293 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
7294 -+ return container_of(icq, struct bfq_io_cq, icq);
7295 -+}
7296 -+
7297 -+/**
7298 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
7299 -+ * @bfqd: the lookup key.
7300 -+ * @ioc: the io_context of the process doing I/O.
7301 -+ *
7302 -+ * Queue lock must be held.
7303 -+ */
7304 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
7305 -+ struct io_context *ioc)
7306 -+{
7307 -+ if (ioc)
7308 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
7309 -+ return NULL;
7310 -+}
7311 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7312 -new file mode 100644
7313 -index 0000000..eb760de
7314 ---- /dev/null
7315 -+++ b/block/bfq-iosched.c
7316 -@@ -0,0 +1,3298 @@
7317 -+/*
7318 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
7319 -+ *
7320 -+ * Based on ideas and code from CFQ:
7321 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7322 -+ *
7323 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7324 -+ * Paolo Valente <paolo.valente@×××××××.it>
7325 -+ *
7326 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7327 -+ *
7328 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7329 -+ *
7330 -+ * BFQ is a proportional share disk scheduling algorithm based on the
7331 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
7332 -+ * number of sectors, to tasks instead of time slices. The disk is not granted
7333 -+ * to the in-service task for a given time slice, but until it has exhausted
7334 -+ * its assigned budget. This change from the time to the service domain allows
7335 -+ * BFQ to distribute the disk bandwidth among tasks as desired, without any
7336 -+ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
7337 -+ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
7338 -+ * their budgets (more precisely BFQ schedules queues associated to tasks).
7339 -+ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
7340 -+ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
7341 -+ * latencies to interactive and soft real-time applications.
7342 -+ *
7343 -+ * BFQ is described in [1], where a reference to the initial, more
7344 -+ * theoretical paper on BFQ can also be found. The interested reader can find in
7345 -+ * the latter paper full details on the main algorithm as well as formulas of
7346 -+ * the guarantees, plus formal proofs of all the properties. With respect to
7347 -+ * the version of BFQ presented in these papers, this implementation adds a
7348 -+ * few more heuristics, such as the one that guarantees a low latency to soft
7349 -+ * real-time applications, and a hierarchical extension based on H-WF2Q+.
7350 -+ *
7351 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
7352 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
7353 -+ * complexity derives from the one introduced with EEVDF in [3].
7354 -+ *
7355 -+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
7356 -+ * with the BFQ Disk I/O Scheduler'',
7357 -+ * Proceedings of the 5th Annual International Systems and Storage
7358 -+ * Conference (SYSTOR '12), June 2012.
7359 -+ *
7360 -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
7361 -+ *
7362 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
7363 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
7364 -+ * Oct 1997.
7365 -+ *
7366 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
7367 -+ *
7368 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
7369 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
7370 -+ * Resource Allocation,'' technical report.
7371 -+ *
7372 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
7373 -+ */
7374 -+#include <linux/module.h>
7375 -+#include <linux/slab.h>
7376 -+#include <linux/blkdev.h>
7377 -+#include <linux/cgroup.h>
7378 -+#include <linux/elevator.h>
7379 -+#include <linux/jiffies.h>
7380 -+#include <linux/rbtree.h>
7381 -+#include <linux/ioprio.h>
7382 -+#include "bfq.h"
7383 -+#include "blk.h"
7384 -+
7385 -+/* Max number of dispatches in one round of service. */
7386 -+static const int bfq_quantum = 4;
7387 -+
7388 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
7389 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
7390 -+
7391 -+/* Maximum backwards seek, in KiB. */
7392 -+static const int bfq_back_max = 16 * 1024;
7393 -+
7394 -+/* Penalty of a backwards seek, in number of sectors. */
7395 -+static const int bfq_back_penalty = 2;
7396 -+
7397 -+/* Idling period duration, in jiffies. */
7398 -+static int bfq_slice_idle = HZ / 125;
7399 -+
7400 -+/* Default maximum budget values, in sectors and number of requests. */
7401 -+static const int bfq_default_max_budget = 16 * 1024;
7402 -+static const int bfq_max_budget_async_rq = 4;
7403 -+
7404 -+/*
7405 -+ * Async to sync throughput distribution is controlled as follows:
7406 -+ * when an async request is served, the entity is charged the number
7407 -+ * of sectors of the request, multiplied by the factor below
7408 -+ */
7409 -+static const int bfq_async_charge_factor = 10;
7410 -+
7411 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
7412 -+static const int bfq_timeout_sync = HZ / 8;
7413 -+static int bfq_timeout_async = HZ / 25;
7414 -+
7415 -+struct kmem_cache *bfq_pool;
7416 -+
7417 -+/* Below this threshold (in ms), we consider thinktime immediate. */
7418 -+#define BFQ_MIN_TT 2
7419 -+
7420 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
7421 -+#define BFQ_HW_QUEUE_THRESHOLD 4
7422 -+#define BFQ_HW_QUEUE_SAMPLES 32
7423 -+
7424 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
7425 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
7426 -+
7427 -+/* Min samples used for peak rate estimation (for autotuning). */
7428 -+#define BFQ_PEAK_RATE_SAMPLES 32
7429 -+
7430 -+/* Shift used for peak rate fixed precision calculations. */
7431 -+#define BFQ_RATE_SHIFT 16
7432 -+
7433 -+/*
7434 -+ * The duration of the weight raising for interactive applications is
7435 -+ * computed automatically (as default behaviour), using the following
7436 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
7437 -+ * disk, and R and T are two reference parameters. In particular, R is
7438 -+ * the peak rate of a reference disk, and T is about the maximum time
7439 -+ * for starting popular large applications on that disk, under BFQ and
7440 -+ * while reading two files in parallel. Finally, BFQ uses two
7441 -+ * different pairs (R, T) depending on whether the disk is rotational
7442 -+ * or non-rotational.
7443 -+ */
7444 -+#define T_rot (msecs_to_jiffies(5500))
7445 -+#define T_nonrot (msecs_to_jiffies(2000))
7446 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
7447 -+#define R_rot 17415
7448 -+#define R_nonrot 34791
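/*
 * Worked example of the duration formula above, using the reference
 * parameters defined here: on a rotational disk whose estimated peak rate
 * r equals R_rot, the weight-raising duration is (R_rot / r) * T_rot,
 * i.e. the full 5500 ms; on a rotational disk estimated to be twice as
 * fast (r = 2 * R_rot) it shrinks to about 2750 ms.
 */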
7449 -+
7450 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
7451 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
7452 -+
7453 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
7454 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
7455 -+
7456 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
7457 -+
7458 -+#include "bfq-ioc.c"
7459 -+#include "bfq-sched.c"
7460 -+#include "bfq-cgroup.c"
7461 -+
7462 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
7463 -+ IOPRIO_CLASS_IDLE)
7464 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
7465 -+ IOPRIO_CLASS_RT)
7466 -+
7467 -+#define bfq_sample_valid(samples) ((samples) > 80)
7468 -+
7469 -+/*
7470 -+ * We regard a request as SYNC if it is either a read or has the SYNC bit
7471 -+ * set (in which case it could also be a direct WRITE).
7472 -+ */
7473 -+static inline int bfq_bio_sync(struct bio *bio)
7474 -+{
7475 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
7476 -+ return 1;
7477 -+
7478 -+ return 0;
7479 -+}
7480 -+
7481 -+/*
7482 -+ * Schedule a run of the queue if there are pending requests and nothing in
7483 -+ * the driver that will restart queueing.
7484 -+ */
7485 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
7486 -+{
7487 -+ if (bfqd->queued != 0) {
7488 -+ bfq_log(bfqd, "schedule dispatch");
7489 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
7490 -+ }
7491 -+}
7492 -+
7493 -+/*
7494 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
7495 -+ * We choose the request that is closest to the head right now. Distance
7496 -+ * behind the head is penalized and only allowed to a certain extent.
7497 -+ */
7498 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
7499 -+ struct request *rq1,
7500 -+ struct request *rq2,
7501 -+ sector_t last)
7502 -+{
7503 -+ sector_t s1, s2, d1 = 0, d2 = 0;
7504 -+ unsigned long back_max;
7505 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
7506 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
7507 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
7508 -+
7509 -+ if (rq1 == NULL || rq1 == rq2)
7510 -+ return rq2;
7511 -+ if (rq2 == NULL)
7512 -+ return rq1;
7513 -+
7514 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
7515 -+ return rq1;
7516 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
7517 -+ return rq2;
7518 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
7519 -+ return rq1;
7520 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
7521 -+ return rq2;
7522 -+
7523 -+ s1 = blk_rq_pos(rq1);
7524 -+ s2 = blk_rq_pos(rq2);
7525 -+
7526 -+ /*
7527 -+ * By definition, 1KiB is 2 sectors.
7528 -+ */
7529 -+ back_max = bfqd->bfq_back_max * 2;
7530 -+
7531 -+ /*
7532 -+ * Strict one way elevator _except_ in the case where we allow
7533 -+ * short backward seeks which are biased as twice the cost of a
7534 -+ * similar forward seek.
7535 -+ */
7536 -+ if (s1 >= last)
7537 -+ d1 = s1 - last;
7538 -+ else if (s1 + back_max >= last)
7539 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
7540 -+ else
7541 -+ wrap |= BFQ_RQ1_WRAP;
7542 -+
7543 -+ if (s2 >= last)
7544 -+ d2 = s2 - last;
7545 -+ else if (s2 + back_max >= last)
7546 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
7547 -+ else
7548 -+ wrap |= BFQ_RQ2_WRAP;
7549 -+
7550 -+ /* Found required data */
7551 -+
7552 -+ /*
7553 -+ * By doing switch() on the bit mask "wrap" we avoid having to
7554 -+ * check two variables for all permutations: --> faster!
7555 -+ */
7556 -+ switch (wrap) {
7557 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
7558 -+ if (d1 < d2)
7559 -+ return rq1;
7560 -+ else if (d2 < d1)
7561 -+ return rq2;
7562 -+ else {
7563 -+ if (s1 >= s2)
7564 -+ return rq1;
7565 -+ else
7566 -+ return rq2;
7567 -+ }
7568 -+
7569 -+ case BFQ_RQ2_WRAP:
7570 -+ return rq1;
7571 -+ case BFQ_RQ1_WRAP:
7572 -+ return rq2;
7573 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
7574 -+ default:
7575 -+ /*
7576 -+ * Since both rqs are wrapped,
7577 -+ * start with the one that's further behind head
7578 -+ * (--> only *one* back seek required),
7579 -+ * since back seek takes more time than forward.
7580 -+ */
7581 -+ if (s1 <= s2)
7582 -+ return rq1;
7583 -+ else
7584 -+ return rq2;
7585 -+ }
7586 -+}
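/*
 * Worked example of the distance computation above, with the default
 * values defined earlier (bfq_back_max = 16384 KiB, so back_max = 32768
 * sectors, and bfq_back_penalty = 2): with the head at sector 1000, a
 * request at sector 1300 gets d = 300, a request at sector 700 (a short
 * backward seek) gets d = (1000 - 700) * 2 = 600, and a request more
 * than back_max sectors behind the head is marked as wrapped and is only
 * chosen if the alternative is wrapped as well.
 */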
7587 -+
7588 -+static struct bfq_queue *
7589 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
7590 -+ sector_t sector, struct rb_node **ret_parent,
7591 -+ struct rb_node ***rb_link)
7592 -+{
7593 -+ struct rb_node **p, *parent;
7594 -+ struct bfq_queue *bfqq = NULL;
7595 -+
7596 -+ parent = NULL;
7597 -+ p = &root->rb_node;
7598 -+ while (*p) {
7599 -+ struct rb_node **n;
7600 -+
7601 -+ parent = *p;
7602 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7603 -+
7604 -+ /*
7605 -+ * Sort strictly based on sector. Smallest to the left,
7606 -+ * largest to the right.
7607 -+ */
7608 -+ if (sector > blk_rq_pos(bfqq->next_rq))
7609 -+ n = &(*p)->rb_right;
7610 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
7611 -+ n = &(*p)->rb_left;
7612 -+ else
7613 -+ break;
7614 -+ p = n;
7615 -+ bfqq = NULL;
7616 -+ }
7617 -+
7618 -+ *ret_parent = parent;
7619 -+ if (rb_link)
7620 -+ *rb_link = p;
7621 -+
7622 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
7623 -+ (long long unsigned)sector,
7624 -+ bfqq != NULL ? bfqq->pid : 0);
7625 -+
7626 -+ return bfqq;
7627 -+}
7628 -+
7629 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7630 -+{
7631 -+ struct rb_node **p, *parent;
7632 -+ struct bfq_queue *__bfqq;
7633 -+
7634 -+ if (bfqq->pos_root != NULL) {
7635 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7636 -+ bfqq->pos_root = NULL;
7637 -+ }
7638 -+
7639 -+ if (bfq_class_idle(bfqq))
7640 -+ return;
7641 -+ if (!bfqq->next_rq)
7642 -+ return;
7643 -+
7644 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
7645 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
7646 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
7647 -+ if (__bfqq == NULL) {
7648 -+ rb_link_node(&bfqq->pos_node, parent, p);
7649 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
7650 -+ } else
7651 -+ bfqq->pos_root = NULL;
7652 -+}
7653 -+
7654 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
7655 -+ struct bfq_queue *bfqq,
7656 -+ struct request *last)
7657 -+{
7658 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
7659 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
7660 -+ struct request *next = NULL, *prev = NULL;
7661 -+
7662 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
7663 -+
7664 -+ if (rbprev != NULL)
7665 -+ prev = rb_entry_rq(rbprev);
7666 -+
7667 -+ if (rbnext != NULL)
7668 -+ next = rb_entry_rq(rbnext);
7669 -+ else {
7670 -+ rbnext = rb_first(&bfqq->sort_list);
7671 -+ if (rbnext && rbnext != &last->rb_node)
7672 -+ next = rb_entry_rq(rbnext);
7673 -+ }
7674 -+
7675 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
7676 -+}
7677 -+
7678 -+static void bfq_del_rq_rb(struct request *rq)
7679 -+{
7680 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7681 -+ struct bfq_data *bfqd = bfqq->bfqd;
7682 -+ const int sync = rq_is_sync(rq);
7683 -+
7684 -+ BUG_ON(bfqq->queued[sync] == 0);
7685 -+ bfqq->queued[sync]--;
7686 -+ bfqd->queued--;
7687 -+
7688 -+ elv_rb_del(&bfqq->sort_list, rq);
7689 -+
7690 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
7691 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
7692 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
7693 -+ /*
7694 -+ * Remove queue from request-position tree as it is empty.
7695 -+ */
7696 -+ if (bfqq->pos_root != NULL) {
7697 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7698 -+ bfqq->pos_root = NULL;
7699 -+ }
7700 -+ }
7701 -+}
7702 -+
7703 -+/* see the definition of bfq_async_charge_factor for details */
7704 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
7705 -+ struct bfq_queue *bfqq)
7706 -+{
7707 -+ return blk_rq_sectors(rq) *
7708 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
7709 -+ bfq_async_charge_factor));
7710 -+}
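/*
 * Worked example of the charge above, with the bfq_async_charge_factor
 * of 10 defined earlier: a 64-sector sync request is charged 64 sectors;
 * a 64-sector async request from a non-weight-raised queue
 * (raising_coeff == 1) is charged 64 * (1 + 10) = 704 sectors; an async
 * request from a weight-raised queue is charged only its 64 sectors.
 */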
7711 -+
7712 -+/**
7713 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
7714 -+ * @bfqd: the device data the queue belongs to.
7715 -+ * @bfqq: the queue to update.
7716 -+ *
7717 -+ * If the first request of a queue changes we make sure that the queue
7718 -+ * has enough budget to serve at least its first request (if the
7719 -+ * request has grown). We do this because if the queue does not have enough
7720 -+ * budget for its first request, it has to go through two dispatch
7721 -+ * rounds to actually get it dispatched.
7722 -+ */
7723 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
7724 -+ struct bfq_queue *bfqq)
7725 -+{
7726 -+ struct bfq_entity *entity = &bfqq->entity;
7727 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
7728 -+ struct request *next_rq = bfqq->next_rq;
7729 -+ unsigned long new_budget;
7730 -+
7731 -+ if (next_rq == NULL)
7732 -+ return;
7733 -+
7734 -+ if (bfqq == bfqd->in_service_queue)
7735 -+ /*
7736 -+ * In order not to break guarantees, budgets cannot be
7737 -+ * changed after an entity has been selected.
7738 -+ */
7739 -+ return;
7740 -+
7741 -+ BUG_ON(entity->tree != &st->active);
7742 -+ BUG_ON(entity == entity->sched_data->in_service_entity);
7743 -+
7744 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
7745 -+ bfq_serv_to_charge(next_rq, bfqq));
7746 -+ entity->budget = new_budget;
7747 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
7748 -+ bfq_activate_bfqq(bfqd, bfqq);
7749 -+}
7750 -+
7751 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
7752 -+{
7753 -+ u64 dur;
7754 -+
7755 -+ if (bfqd->bfq_raising_max_time > 0)
7756 -+ return bfqd->bfq_raising_max_time;
7757 -+
7758 -+ dur = bfqd->RT_prod;
7759 -+ do_div(dur, bfqd->peak_rate);
7760 -+
7761 -+ return dur;
7762 -+}
7763 -+
7764 -+static void bfq_add_rq_rb(struct request *rq)
7765 -+{
7766 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7767 -+ struct bfq_entity *entity = &bfqq->entity;
7768 -+ struct bfq_data *bfqd = bfqq->bfqd;
7769 -+ struct request *next_rq, *prev;
7770 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
7771 -+ int idle_for_long_time = 0;
7772 -+
7773 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
7774 -+ bfqq->queued[rq_is_sync(rq)]++;
7775 -+ bfqd->queued++;
7776 -+
7777 -+ elv_rb_add(&bfqq->sort_list, rq);
7778 -+
7779 -+ /*
7780 -+ * Check if this request is a better next-serve candidate.
7781 -+ */
7782 -+ prev = bfqq->next_rq;
7783 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
7784 -+ BUG_ON(next_rq == NULL);
7785 -+ bfqq->next_rq = next_rq;
7786 -+
7787 -+ /*
7788 -+ * Adjust priority tree position, if next_rq changes.
7789 -+ */
7790 -+ if (prev != bfqq->next_rq)
7791 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
7792 -+
7793 -+ if (!bfq_bfqq_busy(bfqq)) {
7794 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
7795 -+ time_is_before_jiffies(bfqq->soft_rt_next_start);
7796 -+ idle_for_long_time = time_is_before_jiffies(
7797 -+ bfqq->budget_timeout +
7798 -+ bfqd->bfq_raising_min_idle_time);
7799 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
7800 -+ bfq_serv_to_charge(next_rq, bfqq));
7801 -+
7802 -+ if (!bfqd->low_latency)
7803 -+ goto add_bfqq_busy;
7804 -+
7805 -+ /*
7806 -+ * If the queue is not being boosted and has been idle
7807 -+ * for enough time, start a weight-raising period
7808 -+ */
7809 -+ if (old_raising_coeff == 1 &&
7810 -+ (idle_for_long_time || soft_rt)) {
7811 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7812 -+ if (idle_for_long_time)
7813 -+ bfqq->raising_cur_max_time =
7814 -+ bfq_wrais_duration(bfqd);
7815 -+ else
7816 -+ bfqq->raising_cur_max_time =
7817 -+ bfqd->bfq_raising_rt_max_time;
7818 -+ bfq_log_bfqq(bfqd, bfqq,
7819 -+ "wrais starting at %lu, "
7820 -+ "rais_max_time %u",
7821 -+ jiffies,
7822 -+ jiffies_to_msecs(bfqq->
7823 -+ raising_cur_max_time));
7824 -+ } else if (old_raising_coeff > 1) {
7825 -+ if (idle_for_long_time)
7826 -+ bfqq->raising_cur_max_time =
7827 -+ bfq_wrais_duration(bfqd);
7828 -+ else if (bfqq->raising_cur_max_time ==
7829 -+ bfqd->bfq_raising_rt_max_time &&
7830 -+ !soft_rt) {
7831 -+ bfqq->raising_coeff = 1;
7832 -+ bfq_log_bfqq(bfqd, bfqq,
7833 -+ "wrais ending at %lu, "
7834 -+ "rais_max_time %u",
7835 -+ jiffies,
7836 -+ jiffies_to_msecs(bfqq->
7837 -+ raising_cur_max_time));
7838 -+ } else if (time_before(
7839 -+ bfqq->last_rais_start_finish +
7840 -+ bfqq->raising_cur_max_time,
7841 -+ jiffies +
7842 -+ bfqd->bfq_raising_rt_max_time) &&
7843 -+ soft_rt) {
7844 -+ /*
7845 -+ *
7846 -+ * The remaining weight-raising time is lower
7847 -+ * than bfqd->bfq_raising_rt_max_time, which
7848 -+ * means that the application is enjoying
7849 -+ * weight raising either because it was deemed soft rt
7850 -+ * in the near past, or because it was deemed
7851 -+ * interactive long ago. In both cases,
7852 -+ * resetting now the current remaining weight-
7853 -+ * raising time for the application to the
7854 -+ * weight-raising duration for soft rt
7855 -+ * applications would not cause any latency
7856 -+ * increase for the application (as the new
7857 -+ * duration would be higher than the remaining
7858 -+ * time).
7859 -+ *
7860 -+ * In addition, the application is now meeting
7861 -+ * the requirements for being deemed soft rt.
7862 -+ * In the end we can correctly and safely
7863 -+ * (re)charge the weight-raising duration for
7864 -+ * the application with the weight-raising
7865 -+ * duration for soft rt applications.
7866 -+ *
7867 -+ * In particular, doing this recharge now, i.e.,
7868 -+ * before the weight-raising period for the
7869 -+ * application finishes, reduces the probability
7870 -+ * of the following negative scenario:
7871 -+ * 1) the weight of a soft rt application is
7872 -+ * raised at startup (as for any newly
7873 -+ * created application),
7874 -+ * 2) since the application is not interactive,
7875 -+ * at a certain time weight-raising is
7876 -+ * stopped for the application,
7877 -+ * 3) at that time the application happens to
7878 -+ * still have pending requests, and hence
7879 -+ * is destined to not have a chance to be
7880 -+ * deemed soft rt before these requests are
7881 -+ * completed (see the comments to the
7882 -+ * function bfq_bfqq_softrt_next_start()
7883 -+ * for details on soft rt detection),
7884 -+ * 4) these pending requests experience a high
7885 -+ * latency because the application is not
7886 -+ * weight-raised while they are pending.
7887 -+ */
7888 -+ bfqq->last_rais_start_finish = jiffies;
7889 -+ bfqq->raising_cur_max_time =
7890 -+ bfqd->bfq_raising_rt_max_time;
7891 -+ }
7892 -+ }
7893 -+ if (old_raising_coeff != bfqq->raising_coeff)
7894 -+ entity->ioprio_changed = 1;
7895 -+add_bfqq_busy:
7896 -+ bfqq->last_idle_bklogged = jiffies;
7897 -+ bfqq->service_from_backlogged = 0;
7898 -+ bfq_clear_bfqq_softrt_update(bfqq);
7899 -+ bfq_add_bfqq_busy(bfqd, bfqq);
7900 -+ } else {
7901 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
7902 -+ !rq_is_sync(rq) &&
7903 -+ time_is_before_jiffies(
7904 -+ bfqq->last_rais_start_finish +
7905 -+ bfqd->bfq_raising_min_inter_arr_async)) {
7906 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7907 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
7908 -+
7909 -+ bfqd->raised_busy_queues++;
7910 -+ entity->ioprio_changed = 1;
7911 -+ bfq_log_bfqq(bfqd, bfqq,
7912 -+ "non-idle wrais starting at %lu, "
7913 -+ "rais_max_time %u",
7914 -+ jiffies,
7915 -+ jiffies_to_msecs(bfqq->
7916 -+ raising_cur_max_time));
7917 -+ }
7918 -+ bfq_updated_next_req(bfqd, bfqq);
7919 -+ }
7920 -+
7921 -+ if (bfqd->low_latency &&
7922 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
7923 -+ idle_for_long_time))
7924 -+ bfqq->last_rais_start_finish = jiffies;
7925 -+}
7926 -+
7927 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
7928 -+{
7929 -+ elv_rb_del(&bfqq->sort_list, rq);
7930 -+ bfqq->queued[rq_is_sync(rq)]--;
7931 -+ bfqq->bfqd->queued--;
7932 -+ bfq_add_rq_rb(rq);
7933 -+}
7934 -+
7935 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
7936 -+ struct bio *bio)
7937 -+{
7938 -+ struct task_struct *tsk = current;
7939 -+ struct bfq_io_cq *bic;
7940 -+ struct bfq_queue *bfqq;
7941 -+
7942 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
7943 -+ if (bic == NULL)
7944 -+ return NULL;
7945 -+
7946 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7947 -+ if (bfqq != NULL)
7948 -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
7949 -+
7950 -+ return NULL;
7951 -+}
7952 -+
7953 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
7954 -+{
7955 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7956 -+
7957 -+ bfqd->rq_in_driver++;
7958 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
7959 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
7960 -+ (long long unsigned)bfqd->last_position);
7961 -+}
7962 -+
7963 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
7964 -+{
7965 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7966 -+
7967 -+ WARN_ON(bfqd->rq_in_driver == 0);
7968 -+ bfqd->rq_in_driver--;
7969 -+}
7970 -+
7971 -+static void bfq_remove_request(struct request *rq)
7972 -+{
7973 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7974 -+ struct bfq_data *bfqd = bfqq->bfqd;
7975 -+
7976 -+ if (bfqq->next_rq == rq) {
7977 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
7978 -+ bfq_updated_next_req(bfqd, bfqq);
7979 -+ }
7980 -+
7981 -+ list_del_init(&rq->queuelist);
7982 -+ bfq_del_rq_rb(rq);
7983 -+
7984 -+ if (rq->cmd_flags & REQ_META) {
7985 -+ WARN_ON(bfqq->meta_pending == 0);
7986 -+ bfqq->meta_pending--;
7987 -+ }
7988 -+}
7989 -+
7990 -+static int bfq_merge(struct request_queue *q, struct request **req,
7991 -+ struct bio *bio)
7992 -+{
7993 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7994 -+ struct request *__rq;
7995 -+
7996 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
7997 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
7998 -+ *req = __rq;
7999 -+ return ELEVATOR_FRONT_MERGE;
8000 -+ }
8001 -+
8002 -+ return ELEVATOR_NO_MERGE;
8003 -+}
8004 -+
8005 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
8006 -+ int type)
8007 -+{
8008 -+ if (type == ELEVATOR_FRONT_MERGE) {
8009 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
8010 -+
8011 -+ bfq_reposition_rq_rb(bfqq, req);
8012 -+ }
8013 -+}
8014 -+
8015 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
8016 -+ struct request *next)
8017 -+{
8018 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8019 -+
8020 -+ /*
8021 -+ * Reposition in fifo if next is older than rq.
8022 -+ */
8023 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
8024 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
8025 -+ list_move(&rq->queuelist, &next->queuelist);
8026 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
8027 -+ }
8028 -+
8029 -+ if (bfqq->next_rq == next)
8030 -+ bfqq->next_rq = rq;
8031 -+
8032 -+ bfq_remove_request(next);
8033 -+}
8034 -+
8035 -+/* Must be called with bfqq != NULL */
8036 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
8037 -+{
8038 -+ BUG_ON(bfqq == NULL);
8039 -+ if (bfq_bfqq_busy(bfqq))
8040 -+ bfqq->bfqd->raised_busy_queues--;
8041 -+ bfqq->raising_coeff = 1;
8042 -+ bfqq->raising_cur_max_time = 0;
8043 -+ /* Trigger a weight change on the next activation of the queue */
8044 -+ bfqq->entity.ioprio_changed = 1;
8045 -+}
8046 -+
8047 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
8048 -+ struct bfq_group *bfqg)
8049 -+{
8050 -+ int i, j;
8051 -+
8052 -+ for (i = 0; i < 2; i++)
8053 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
8054 -+ if (bfqg->async_bfqq[i][j] != NULL)
8055 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
8056 -+ if (bfqg->async_idle_bfqq != NULL)
8057 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
8058 -+}
8059 -+
8060 -+static void bfq_end_raising(struct bfq_data *bfqd)
8061 -+{
8062 -+ struct bfq_queue *bfqq;
8063 -+
8064 -+ spin_lock_irq(bfqd->queue->queue_lock);
8065 -+
8066 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
8067 -+ bfq_bfqq_end_raising(bfqq);
8068 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
8069 -+ bfq_bfqq_end_raising(bfqq);
8070 -+ bfq_end_raising_async(bfqd);
8071 -+
8072 -+ spin_unlock_irq(bfqd->queue->queue_lock);
8073 -+}
8074 -+
8075 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
8076 -+ struct bio *bio)
8077 -+{
8078 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
8079 -+ struct bfq_io_cq *bic;
8080 -+ struct bfq_queue *bfqq;
8081 -+
8082 -+ /*
8083 -+ * Disallow merge of a sync bio into an async request.
8084 -+ */
8085 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
8086 -+ return 0;
8087 -+
8088 -+ /*
8089 -+ * Lookup the bfqq that this bio will be queued with. Allow
8090 -+ * merge only if rq is queued there.
8091 -+ * Queue lock is held here.
8092 -+ */
8093 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
8094 -+ if (bic == NULL)
8095 -+ return 0;
8096 -+
8097 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8098 -+ return bfqq == RQ_BFQQ(rq);
8099 -+}
8100 -+
8101 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
8102 -+ struct bfq_queue *bfqq)
8103 -+{
8104 -+ if (bfqq != NULL) {
8105 -+ bfq_mark_bfqq_must_alloc(bfqq);
8106 -+ bfq_mark_bfqq_budget_new(bfqq);
8107 -+ bfq_clear_bfqq_fifo_expire(bfqq);
8108 -+
8109 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
8110 -+
8111 -+ bfq_log_bfqq(bfqd, bfqq,
8112 -+ "set_in_service_queue, cur-budget = %lu",
8113 -+ bfqq->entity.budget);
8114 -+ }
8115 -+
8116 -+ bfqd->in_service_queue = bfqq;
8117 -+}
8118 -+
8119 -+/*
8120 -+ * Get and set a new queue for service.
8121 -+ */
8122 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
8123 -+ struct bfq_queue *bfqq)
8124 -+{
8125 -+ if (!bfqq)
8126 -+ bfqq = bfq_get_next_queue(bfqd);
8127 -+ else
8128 -+ bfq_get_next_queue_forced(bfqd, bfqq);
8129 -+
8130 -+ __bfq_set_in_service_queue(bfqd, bfqq);
8131 -+ return bfqq;
8132 -+}
8133 -+
8134 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
8135 -+ struct request *rq)
8136 -+{
8137 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
8138 -+ return blk_rq_pos(rq) - bfqd->last_position;
8139 -+ else
8140 -+ return bfqd->last_position - blk_rq_pos(rq);
8141 -+}
8142 -+
8143 -+/*
8144 -+ * Return true if bfqq has no request pending and rq is close enough to
8145 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
8146 -+ * bfqq->next_rq
8147 -+ */
8148 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
8149 -+{
8150 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
8151 -+}
8152 -+
8153 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
8154 -+{
8155 -+ struct rb_root *root = &bfqd->rq_pos_tree;
8156 -+ struct rb_node *parent, *node;
8157 -+ struct bfq_queue *__bfqq;
8158 -+ sector_t sector = bfqd->last_position;
8159 -+
8160 -+ if (RB_EMPTY_ROOT(root))
8161 -+ return NULL;
8162 -+
8163 -+ /*
8164 -+ * First, if we find a request starting at the end of the last
8165 -+ * request, choose it.
8166 -+ */
8167 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8168 -+ if (__bfqq != NULL)
8169 -+ return __bfqq;
8170 -+
8171 -+ /*
8172 -+ * If the exact sector wasn't found, the parent of the NULL leaf
8173 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
8174 -+ * position).
8175 -+ */
8176 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8177 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8178 -+ return __bfqq;
8179 -+
8180 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
8181 -+ node = rb_next(&__bfqq->pos_node);
8182 -+ else
8183 -+ node = rb_prev(&__bfqq->pos_node);
8184 -+ if (node == NULL)
8185 -+ return NULL;
8186 -+
8187 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8188 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8189 -+ return __bfqq;
8190 -+
8191 -+ return NULL;
8192 -+}
8193 -+
8194 -+/*
8195 -+ * bfqd - obvious
8196 -+ * cur_bfqq - passed in so that we don't decide that the current queue
8197 -+ * is closely cooperating with itself.
8198 -+ *
8199 -+ * We are assuming that cur_bfqq has dispatched at least one request,
8200 -+ * and that bfqd->last_position reflects a position on the disk associated
8201 -+ * with the I/O issued by cur_bfqq.
8202 -+ */
8203 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
8204 -+ struct bfq_queue *cur_bfqq)
8205 -+{
8206 -+ struct bfq_queue *bfqq;
8207 -+
8208 -+ if (bfq_class_idle(cur_bfqq))
8209 -+ return NULL;
8210 -+ if (!bfq_bfqq_sync(cur_bfqq))
8211 -+ return NULL;
8212 -+ if (BFQQ_SEEKY(cur_bfqq))
8213 -+ return NULL;
8214 -+
8215 -+ /* If device has only one backlogged bfq_queue, don't search. */
8216 -+ if (bfqd->busy_queues == 1)
8217 -+ return NULL;
8218 -+
8219 -+ /*
8220 -+ * We should notice if some of the queues are cooperating, e.g.
8221 -+ * working closely on the same area of the disk. In that case,
8222 -+ * we can group them together and not waste time idling.
8223 -+ */
8224 -+ bfqq = bfqq_close(bfqd);
8225 -+ if (bfqq == NULL || bfqq == cur_bfqq)
8226 -+ return NULL;
8227 -+
8228 -+ /*
8229 -+ * Do not merge queues from different bfq_groups.
8230 -+ */
8231 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
8232 -+ return NULL;
8233 -+
8234 -+ /*
8235 -+ * It only makes sense to merge sync queues.
8236 -+ */
8237 -+ if (!bfq_bfqq_sync(bfqq))
8238 -+ return NULL;
8239 -+ if (BFQQ_SEEKY(bfqq))
8240 -+ return NULL;
8241 -+
8242 -+ /*
8243 -+ * Do not merge queues of different priority classes.
8244 -+ */
8245 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
8246 -+ return NULL;
8247 -+
8248 -+ return bfqq;
8249 -+}
8250 -+
8251 -+/*
8252 -+ * If enough samples have been computed, return the current max budget
8253 -+ * stored in bfqd, which is dynamically updated according to the
8254 -+ * estimated disk peak rate; otherwise return the default max budget
8255 -+ */
8256 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
8257 -+{
8258 -+ if (bfqd->budgets_assigned < 194)
8259 -+ return bfq_default_max_budget;
8260 -+ else
8261 -+ return bfqd->bfq_max_budget;
8262 -+}
8263 -+
8264 -+/*
8265 -+ * Return min budget, which is a fraction of the current or default
8266 -+ * max budget (trying with 1/32)
8267 -+ */
8268 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
8269 -+{
8270 -+ if (bfqd->budgets_assigned < 194)
8271 -+ return bfq_default_max_budget / 32;
8272 -+ else
8273 -+ return bfqd->bfq_max_budget / 32;
8274 -+}
8275 -+
8276 -+/*
8277 -+ * Decides whether idling should be done for the given device and
8278 -+ * the given in-service queue.
8279 -+ */
8280 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
8281 -+ struct bfq_queue *in_service_bfqq)
8282 -+{
8283 -+ if (in_service_bfqq == NULL)
8284 -+ return false;
8285 -+ /*
8286 -+ * If the device is non-rotational, and hence has no seek penalty,
8287 -+ * disable idling; but do so only if:
8288 -+ * - the device supports queuing, otherwise we still have
8289 -+ * a problem with sync vs async workloads;
8290 -+ * - the queue is not weight-raised, to preserve guarantees.
8291 -+ */
8292 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
8293 -+ in_service_bfqq->raising_coeff == 1);
8294 -+}
8295 -+
8296 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
8297 -+{
8298 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
8299 -+ struct bfq_io_cq *bic;
8300 -+ unsigned long sl;
8301 -+
8302 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
8303 -+
8304 -+ /* Tasks have exited, don't wait. */
8305 -+ bic = bfqd->in_service_bic;
8306 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
8307 -+ return;
8308 -+
8309 -+ bfq_mark_bfqq_wait_request(bfqq);
8310 -+
8311 -+ /*
8312 -+ * We don't want to idle for seeks, but we do want to allow
8313 -+ * fair distribution of slice time for a process doing back-to-back
8314 -+ * seeks. So allow a little bit of time for it to submit a new rq.
8315 -+ *
8316 -+ * To prevent processes with (partly) seeky workloads from
8317 -+ * being too ill-treated, grant them a small fraction of the
8318 -+ * assigned budget before reducing the waiting time to
8319 -+ * BFQ_MIN_TT. This happened to help reduce latency.
8320 -+ */
8321 -+ sl = bfqd->bfq_slice_idle;
8322 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
8323 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
8324 -+ bfqq->raising_coeff == 1)
8325 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
8326 -+ else if (bfqq->raising_coeff > 1)
8327 -+ sl = sl * 3;
8328 -+ bfqd->last_idling_start = ktime_get();
8329 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
8330 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
8331 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
8332 -+}
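The slice-idle choice above boils down to a small piece of arithmetic: cut the wait for seeky, non-raised queues that have already used a fair share of their budget, and stretch it for weight-raised queues. A standalone, purely illustrative C sketch of that rule (plain milliseconds instead of jiffies, made-up names, and a 2 ms floor assumed in place of the BFQ_MIN_TT-style minimum):

#include <stdio.h>

/*
 * Illustrative restatement of the slice choice in bfq_arm_slice_timer(),
 * working in milliseconds instead of jiffies; the constants and names are
 * assumptions of this sketch, not definitions from the patch.
 */
static unsigned int example_idle_slice_ms(unsigned int base_ms, int seeky,
					  int used_enough_budget, int raised)
{
	unsigned int sl = base_ms;

	if (seeky && used_enough_budget && !raised)
		sl = sl < 2 ? sl : 2;	/* shrink the wait for seeky queues */
	else if (raised)
		sl *= 3;		/* wait longer for weight-raised queues */
	return sl;
}

int main(void)
{
	printf("seeky: %u ms, raised: %u ms\n",
	       example_idle_slice_ms(8, 1, 1, 0),
	       example_idle_slice_ms(8, 0, 0, 1));
	return 0;
}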
8333 -+
8334 -+/*
8335 -+ * Set the maximum time for the in-service queue to consume its
8336 -+ * budget. This prevents seeky processes from lowering the disk
8337 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
8338 -+ */
8339 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
8340 -+{
8341 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
8342 -+ unsigned int timeout_coeff;
8343 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
8344 -+ timeout_coeff = 1;
8345 -+ else
8346 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
8347 -+
8348 -+ bfqd->last_budget_start = ktime_get();
8349 -+
8350 -+ bfq_clear_bfqq_budget_new(bfqq);
8351 -+ bfqq->budget_timeout = jiffies +
8352 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
8353 -+
8354 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
8355 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
8356 -+ timeout_coeff));
8357 -+}
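The timeout scaling used above is easy to reproduce in isolation: soft real-time queues keep the base timeout, while other weight-raised queues have it multiplied by their weight ratio. The sketch below mirrors that rule with illustrative types and units; it is not part of the patch.

#include <stdio.h>

/* Illustrative stand-in for the per-queue fields read above. */
struct example_queue {
	unsigned int weight;		/* current (possibly raised) weight */
	unsigned int orig_weight;	/* weight before raising */
	int soft_rt_raised;		/* raised as soft real-time? */
};

/*
 * Same rule as bfq_set_budget_timeout(): soft real-time queues keep the
 * base timeout, other raised queues get it multiplied by the weight ratio,
 * so an interactive raised queue may keep the disk longer.
 */
static unsigned int example_budget_timeout(const struct example_queue *q,
					   unsigned int base_timeout)
{
	unsigned int coeff = q->soft_rt_raised ?
		1 : q->weight / q->orig_weight;

	return base_timeout * coeff;
}

int main(void)
{
	struct example_queue interactive = { .weight = 1000, .orig_weight = 100 };

	printf("timeout = %u\n", example_budget_timeout(&interactive, 125));
	return 0;
}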
8358 -+
8359 -+/*
8360 -+ * Move request from internal lists to the request queue dispatch list.
8361 -+ */
8362 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
8363 -+{
8364 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
8365 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8366 -+
8367 -+ bfq_remove_request(rq);
8368 -+ bfqq->dispatched++;
8369 -+ elv_dispatch_sort(q, rq);
8370 -+
8371 -+ if (bfq_bfqq_sync(bfqq))
8372 -+ bfqd->sync_flight++;
8373 -+}
8374 -+
8375 -+/*
8376 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
8377 -+ */
8378 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
8379 -+{
8380 -+ struct request *rq = NULL;
8381 -+
8382 -+ if (bfq_bfqq_fifo_expire(bfqq))
8383 -+ return NULL;
8384 -+
8385 -+ bfq_mark_bfqq_fifo_expire(bfqq);
8386 -+
8387 -+ if (list_empty(&bfqq->fifo))
8388 -+ return NULL;
8389 -+
8390 -+ rq = rq_entry_fifo(bfqq->fifo.next);
8391 -+
8392 -+ if (time_before(jiffies, rq_fifo_time(rq)))
8393 -+ return NULL;
8394 -+
8395 -+ return rq;
8396 -+}
8397 -+
8398 -+/*
8399 -+ * Must be called with the queue_lock held.
8400 -+ */
8401 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
8402 -+{
8403 -+ int process_refs, io_refs;
8404 -+
8405 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
8406 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
8407 -+ BUG_ON(process_refs < 0);
8408 -+ return process_refs;
8409 -+}
8410 -+
8411 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
8412 -+{
8413 -+ int process_refs, new_process_refs;
8414 -+ struct bfq_queue *__bfqq;
8415 -+
8416 -+ /*
8417 -+ * If there are no process references on the new_bfqq, then it is
8418 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
8419 -+ * may have dropped their last reference (not just their last process
8420 -+ * reference).
8421 -+ */
8422 -+ if (!bfqq_process_refs(new_bfqq))
8423 -+ return;
8424 -+
8425 -+ /* Avoid a circular list and skip interim queue merges. */
8426 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
8427 -+ if (__bfqq == bfqq)
8428 -+ return;
8429 -+ new_bfqq = __bfqq;
8430 -+ }
8431 -+
8432 -+ process_refs = bfqq_process_refs(bfqq);
8433 -+ new_process_refs = bfqq_process_refs(new_bfqq);
8434 -+ /*
8435 -+ * If the process for the bfqq has gone away, there is no
8436 -+ * sense in merging the queues.
8437 -+ */
8438 -+ if (process_refs == 0 || new_process_refs == 0)
8439 -+ return;
8440 -+
8441 -+ /*
8442 -+ * Merge in the direction of the lesser amount of work.
8443 -+ */
8444 -+ if (new_process_refs >= process_refs) {
8445 -+ bfqq->new_bfqq = new_bfqq;
8446 -+ atomic_add(process_refs, &new_bfqq->ref);
8447 -+ } else {
8448 -+ new_bfqq->new_bfqq = bfqq;
8449 -+ atomic_add(new_process_refs, &bfqq->ref);
8450 -+ }
8451 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
8452 -+ new_bfqq->pid);
8453 -+}
8454 -+
8455 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
8456 -+{
8457 -+ struct bfq_entity *entity = &bfqq->entity;
8458 -+ return entity->budget - entity->service;
8459 -+}
8460 -+
8461 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8462 -+{
8463 -+ BUG_ON(bfqq != bfqd->in_service_queue);
8464 -+
8465 -+ __bfq_bfqd_reset_in_service(bfqd);
8466 -+
8467 -+ /*
8468 -+ * If this bfqq is shared between multiple processes, check
8469 -+ * to make sure that those processes are still issuing I/Os
8470 -+ * within the mean seek distance. If not, it may be time to
8471 -+ * break the queues apart again.
8472 -+ */
8473 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
8474 -+ bfq_mark_bfqq_split_coop(bfqq);
8475 -+
8476 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8477 -+ /*
8478 -+ * overloading budget_timeout field to store when
8479 -+ * the queue remains with no backlog, used by
8480 -+ * the weight-raising mechanism
8481 -+ */
8482 -+ bfqq->budget_timeout = jiffies;
8483 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8484 -+ } else {
8485 -+ bfq_activate_bfqq(bfqd, bfqq);
8486 -+ /*
8487 -+ * Resort priority tree of potential close cooperators.
8488 -+ */
8489 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
8490 -+ }
8491 -+}
8492 -+
8493 -+/**
8494 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
8495 -+ * @bfqd: device data.
8496 -+ * @bfqq: queue to update.
8497 -+ * @reason: reason for expiration.
8498 -+ *
8499 -+ * Handle the feedback on @bfqq budget. See the body for detailed
8500 -+ * comments.
8501 -+ */
8502 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
8503 -+ struct bfq_queue *bfqq,
8504 -+ enum bfqq_expiration reason)
8505 -+{
8506 -+ struct request *next_rq;
8507 -+ unsigned long budget, min_budget;
8508 -+
8509 -+ budget = bfqq->max_budget;
8510 -+ min_budget = bfq_min_budget(bfqd);
8511 -+
8512 -+ BUG_ON(bfqq != bfqd->in_service_queue);
8513 -+
8514 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
8515 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
8516 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
8517 -+ budget, bfq_min_budget(bfqd));
8518 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
8519 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
8520 -+
8521 -+ if (bfq_bfqq_sync(bfqq)) {
8522 -+ switch (reason) {
8523 -+ /*
8524 -+ * Caveat: in all the following cases we trade latency
8525 -+ * for throughput.
8526 -+ */
8527 -+ case BFQ_BFQQ_TOO_IDLE:
8528 -+ /*
8529 -+ * This is the only case where we may reduce
8530 -+ * the budget: if there is no request of the
8531 -+ * process still waiting for completion, then
8532 -+ * we assume (tentatively) that the timer has
8533 -+ * expired because the batch of requests of
8534 -+ * the process could have been served with a
8535 -+ * smaller budget. Hence, betting that
8536 -+ * the process will behave in the same way when it
8537 -+ * becomes backlogged again, we reduce its
8538 -+ * next budget. As long as we guess right,
8539 -+ * this budget cut reduces the latency
8540 -+ * experienced by the process.
8541 -+ *
8542 -+ * However, if there are still outstanding
8543 -+ * requests, then the process may not yet have
8544 -+ * issued its next request just because it is
8545 -+ * still waiting for the completion of some of
8546 -+ * the still outstanding ones. So in this
8547 -+ * subcase we do not reduce its budget, on the
8548 -+ * contrary we increase it to possibly boost
8549 -+ * the throughput, as discussed in the
8550 -+ * comments to the BUDGET_TIMEOUT case.
8551 -+ */
8552 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
8553 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
8554 -+ else {
8555 -+ if (budget > 5 * min_budget)
8556 -+ budget -= 4 * min_budget;
8557 -+ else
8558 -+ budget = min_budget;
8559 -+ }
8560 -+ break;
8561 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
8562 -+ /*
8563 -+ * We double the budget here because: 1) it
8564 -+ * gives the chance to boost the throughput if
8565 -+ * this is not a seeky process (which may have
8566 -+ * bumped into this timeout because of, e.g.,
8567 -+ * ZBR), 2) together with charge_full_budget
8568 -+ * it helps give seeky processes higher
8569 -+ * timestamps, and hence be served less
8570 -+ * frequently.
8571 -+ */
8572 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
8573 -+ break;
8574 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
8575 -+ /*
8576 -+ * The process still has backlog, and did not
8577 -+ * let either the budget timeout or the disk
8578 -+ * idling timeout expire. Hence it is not
8579 -+ * seeky, has a short thinktime and may be
8580 -+ * happy with a higher budget too. So
8581 -+ * definitely increase the budget of this good
8582 -+ * candidate to boost the disk throughput.
8583 -+ */
8584 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
8585 -+ break;
8586 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
8587 -+ /*
8588 -+ * Leave the budget unchanged.
8589 -+ */
8590 -+ default:
8591 -+ return;
8592 -+ }
8593 -+ } else /* async queue */
8594 -+ /* async queues always get the maximum possible budget
8595 -+ * (their ability to dispatch is limited by
8596 -+ * @bfqd->bfq_max_budget_async_rq).
8597 -+ */
8598 -+ budget = bfqd->bfq_max_budget;
8599 -+
8600 -+ bfqq->max_budget = budget;
8601 -+
8602 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
8603 -+ bfqq->max_budget > bfqd->bfq_max_budget)
8604 -+ bfqq->max_budget = bfqd->bfq_max_budget;
8605 -+
8606 -+ /*
8607 -+ * Make sure that we have enough budget for the next request.
8608 -+ * Since the finish time of the bfqq must be kept in sync with
8609 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
8610 -+ * update.
8611 -+ */
8612 -+ next_rq = bfqq->next_rq;
8613 -+ if (next_rq != NULL)
8614 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
8615 -+ bfq_serv_to_charge(next_rq, bfqq));
8616 -+ else
8617 -+ bfqq->entity.budget = bfqq->max_budget;
8618 -+
8619 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
8620 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
8621 -+ bfqq->entity.budget);
8622 -+}
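The budget feedback described in the comments above reduces to a small case analysis. The following userspace sketch mirrors it with illustrative names and constants; it is only meant to make the rules easier to follow, not to restate the patch:

#include <stdio.h>

/* Illustrative mirror of the feedback rules commented above; the enum and
 * helpers are stand-ins for this sketch, not the patch's own definitions. */
enum example_reason {
	EXAMPLE_TOO_IDLE,
	EXAMPLE_BUDGET_TIMEOUT,
	EXAMPLE_BUDGET_EXHAUSTED,
	EXAMPLE_NO_MORE_REQUESTS,
};

static unsigned long example_min(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long example_next_budget(unsigned long budget,
					 unsigned long min_budget,
					 unsigned long max_budget,
					 int reqs_outstanding,
					 enum example_reason reason)
{
	switch (reason) {
	case EXAMPLE_TOO_IDLE:
		if (reqs_outstanding)		/* maybe just waiting: grow */
			return example_min(budget * 2, max_budget);
		if (budget > 5 * min_budget)	/* bet on a smaller batch */
			return budget - 4 * min_budget;
		return min_budget;
	case EXAMPLE_BUDGET_TIMEOUT:		/* double, as above */
		return example_min(budget * 2, max_budget);
	case EXAMPLE_BUDGET_EXHAUSTED:		/* good candidate: quadruple */
		return example_min(budget * 4, max_budget);
	case EXAMPLE_NO_MORE_REQUESTS:
	default:
		return budget;			/* leave it unchanged */
	}
}

int main(void)
{
	printf("too idle, no outstanding reqs: %lu\n",
	       example_next_budget(8192, 512, 16384, 0, EXAMPLE_TOO_IDLE));
	printf("budget exhausted:              %lu\n",
	       example_next_budget(8192, 512, 16384, 0,
				   EXAMPLE_BUDGET_EXHAUSTED));
	return 0;
}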
8623 -+
8624 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
8625 -+{
8626 -+ unsigned long max_budget;
8627 -+
8628 -+ /*
8629 -+ * The max_budget calculated when autotuning is equal to the
8630 -+ * number of sectors transferred in timeout_sync at the
8631 -+ * estimated peak rate.
8632 -+ */
8633 -+ max_budget = (unsigned long)(peak_rate * 1000 *
8634 -+ timeout >> BFQ_RATE_SHIFT);
8635 -+
8636 -+ return max_budget;
8637 -+}
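In other words, the autotuned budget is the number of sectors the device can transfer, at the estimated peak rate, within the sync budget timeout. A tiny illustrative computation follows, with an assumed fixed-point shift since the real constant is defined elsewhere in the patch:

#include <stdio.h>
#include <stdint.h>

/* Assumed fixed-point shift for this sketch; the patch defines its own. */
#define EXAMPLE_RATE_SHIFT 16

/*
 * peak_rate: sectors per microsecond, fixed point with EXAMPLE_RATE_SHIFT
 * fractional bits; timeout_ms: budget timeout in milliseconds. The result
 * is the number of sectors transferable at peak_rate within the timeout,
 * the same product-and-shift as in bfq_calc_max_budget() above.
 */
static unsigned long example_max_budget(uint64_t peak_rate, uint64_t timeout_ms)
{
	return (unsigned long)((peak_rate * 1000 * timeout_ms)
			       >> EXAMPLE_RATE_SHIFT);
}

int main(void)
{
	/* ~100 MB/s is about 0.2 sectors per microsecond, in fixed point: */
	uint64_t rate = (uint64_t)(0.2 * (1 << EXAMPLE_RATE_SHIFT));

	printf("max_budget = %lu sectors\n", example_max_budget(rate, 125));
	return 0;
}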
8638 -+
8639 -+/*
8640 -+ * In addition to updating the peak rate, checks whether the process
8641 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
8642 -+ * to the budget timeout, to reduce the amount of service provided to
8643 -+ * seeky processes, and hence reduce their chances to lower the
8644 -+ * throughput. See the code for more details.
8645 -+ */
8646 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8647 -+ int compensate, enum bfqq_expiration reason)
8648 -+{
8649 -+ u64 bw, usecs, expected, timeout;
8650 -+ ktime_t delta;
8651 -+ int update = 0;
8652 -+
8653 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
8654 -+ return 0;
8655 -+
8656 -+ if (compensate)
8657 -+ delta = bfqd->last_idling_start;
8658 -+ else
8659 -+ delta = ktime_get();
8660 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
8661 -+ usecs = ktime_to_us(delta);
8662 -+
8663 -+ /* Don't trust short/unrealistic values. */
8664 -+ if (usecs < 100 || usecs >= LONG_MAX)
8665 -+ return 0;
8666 -+
8667 -+ /*
8668 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
8669 -+ * value to store the peak rate, in sectors per usec in fixed
8670 -+ * point math. We do so to have enough precision in the estimate
8671 -+ * and to avoid overflows.
8672 -+ */
8673 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
8674 -+ do_div(bw, (unsigned long)usecs);
8675 -+
8676 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
8677 -+
8678 -+ /*
8679 -+ * Use only long (> 20ms) intervals to filter out spikes for
8680 -+ * the peak rate estimation.
8681 -+ */
8682 -+ if (usecs > 20000) {
8683 -+ if (bw > bfqd->peak_rate ||
8684 -+ (!BFQQ_SEEKY(bfqq) &&
8685 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
8686 -+ bfq_log(bfqd, "measured bw =%llu", bw);
8687 -+ /*
8688 -+ * To smooth oscillations use a low-pass filter with
8689 -+ * alpha=7/8, i.e.,
8690 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
8691 -+ */
8692 -+ do_div(bw, 8);
8693 -+ if (bw == 0)
8694 -+ return 0;
8695 -+ bfqd->peak_rate *= 7;
8696 -+ do_div(bfqd->peak_rate, 8);
8697 -+ bfqd->peak_rate += bw;
8698 -+ update = 1;
8699 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
8700 -+ }
8701 -+
8702 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
8703 -+
8704 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
8705 -+ bfqd->peak_rate_samples++;
8706 -+
8707 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
8708 -+ update && bfqd->bfq_user_max_budget == 0) {
8709 -+ bfqd->bfq_max_budget =
8710 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
8711 -+ bfq_log(bfqd, "new max_budget=%lu",
8712 -+ bfqd->bfq_max_budget);
8713 -+ }
8714 -+ }
8715 -+
8716 -+ /*
8717 -+ * If the process has been served for too short a time
8718 -+ * interval to let its possible sequential accesses prevail over
8719 -+ * the initial seek time needed to move the disk head to the
8720 -+ * first sector it requested, then give the process a chance
8721 -+ * and for the moment return false.
8722 -+ */
8723 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
8724 -+ return 0;
8725 -+
8726 -+ /*
8727 -+ * A process is considered ``slow'' (i.e., seeky, so that we
8728 -+ * cannot treat it fairly in the service domain, as it would
8729 -+ * slow down the other processes too much) if, when a slice
8730 -+ * ends for whatever reason, it has received service at a
8731 -+ * rate that would not be high enough to complete the budget
8732 -+ * before the budget timeout expiration.
8733 -+ */
8734 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
8735 -+
8736 -+ /*
8737 -+ * Caveat: processes doing IO in the slower disk zones will
8738 -+ * tend to be slow(er) even if not seeky. And the estimated
8739 -+ * peak rate will actually be an average over the disk
8740 -+ * surface. Hence, to not be too harsh with unlucky processes,
8741 -+ * we keep a budget/3 margin of safety before declaring a
8742 -+ * process slow.
8743 -+ */
8744 -+ return expected > (4 * bfqq->entity.budget) / 3;
8745 -+}
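The smoothing step used above is a plain exponential low-pass filter with alpha = 7/8. A minimal, self-contained illustration of how such a filter damps an outlier sample (names and units are made up for this sketch):

#include <stdio.h>
#include <stdint.h>

/*
 * The same smoothing as above, alpha = 7/8:
 *   new_rate = (7/8) * old_rate + (1/8) * sample
 * Integer-only, as in the patch; names and units are illustrative.
 */
static uint64_t example_lowpass(uint64_t old_rate, uint64_t sample)
{
	return old_rate * 7 / 8 + sample / 8;
}

int main(void)
{
	uint64_t rate = 1000;
	const uint64_t samples[] = { 1000, 250, 1000, 1000 }; /* one outlier */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		rate = example_lowpass(rate, samples[i]);
		printf("sample %llu -> rate %llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)rate);
	}
	return 0;
}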
8746 -+
8747 -+/*
8748 -+ * To be deemed as soft real-time, an application must meet two requirements.
8749 -+ * The first is that the application must not require an average bandwidth
8750 -+ * higher than the approximate bandwidth required to playback or record a
8751 -+ * compressed high-definition video.
8752 -+ * The next function is invoked on the completion of the last request of a
8753 -+ * batch, to compute the next-start time instant, soft_rt_next_start, such
8754 -+ * that, if the next request of the application does not arrive before
8755 -+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
8756 -+ *
8757 -+ * The second requirement is that the request pattern of the application is
8758 -+ * isochronous, i.e., that, after issuing a request or a batch of requests, the
8759 -+ * application stops for a while, then issues a new batch, and so on. For this
8760 -+ * reason the next function is invoked to compute soft_rt_next_start only for
8761 -+ * applications that meet this requirement, whereas soft_rt_next_start is set
8762 -+ * to infinity for applications that do not.
8763 -+ *
8764 -+ * Unfortunately, even a greedy application may happen to behave in an
8765 -+ * isochronous way if several processes are competing for the CPUs. In fact,
8766 -+ * in this scenario the application stops issuing requests while the CPUs are
8767 -+ * busy serving other processes, then restarts, then stops again for a while,
8768 -+ * and so on. In addition, if the disk achieves a low enough throughput with
8769 -+ * the request pattern issued by the application (e.g., because the request
8770 -+ * pattern is random and/or the device is slow), then the above bandwidth
8771 -+ * requirement may happen to be met too. To prevent such a greedy application
8772 -+ * from being deemed as soft real-time, a further rule is used in the computation
8773 -+ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
8774 -+ * time plus the maximum time for which the arrival of a request is waited
8775 -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
8776 -+ * filters out greedy applications, as the latter issue instead their next
8777 -+ * request as soon as possible after the last one has been completed (in
8778 -+ * contrast, when a batch of requests is completed, a soft real-time
8779 -+ * application spends some time processing data).
8780 -+ *
8781 -+ * Actually, the last filter may easily generate false positives if: only
8782 -+ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
8783 -+ * both the following two cases occur:
8784 -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
8785 -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
8786 -+ * HZ=100.
8787 -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
8788 -+ * for a while, then suddenly 'jump' by several units to recover the lost
8789 -+ * increments. This seems to happen, e.g., inside virtual machines.
8790 -+ * To address this issue, we do not use as a reference time interval just
8791 -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
8792 -+ * particular we add the minimum number of jiffies for which the filter seems
8793 -+ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
8794 -+ */
8795 -+static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
8796 -+ struct bfq_queue *bfqq)
8797 -+{
8798 -+ return max(bfqq->last_idle_bklogged +
8799 -+ HZ * bfqq->service_from_backlogged /
8800 -+ bfqd->bfq_raising_max_softrt_rate,
8801 -+ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
8802 -+}
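Numerically, the next-start computation charges the service received since the queue last became idle against the allowed soft real-time bandwidth, and never returns a value earlier than now plus the idling window (plus a few jiffies of slack). An illustrative userspace version of the same formula, with an assumed HZ value:

#include <stdio.h>

#define EXAMPLE_HZ 250	/* assumed for this sketch; the kernel uses CONFIG_HZ */

static unsigned long example_max(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

/*
 * Earliest "jiffies" value at which a new request may arrive without the
 * application exceeding max_rate sectors/sec on average since it last
 * became idle; the second term of the max() is the greedy-application
 * filter described above. Names here are illustrative.
 */
static unsigned long example_softrt_next_start(unsigned long now,
					       unsigned long last_idle_bklogged,
					       unsigned long service_sectors,
					       unsigned long max_rate,
					       unsigned long slice_idle)
{
	return example_max(last_idle_bklogged +
			   EXAMPLE_HZ * service_sectors / max_rate,
			   now + slice_idle + 4);
}

int main(void)
{
	/* 1400 sectors served against a 7000 sectors/sec allowance. */
	printf("next start = %lu\n",
	       example_softrt_next_start(1000, 990, 1400, 7000, 2));
	return 0;
}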
8803 -+
8804 -+/*
8805 -+ * Largest-possible time instant such that, for as long as possible, the
8806 -+ * current time will be lower than this time instant according to the macro
8807 -+ * time_is_before_jiffies().
8808 -+ */
8809 -+static inline unsigned long bfq_infinity_from_now(unsigned long now)
8810 -+{
8811 -+ return now + ULONG_MAX / 2;
8812 -+}
8813 -+
8814 -+/**
8815 -+ * bfq_bfqq_expire - expire a queue.
8816 -+ * @bfqd: device owning the queue.
8817 -+ * @bfqq: the queue to expire.
8818 -+ * @compensate: if true, compensate for the time spent idling.
8819 -+ * @reason: the reason causing the expiration.
8820 -+ *
8821 -+ *
8822 -+ * If the process associated to the queue is slow (i.e., seeky), or in
8823 -+ * case of budget timeout, or, finally, if it is async, we
8824 -+ * artificially charge it an entire budget (independently of the
8825 -+ * actual service it received). As a consequence, the queue will get
8826 -+ * higher timestamps than the correct ones upon reactivation, and
8827 -+ * hence it will be rescheduled as if it had received more service
8828 -+ * than what it actually received. In the end, this class of processes
8829 -+ * will receive less service in proportion to how slowly they consume
8830 -+ * their budgets (and hence how seriously they tend to lower the
8831 -+ * throughput).
8832 -+ *
8833 -+ * In contrast, when a queue expires because it has been idling for
8834 -+ * too long or because it exhausted its budget, we do not touch the
8835 -+ * amount of service it has received. Hence when the queue will be
8836 -+ * reactivated and its timestamps updated, the latter will be in sync
8837 -+ * with the actual service received by the queue until expiration.
8838 -+ *
8839 -+ * Charging a full budget to the first type of queues and the exact
8840 -+ * service to the others has the effect of using the WF2Q+ policy to
8841 -+ * schedule the former on a timeslice basis, without violating the
8842 -+ * service domain guarantees of the latter.
8843 -+ */
8844 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
8845 -+ struct bfq_queue *bfqq,
8846 -+ int compensate,
8847 -+ enum bfqq_expiration reason)
8848 -+{
8849 -+ int slow;
8850 -+ BUG_ON(bfqq != bfqd->in_service_queue);
8851 -+
8852 -+ /* Update disk peak rate for autotuning and check whether the
8853 -+ * process is slow (see bfq_update_peak_rate).
8854 -+ */
8855 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
8856 -+
8857 -+ /*
8858 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
8859 -+ * and async queues, to favor sequential sync workloads.
8860 -+ *
8861 -+ * Processes doing IO in the slower disk zones will tend to be
8862 -+ * slow(er) even if not seeky. Hence, since the estimated peak
8863 -+ * rate is actually an average over the disk surface, these
8864 -+ * processes may timeout just for bad luck. To avoid punishing
8865 -+ * them we do not charge a full budget to a process that
8866 -+ * succeeded in consuming at least 2/3 of its budget.
8867 -+ */
8868 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
8869 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
8870 -+ bfq_bfqq_charge_full_budget(bfqq);
8871 -+
8872 -+ bfqq->service_from_backlogged += bfqq->entity.service;
8873 -+
8874 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
8875 -+ bfqq->last_rais_start_finish = jiffies;
8876 -+
8877 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
8878 -+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
8879 -+ /*
8880 -+ * If we get here, then the request pattern is
8881 -+ * isochronous (see the comments to the function
8882 -+ * bfq_bfqq_softrt_next_start()). However, if the
8883 -+ * queue still has in-flight requests, then it is
8884 -+ * better to postpone the computation of next_start
8885 -+ * to the next request completion. In fact, if we
8886 -+ * computed it now, then the application might pass
8887 -+ * the greedy-application filter improperly, because
8888 -+ * the arrival time of its next request may happen to be
8889 -+ * later than (jiffies + bfqq->bfqd->bfq_slice_idle)
8890 -+ * not because the application is truly soft real-
8891 -+ * time, but just because the application is currently
8892 -+ * waiting for the completion of some request before
8893 -+ * issuing, as quickly as possible, its next request.
8894 -+ */
8895 -+ if (bfqq->dispatched > 0) {
8896 -+ /*
8897 -+ * The application is still waiting for the
8898 -+ * completion of one or more requests:
8899 -+ * prevent it from possibly being incorrectly
8900 -+ * deemed as soft real-time by setting its
8901 -+ * soft_rt_next_start to infinity. In fact,
8902 -+ * without this assignment, the application
8903 -+ * would be incorrectly deemed as soft
8904 -+ * real-time if:
8905 -+ * 1) it issued a new request before the
8906 -+ * completion of all its in-flight
8907 -+ * requests, and
8908 -+ * 2) at that time, its soft_rt_next_start
8909 -+ * happened to be in the past.
8910 -+ */
8911 -+ bfqq->soft_rt_next_start =
8912 -+ bfq_infinity_from_now(jiffies);
8913 -+ bfq_mark_bfqq_softrt_update(bfqq);
8914 -+ } else
8915 -+ bfqq->soft_rt_next_start =
8916 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
8917 -+ }
8918 -+
8919 -+ bfq_log_bfqq(bfqd, bfqq,
8920 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
8921 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
8922 -+
8923 -+ /* Increase, decrease or leave budget unchanged according to reason */
8924 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
8925 -+ __bfq_bfqq_expire(bfqd, bfqq);
8926 -+}
8927 -+
8928 -+/*
8929 -+ * Budget timeout is not implemented through a dedicated timer, but
8930 -+ * just checked on request arrivals and completions, as well as on
8931 -+ * idle timer expirations.
8932 -+ */
8933 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
8934 -+{
8935 -+ if (bfq_bfqq_budget_new(bfqq))
8936 -+ return 0;
8937 -+
8938 -+ if (time_before(jiffies, bfqq->budget_timeout))
8939 -+ return 0;
8940 -+
8941 -+ return 1;
8942 -+}
8943 -+
8944 -+/*
8945 -+ * If we expire a queue that is waiting for the arrival of a new
8946 -+ * request, we may prevent the fictitious timestamp backshifting that
8947 -+ * allows the guarantees of the queue to be preserved (see [1] for
8948 -+ * this tricky aspect). Hence we return true only if this condition
8949 -+ * does not hold, or if the queue is slow enough to deserve only to be
8950 -+ * kicked off for preserving a high throughput.
8951 -+*/
8952 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
8953 -+{
8954 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
8955 -+ "may_budget_timeout: wr %d left %d timeout %d",
8956 -+ bfq_bfqq_wait_request(bfqq),
8957 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
8958 -+ bfq_bfqq_budget_timeout(bfqq));
8959 -+
8960 -+ return (!bfq_bfqq_wait_request(bfqq) ||
8961 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
8962 -+ &&
8963 -+ bfq_bfqq_budget_timeout(bfqq);
8964 -+}
8965 -+
8966 -+/*
8967 -+ * For weight-raised queues issuing sync requests, idling is always performed,
8968 -+ * as this is instrumental in guaranteeing a high fraction of the throughput
8969 -+ * to these queues, and hence in guaranteeing a lower latency for their
8970 -+ * requests. See [1] for details.
8971 -+ *
8972 -+ * For non-weight-raised queues, idling is instead disabled if the device is
8973 -+ * NCQ-enabled and non-rotational, as this boosts the throughput on such
8974 -+ * devices.
8975 -+ */
8976 -+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
8977 -+{
8978 -+ struct bfq_data *bfqd = bfqq->bfqd;
8979 -+
8980 -+ return bfq_bfqq_sync(bfqq) && (
8981 -+ bfqq->raising_coeff > 1 ||
8982 -+ (bfq_bfqq_idle_window(bfqq) &&
8983 -+ !(bfqd->hw_tag &&
8984 -+ (blk_queue_nonrot(bfqd->queue) ||
8985 -+ /*
8986 -+ * If there are weight-raised busy queues, then do not idle
8987 -+ * the disk for a sync non-weight-raised queue, and hence
8988 -+ * expire the queue immediately if empty. Combined with the
8989 -+ * timestamping rules of BFQ (see [1] for details), this
8990 -+ * causes sync non-weight-raised queues to get a lower
8991 -+ * fraction of the disk throughput, and hence reduces the rate
8992 -+ * at which the processes associated to these queues ask for
8993 -+ * requests from the request pool.
8994 -+ *
8995 -+ * This is beneficial for weight-raised processes, when the
8996 -+ * system operates in request-pool saturation conditions
8997 -+ * (e.g., in the presence of write hogs). In fact, if
8998 -+ * non-weight-raised processes ask for requests at a lower
8999 -+ * rate, then weight-raised processes have a higher
9000 -+ * probability to get a request from the pool immediately
9001 -+ * (or at least soon) when they need one. Hence they have a
9002 -+ * higher probability to actually get a fraction of the disk
9003 -+ * throughput proportional to their high weight. This is
9004 -+ * especially true with NCQ-enabled drives, which enqueue
9005 -+ * several requests in advance and further reorder
9006 -+ * internally-queued requests.
9007 -+ *
9008 -+ * Mistreating non-weight-raised queues in the above-described
9009 -+ * way, when there are busy weight-raised queues, seems to
9010 -+ * mitigate starvation problems in the presence of heavy write
9011 -+ * workloads and NCQ, and hence to guarantee a higher
9012 -+ * application and system responsiveness in these hostile
9013 -+ * scenarios.
9014 -+ */
9015 -+ bfqd->raised_busy_queues > 0)
9016 -+ )
9017 -+ )
9018 -+ );
9019 -+}
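Stripped of the per-queue data structures, the decision above is a small boolean expression; the sketch below restates it with illustrative flag parameters only, to make the truth table easier to read:

#include <stdio.h>

/*
 * Restatement of the expression above with plain flags: keep idling (do
 * not expire) for a sync queue that is weight-raised, or that has its
 * idle window set and the device is not an NCQ-enabled drive that is
 * either non-rotational or serving weight-raised busy queues.
 * The flag parameters are illustrative, not the patch's data structures.
 */
static int example_must_not_expire(int sync, int raised, int idle_window,
				   int ncq, int nonrot, int raised_busy)
{
	return sync && (raised ||
			(idle_window && !(ncq && (nonrot || raised_busy))));
}

int main(void)
{
	printf("plain sync queue on NCQ SSD: %d\n",
	       example_must_not_expire(1, 0, 1, 1, 1, 0));
	printf("weight-raised sync queue:    %d\n",
	       example_must_not_expire(1, 1, 1, 1, 1, 0));
	return 0;
}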
9020 -+
9021 -+/*
9022 -+ * If the in-service queue is empty, but it is sync and either of the following
9023 -+ * conditions holds, then: 1) the queue must remain in service and cannot be
9024 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
9025 -+ * of a new request for the queue. The conditions are:
9026 -+ * - the device is rotational and not performing NCQ, and the queue has its
9027 -+ * idle window set (in this case, waiting for a new request for the queue
9028 -+ * is likely to boost the disk throughput);
9029 -+ * - the queue is weight-raised (waiting for the request is necessary to
9030 -+ * provide the queue with fairness and latency guarantees, see [1] for
9031 -+ * details).
9032 -+ */
9033 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
9034 -+{
9035 -+ struct bfq_data *bfqd = bfqq->bfqd;
9036 -+
9037 -+ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
9038 -+ bfq_bfqq_must_not_expire(bfqq) &&
9039 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
9040 -+}
9041 -+
9042 -+/*
9043 -+ * Select a queue for service. If we have a current queue in service,
9044 -+ * check whether to continue servicing it, or retrieve and set a new one.
9045 -+ */
9046 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
9047 -+{
9048 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
9049 -+ struct request *next_rq;
9050 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9051 -+
9052 -+ bfqq = bfqd->in_service_queue;
9053 -+ if (bfqq == NULL)
9054 -+ goto new_queue;
9055 -+
9056 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
9057 -+
9058 -+ /*
9059 -+ * If another queue has a request waiting within our mean seek
9060 -+ * distance, let it run. The expire code will check for close
9061 -+ * cooperators and put the close queue at the front of the
9062 -+ * service tree. If possible, merge the expiring queue with the
9063 -+ * new bfqq.
9064 -+ */
9065 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
9066 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
9067 -+ bfq_setup_merge(bfqq, new_bfqq);
9068 -+
9069 -+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
9070 -+ !timer_pending(&bfqd->idle_slice_timer) &&
9071 -+ !bfq_bfqq_must_idle(bfqq))
9072 -+ goto expire;
9073 -+
9074 -+ next_rq = bfqq->next_rq;
9075 -+ /*
9076 -+ * If bfqq has requests queued and it has enough budget left to
9077 -+ * serve them, keep the queue, otherwise expire it.
9078 -+ */
9079 -+ if (next_rq != NULL) {
9080 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
9081 -+ bfq_bfqq_budget_left(bfqq)) {
9082 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
9083 -+ goto expire;
9084 -+ } else {
9085 -+ /*
9086 -+ * The idle timer may be pending because we may not
9087 -+ * disable disk idling even when a new request arrives
9088 -+ */
9089 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
9090 -+ /*
9091 -+ * If we get here: 1) at least a new request
9092 -+ * has arrived but we have not disabled the
9093 -+ * timer because the request was too small,
9094 -+ * 2) then the block layer has unplugged the
9095 -+ * device, causing the dispatch to be invoked.
9096 -+ *
9097 -+ * Since the device is unplugged, now the
9098 -+ * requests are probably large enough to
9099 -+ * provide a reasonable throughput.
9100 -+ * So we disable idling.
9101 -+ */
9102 -+ bfq_clear_bfqq_wait_request(bfqq);
9103 -+ del_timer(&bfqd->idle_slice_timer);
9104 -+ }
9105 -+ if (new_bfqq == NULL)
9106 -+ goto keep_queue;
9107 -+ else
9108 -+ goto expire;
9109 -+ }
9110 -+ }
9111 -+
9112 -+ /*
9113 -+ * No requests pending. If the in-service queue has no cooperator and
9114 -+ * still has requests in flight (possibly waiting for a completion)
9115 -+ * or is idling for a new request, then keep it.
9116 -+ */
9117 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
9118 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
9119 -+ bfqq = NULL;
9120 -+ goto keep_queue;
9121 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
9122 -+ /*
9123 -+ * The queue is being expired because there is a close
9124 -+ * cooperator; cancel the idle timer.
9125 -+ */
9126 -+ bfq_clear_bfqq_wait_request(bfqq);
9127 -+ del_timer(&bfqd->idle_slice_timer);
9128 -+ }
9129 -+
9130 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
9131 -+expire:
9132 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
9133 -+new_queue:
9134 -+ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
9135 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
9136 -+ bfqq != NULL ? bfqq->pid : 0);
9137 -+keep_queue:
9138 -+ return bfqq;
9139 -+}
9140 -+
9141 -+static void bfq_update_raising_data(struct bfq_data *bfqd,
9142 -+ struct bfq_queue *bfqq)
9143 -+{
9144 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
9145 -+ struct bfq_entity *entity = &bfqq->entity;
9146 -+
9147 -+ bfq_log_bfqq(bfqd, bfqq,
9148 -+ "raising period dur %u/%u msec, "
9149 -+ "old raising coeff %u, w %d(%d)",
9150 -+ jiffies_to_msecs(jiffies -
9151 -+ bfqq->last_rais_start_finish),
9152 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
9153 -+ bfqq->raising_coeff,
9154 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
9155 -+
9156 -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
9157 -+ entity->orig_weight * bfqq->raising_coeff);
9158 -+ if (entity->ioprio_changed)
9159 -+ bfq_log_bfqq(bfqd, bfqq,
9160 -+ "WARN: pending prio change");
9161 -+ /*
9162 -+ * If too much time has elapsed from the beginning
9163 -+ * of this weight-raising, stop it.
9164 -+ */
9165 -+ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
9166 -+ bfqq->raising_cur_max_time)) {
9167 -+ bfqq->last_rais_start_finish = jiffies;
9168 -+ bfq_log_bfqq(bfqd, bfqq,
9169 -+ "wrais ending at %lu, "
9170 -+ "rais_max_time %u",
9171 -+ bfqq->last_rais_start_finish,
9172 -+ jiffies_to_msecs(bfqq->
9173 -+ raising_cur_max_time));
9174 -+ bfq_bfqq_end_raising(bfqq);
9175 -+ __bfq_entity_update_weight_prio(
9176 -+ bfq_entity_service_tree(entity),
9177 -+ entity);
9178 -+ }
9179 -+ }
9180 -+}
9181 -+
9182 -+/*
9183 -+ * Dispatch one request from bfqq, moving it to the request queue
9184 -+ * dispatch list.
9185 -+ */
9186 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
9187 -+ struct bfq_queue *bfqq)
9188 -+{
9189 -+ int dispatched = 0;
9190 -+ struct request *rq;
9191 -+ unsigned long service_to_charge;
9192 -+
9193 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
9194 -+
9195 -+ /* Follow expired path, else get first next available. */
9196 -+ rq = bfq_check_fifo(bfqq);
9197 -+ if (rq == NULL)
9198 -+ rq = bfqq->next_rq;
9199 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
9200 -+
9201 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
9202 -+ /*
9203 -+ * This may happen if the next rq is chosen
9204 -+ * in fifo order instead of sector order.
9205 -+ * The budget is properly dimensioned
9206 -+ * to be always sufficient to serve the next request
9207 -+ * only if it is chosen in sector order. The reason is
9208 -+ * that it would be quite inefficient and of little use
9209 -+ * to always make sure that the budget is large enough
9210 -+ * to serve even the possible next rq in fifo order.
9211 -+ * In fact, requests are seldom served in fifo order.
9212 -+ *
9213 -+ * Expire the queue for budget exhaustion, and
9214 -+ * make sure that the next act_budget is enough
9215 -+ * to serve the next request, even if it comes
9216 -+ * from the fifo expired path.
9217 -+ */
9218 -+ bfqq->next_rq = rq;
9219 -+ /*
9220 -+ * Since this dispatch failed, make sure that
9221 -+ * a new one will be performed
9222 -+ */
9223 -+ if (!bfqd->rq_in_driver)
9224 -+ bfq_schedule_dispatch(bfqd);
9225 -+ goto expire;
9226 -+ }
9227 -+
9228 -+ /* Finally, insert request into driver dispatch list. */
9229 -+ bfq_bfqq_served(bfqq, service_to_charge);
9230 -+ bfq_dispatch_insert(bfqd->queue, rq);
9231 -+
9232 -+ bfq_update_raising_data(bfqd, bfqq);
9233 -+
9234 -+ bfq_log_bfqq(bfqd, bfqq,
9235 -+ "dispatched %u sec req (%llu), budg left %lu",
9236 -+ blk_rq_sectors(rq),
9237 -+ (long long unsigned)blk_rq_pos(rq),
9238 -+ bfq_bfqq_budget_left(bfqq));
9239 -+
9240 -+ dispatched++;
9241 -+
9242 -+ if (bfqd->in_service_bic == NULL) {
9243 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9244 -+ bfqd->in_service_bic = RQ_BIC(rq);
9245 -+ }
9246 -+
9247 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9248 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9249 -+ bfq_class_idle(bfqq)))
9250 -+ goto expire;
9251 -+
9252 -+ return dispatched;
9253 -+
9254 -+expire:
9255 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9256 -+ return dispatched;
9257 -+}
9258 -+
9259 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9260 -+{
9261 -+ int dispatched = 0;
9262 -+
9263 -+ while (bfqq->next_rq != NULL) {
9264 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9265 -+ dispatched++;
9266 -+ }
9267 -+
9268 -+ BUG_ON(!list_empty(&bfqq->fifo));
9269 -+ return dispatched;
9270 -+}
9271 -+
9272 -+/*
9273 -+ * Drain our current requests. Used for barriers and when switching
9274 -+ * io schedulers on-the-fly.
9275 -+ */
9276 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
9277 -+{
9278 -+ struct bfq_queue *bfqq, *n;
9279 -+ struct bfq_service_tree *st;
9280 -+ int dispatched = 0;
9281 -+
9282 -+ bfqq = bfqd->in_service_queue;
9283 -+ if (bfqq != NULL)
9284 -+ __bfq_bfqq_expire(bfqd, bfqq);
9285 -+
9286 -+ /*
9287 -+ * Loop through classes, and be careful to leave the scheduler
9288 -+ * in a consistent state, as feedback mechanisms and vtime
9289 -+ * updates cannot be disabled during the process.
9290 -+ */
9291 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9292 -+ st = bfq_entity_service_tree(&bfqq->entity);
9293 -+
9294 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9295 -+ bfqq->max_budget = bfq_max_budget(bfqd);
9296 -+
9297 -+ bfq_forget_idle(st);
9298 -+ }
9299 -+
9300 -+ BUG_ON(bfqd->busy_queues != 0);
9301 -+
9302 -+ return dispatched;
9303 -+}
9304 -+
9305 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
9306 -+{
9307 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9308 -+ struct bfq_queue *bfqq;
9309 -+ int max_dispatch;
9310 -+
9311 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9312 -+ if (bfqd->busy_queues == 0)
9313 -+ return 0;
9314 -+
9315 -+ if (unlikely(force))
9316 -+ return bfq_forced_dispatch(bfqd);
9317 -+
9318 -+ bfqq = bfq_select_queue(bfqd);
9319 -+ if (bfqq == NULL)
9320 -+ return 0;
9321 -+
9322 -+ max_dispatch = bfqd->bfq_quantum;
9323 -+ if (bfq_class_idle(bfqq))
9324 -+ max_dispatch = 1;
9325 -+
9326 -+ if (!bfq_bfqq_sync(bfqq))
9327 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
9328 -+
9329 -+ if (bfqq->dispatched >= max_dispatch) {
9330 -+ if (bfqd->busy_queues > 1)
9331 -+ return 0;
9332 -+ if (bfqq->dispatched >= 4 * max_dispatch)
9333 -+ return 0;
9334 -+ }
9335 -+
9336 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
9337 -+ return 0;
9338 -+
9339 -+ bfq_clear_bfqq_wait_request(bfqq);
9340 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9341 -+
9342 -+ if (!bfq_dispatch_request(bfqd, bfqq))
9343 -+ return 0;
9344 -+
9345 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
9346 -+ bfqq->pid, max_dispatch);
9347 -+
9348 -+ return 1;
9349 -+}
9350 -+
9351 -+/*
9352 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
9353 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
9354 -+ *
9355 -+ * Queue lock must be held here.
9356 -+ */
9357 -+static void bfq_put_queue(struct bfq_queue *bfqq)
9358 -+{
9359 -+ struct bfq_data *bfqd = bfqq->bfqd;
9360 -+
9361 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
9362 -+
9363 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
9364 -+ atomic_read(&bfqq->ref));
9365 -+ if (!atomic_dec_and_test(&bfqq->ref))
9366 -+ return;
9367 -+
9368 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
9369 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
9370 -+ BUG_ON(bfqq->entity.tree != NULL);
9371 -+ BUG_ON(bfq_bfqq_busy(bfqq));
9372 -+ BUG_ON(bfqd->in_service_queue == bfqq);
9373 -+
9374 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
9375 -+
9376 -+ kmem_cache_free(bfq_pool, bfqq);
9377 -+}
9378 -+
9379 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
9380 -+{
9381 -+ struct bfq_queue *__bfqq, *next;
9382 -+
9383 -+ /*
9384 -+ * If this queue was scheduled to merge with another queue, be
9385 -+ * sure to drop the reference taken on that queue (and others in
9386 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
9387 -+ */
9388 -+ __bfqq = bfqq->new_bfqq;
9389 -+ while (__bfqq) {
9390 -+ if (__bfqq == bfqq) {
9391 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
9392 -+ break;
9393 -+ }
9394 -+ next = __bfqq->new_bfqq;
9395 -+ bfq_put_queue(__bfqq);
9396 -+ __bfqq = next;
9397 -+ }
9398 -+}
9399 -+
9400 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9401 -+{
9402 -+ if (bfqq == bfqd->in_service_queue) {
9403 -+ __bfq_bfqq_expire(bfqd, bfqq);
9404 -+ bfq_schedule_dispatch(bfqd);
9405 -+ }
9406 -+
9407 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
9408 -+ atomic_read(&bfqq->ref));
9409 -+
9410 -+ bfq_put_cooperator(bfqq);
9411 -+
9412 -+ bfq_put_queue(bfqq);
9413 -+}
9414 -+
9415 -+static void bfq_init_icq(struct io_cq *icq)
9416 -+{
9417 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
9418 -+
9419 -+ bic->ttime.last_end_request = jiffies;
9420 -+}
9421 -+
9422 -+static void bfq_exit_icq(struct io_cq *icq)
9423 -+{
9424 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
9425 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
9426 -+
9427 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
9428 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
9429 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
9430 -+ }
9431 -+
9432 -+ if (bic->bfqq[BLK_RW_SYNC]) {
9433 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
9434 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
9435 -+ }
9436 -+}
9437 -+
9438 -+/*
9439 -+ * Update the entity prio values; note that the new values will not
9440 -+ * be used until the next (re)activation.
9441 -+ */
9442 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
9443 -+{
9444 -+ struct task_struct *tsk = current;
9445 -+ int ioprio_class;
9446 -+
9447 -+ if (!bfq_bfqq_prio_changed(bfqq))
9448 -+ return;
9449 -+
9450 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9451 -+ switch (ioprio_class) {
9452 -+ default:
9453 -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
9454 -+ "bfq: bad prio %x\n", ioprio_class);
9455 -+ case IOPRIO_CLASS_NONE:
9456 -+ /*
9457 -+ * No prio set, inherit CPU scheduling settings.
9458 -+ */
9459 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
9460 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
9461 -+ break;
9462 -+ case IOPRIO_CLASS_RT:
9463 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9464 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
9465 -+ break;
9466 -+ case IOPRIO_CLASS_BE:
9467 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9468 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
9469 -+ break;
9470 -+ case IOPRIO_CLASS_IDLE:
9471 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
9472 -+ bfqq->entity.new_ioprio = 7;
9473 -+ bfq_clear_bfqq_idle_window(bfqq);
9474 -+ break;
9475 -+ }
9476 -+
9477 -+ bfqq->entity.ioprio_changed = 1;
9478 -+
9479 -+ /*
9480 -+ * Keep track of original prio settings in case we have to temporarily
9481 -+ * elevate the priority of this queue.
9482 -+ */
9483 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
9484 -+ bfq_clear_bfqq_prio_changed(bfqq);
9485 -+}
9486 -+
9487 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
9488 -+{
9489 -+ struct bfq_data *bfqd;
9490 -+ struct bfq_queue *bfqq, *new_bfqq;
9491 -+ struct bfq_group *bfqg;
9492 -+ unsigned long uninitialized_var(flags);
9493 -+ int ioprio = bic->icq.ioc->ioprio;
9494 -+
9495 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
9496 -+ &flags);
9497 -+ /*
9498 -+ * This condition may trigger on a newly created bic, be sure to drop
9499 -+ * the lock before returning.
9500 -+ */
9501 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
9502 -+ goto out;
9503 -+
9504 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
9505 -+ if (bfqq != NULL) {
9506 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
9507 -+ sched_data);
9508 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
9509 -+ GFP_ATOMIC);
9510 -+ if (new_bfqq != NULL) {
9511 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
9512 -+ bfq_log_bfqq(bfqd, bfqq,
9513 -+ "changed_ioprio: bfqq %p %d",
9514 -+ bfqq, atomic_read(&bfqq->ref));
9515 -+ bfq_put_queue(bfqq);
9516 -+ }
9517 -+ }
9518 -+
9519 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
9520 -+ if (bfqq != NULL)
9521 -+ bfq_mark_bfqq_prio_changed(bfqq);
9522 -+
9523 -+ bic->ioprio = ioprio;
9524 -+
9525 -+out:
9526 -+ bfq_put_bfqd_unlock(bfqd, &flags);
9527 -+}
9528 -+
9529 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9530 -+ pid_t pid, int is_sync)
9531 -+{
9532 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
9533 -+ INIT_LIST_HEAD(&bfqq->fifo);
9534 -+
9535 -+ atomic_set(&bfqq->ref, 0);
9536 -+ bfqq->bfqd = bfqd;
9537 -+
9538 -+ bfq_mark_bfqq_prio_changed(bfqq);
9539 -+
9540 -+ if (is_sync) {
9541 -+ if (!bfq_class_idle(bfqq))
9542 -+ bfq_mark_bfqq_idle_window(bfqq);
9543 -+ bfq_mark_bfqq_sync(bfqq);
9544 -+ }
9545 -+
9546 -+ /* Tentative initial value to trade off between thr and lat */
9547 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
9548 -+ bfqq->pid = pid;
9549 -+
9550 -+ bfqq->raising_coeff = 1;
9551 -+ bfqq->last_rais_start_finish = 0;
9552 -+ /*
9553 -+ * Set to the value for which bfqq will not be deemed as
9554 -+ * soft rt when it becomes backlogged.
9555 -+ */
9556 -+ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
9557 -+}
9558 -+
9559 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
9560 -+ struct bfq_group *bfqg,
9561 -+ int is_sync,
9562 -+ struct bfq_io_cq *bic,
9563 -+ gfp_t gfp_mask)
9564 -+{
9565 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
9566 -+
9567 -+retry:
9568 -+ /* bic always exists here */
9569 -+ bfqq = bic_to_bfqq(bic, is_sync);
9570 -+
9571 -+ /*
9572 -+ * Always try a new alloc if we fall back to the OOM bfqq
9573 -+ * originally, since it should just be a temporary situation.
9574 -+ */
9575 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9576 -+ bfqq = NULL;
9577 -+ if (new_bfqq != NULL) {
9578 -+ bfqq = new_bfqq;
9579 -+ new_bfqq = NULL;
9580 -+ } else if (gfp_mask & __GFP_WAIT) {
9581 -+ spin_unlock_irq(bfqd->queue->queue_lock);
9582 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
9583 -+ gfp_mask | __GFP_ZERO,
9584 -+ bfqd->queue->node);
9585 -+ spin_lock_irq(bfqd->queue->queue_lock);
9586 -+ if (new_bfqq != NULL)
9587 -+ goto retry;
9588 -+ } else {
9589 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
9590 -+ gfp_mask | __GFP_ZERO,
9591 -+ bfqd->queue->node);
9592 -+ }
9593 -+
9594 -+ if (bfqq != NULL) {
9595 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
9596 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
9597 -+ } else {
9598 -+ bfqq = &bfqd->oom_bfqq;
9599 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
9600 -+ }
9601 -+
9602 -+ bfq_init_prio_data(bfqq, bic);
9603 -+ bfq_init_entity(&bfqq->entity, bfqg);
9604 -+ }
9605 -+
9606 -+ if (new_bfqq != NULL)
9607 -+ kmem_cache_free(bfq_pool, new_bfqq);
9608 -+
9609 -+ return bfqq;
9610 -+}
9611 -+
9612 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
9613 -+ struct bfq_group *bfqg,
9614 -+ int ioprio_class, int ioprio)
9615 -+{
9616 -+ switch (ioprio_class) {
9617 -+ case IOPRIO_CLASS_RT:
9618 -+ return &bfqg->async_bfqq[0][ioprio];
9619 -+ case IOPRIO_CLASS_NONE:
9620 -+ ioprio = IOPRIO_NORM;
9621 -+ /* fall through */
9622 -+ case IOPRIO_CLASS_BE:
9623 -+ return &bfqg->async_bfqq[1][ioprio];
9624 -+ case IOPRIO_CLASS_IDLE:
9625 -+ return &bfqg->async_idle_bfqq;
9626 -+ default:
9627 -+ BUG();
9628 -+ }
9629 -+}
9630 -+
9631 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
9632 -+ struct bfq_group *bfqg, int is_sync,
9633 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
9634 -+{
9635 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9636 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9637 -+ struct bfq_queue **async_bfqq = NULL;
9638 -+ struct bfq_queue *bfqq = NULL;
9639 -+
9640 -+ if (!is_sync) {
9641 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
9642 -+ ioprio);
9643 -+ bfqq = *async_bfqq;
9644 -+ }
9645 -+
9646 -+ if (bfqq == NULL)
9647 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9648 -+
9649 -+ /*
9650 -+ * Pin the queue now that it's allocated, scheduler exit will prune it.
9651 -+ */
9652 -+ if (!is_sync && *async_bfqq == NULL) {
9653 -+ atomic_inc(&bfqq->ref);
9654 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
9655 -+ bfqq, atomic_read(&bfqq->ref));
9656 -+ *async_bfqq = bfqq;
9657 -+ }
9658 -+
9659 -+ atomic_inc(&bfqq->ref);
9660 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
9661 -+ atomic_read(&bfqq->ref));
9662 -+ return bfqq;
9663 -+}
9664 -+
9665 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
9666 -+ struct bfq_io_cq *bic)
9667 -+{
9668 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
9669 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
9670 -+
9671 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
9672 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
9673 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
9674 -+ bic->ttime.ttime_samples;
9675 -+}
9676 -+
9677 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
9678 -+ struct bfq_queue *bfqq,
9679 -+ struct request *rq)
9680 -+{
9681 -+ sector_t sdist;
9682 -+ u64 total;
9683 -+
9684 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
9685 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
9686 -+ else
9687 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
9688 -+
9689 -+ /*
9690 -+ * Don't allow the seek distance to get too large from the
9691 -+ * odd fragment, pagein, etc.
9692 -+ */
9693 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
9694 -+ sdist = 0;
9695 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
9696 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
9697 -+ else
9698 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
9699 -+
9700 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
9701 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
9702 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
9703 -+ do_div(total, bfqq->seek_samples);
9704 -+ bfqq->seek_mean = (sector_t)total;
9705 -+
9706 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
9707 -+ (u64)bfqq->seek_mean);
9708 -+}
9709 -+
9710 -+/*
9711 -+ * Disable idle window if the process thinks too long or seeks so much that
9712 -+ * it doesn't matter.
9713 -+ */
9714 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
9715 -+ struct bfq_queue *bfqq,
9716 -+ struct bfq_io_cq *bic)
9717 -+{
9718 -+ int enable_idle;
9719 -+
9720 -+ /* Don't idle for async or idle io prio class. */
9721 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
9722 -+ return;
9723 -+
9724 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
9725 -+
9726 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
9727 -+ bfqd->bfq_slice_idle == 0 ||
9728 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
9729 -+ bfqq->raising_coeff == 1))
9730 -+ enable_idle = 0;
9731 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
9732 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
9733 -+ bfqq->raising_coeff == 1)
9734 -+ enable_idle = 0;
9735 -+ else
9736 -+ enable_idle = 1;
9737 -+ }
9738 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
9739 -+ enable_idle);
9740 -+
9741 -+ if (enable_idle)
9742 -+ bfq_mark_bfqq_idle_window(bfqq);
9743 -+ else
9744 -+ bfq_clear_bfqq_idle_window(bfqq);
9745 -+}
9746 -+
9747 -+/*
9748 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
9749 -+ * something we should do about it.
9750 -+ */
9751 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9752 -+ struct request *rq)
9753 -+{
9754 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
9755 -+
9756 -+ if (rq->cmd_flags & REQ_META)
9757 -+ bfqq->meta_pending++;
9758 -+
9759 -+ bfq_update_io_thinktime(bfqd, bic);
9760 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
9761 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
9762 -+ !BFQQ_SEEKY(bfqq))
9763 -+ bfq_update_idle_window(bfqd, bfqq, bic);
9764 -+
9765 -+ bfq_log_bfqq(bfqd, bfqq,
9766 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
9767 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
9768 -+ (long long unsigned)bfqq->seek_mean);
9769 -+
9770 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
9771 -+
9772 -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
9773 -+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
9774 -+ blk_rq_sectors(rq) < 32;
9775 -+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
9776 -+
9777 -+ /*
9778 -+ * There is just this request queued: if the request
9779 -+ * is small and the queue is not to be expired, then
9780 -+ * just exit.
9781 -+ *
9782 -+ * In this way, if the disk is being idled to wait for
9783 -+ * a new request from the in-service queue, we avoid
9784 -+ * unplugging the device and committing the disk to serve
9785 -+ * just a small request. On the contrary, we wait for
9786 -+ * the block layer to decide when to unplug the device:
9787 -+ * hopefully, new requests will be merged to this one
9788 -+ * quickly, then the device will be unplugged and
9789 -+ * larger requests will be dispatched.
9790 -+ */
9791 -+ if (small_req && !budget_timeout)
9792 -+ return;
9793 -+
9794 -+ /*
9795 -+ * A large enough request arrived, or the queue is to
9796 -+ * be expired: in both cases disk idling is to be
9797 -+ * stopped, so clear wait_request flag and reset
9798 -+ * timer.
9799 -+ */
9800 -+ bfq_clear_bfqq_wait_request(bfqq);
9801 -+ del_timer(&bfqd->idle_slice_timer);
9802 -+
9803 -+ /*
9804 -+ * The queue is not empty, because a new request just
9805 -+ * arrived. Hence we can safely expire the queue, in
9806 -+ * case of budget timeout, without risking that the
9807 -+ * timestamps of the queue are not updated correctly.
9808 -+ * See [1] for more details.
9809 -+ */
9810 -+ if (budget_timeout)
9811 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9812 -+
9813 -+ /*
9814 -+ * Let the request rip immediately, or let a new queue be
9815 -+ * selected if bfqq has just been expired.
9816 -+ */
9817 -+ __blk_run_queue(bfqd->queue);
9818 -+ }
9819 -+}
9820 -+
9821 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
9822 -+{
9823 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9824 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9825 -+
9826 -+ assert_spin_locked(bfqd->queue->queue_lock);
9827 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
9828 -+
9829 -+ bfq_add_rq_rb(rq);
9830 -+
9831 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
9832 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
9833 -+
9834 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
9835 -+}
9836 -+
9837 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
9838 -+{
9839 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
9840 -+ bfqd->rq_in_driver);
9841 -+
9842 -+ if (bfqd->hw_tag == 1)
9843 -+ return;
9844 -+
9845 -+ /*
9846 -+ * This sample is valid if the number of outstanding requests
9847 -+ * is large enough to allow a queueing behavior. Note that the
9848 -+ * sum is not exact, as it's not taking into account deactivated
9849 -+ * requests.
9850 -+ */
9851 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
9852 -+ return;
9853 -+
9854 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
9855 -+ return;
9856 -+
9857 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
9858 -+ bfqd->max_rq_in_driver = 0;
9859 -+ bfqd->hw_tag_samples = 0;
9860 -+}
9861 -+
9862 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
9863 -+{
9864 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9865 -+ struct bfq_data *bfqd = bfqq->bfqd;
9866 -+ const int sync = rq_is_sync(rq);
9867 -+
9868 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
9869 -+ blk_rq_sectors(rq), sync);
9870 -+
9871 -+ bfq_update_hw_tag(bfqd);
9872 -+
9873 -+ WARN_ON(!bfqd->rq_in_driver);
9874 -+ WARN_ON(!bfqq->dispatched);
9875 -+ bfqd->rq_in_driver--;
9876 -+ bfqq->dispatched--;
9877 -+
9878 -+ if (bfq_bfqq_sync(bfqq))
9879 -+ bfqd->sync_flight--;
9880 -+
9881 -+ if (sync)
9882 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
9883 -+
9884 -+ /*
9885 -+ * The computation of softrt_next_start was scheduled for the next
9886 -+ * request completion: it is now time to compute it.
9887 -+ */
9888 -+ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
9889 -+ bfqq->soft_rt_next_start =
9890 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
9891 -+
9892 -+ /*
9893 -+ * If this is the in-service queue, check if it needs to be expired,
9894 -+ * or if we want to idle in case it has no pending requests.
9895 -+ */
9896 -+ if (bfqd->in_service_queue == bfqq) {
9897 -+ if (bfq_bfqq_budget_new(bfqq))
9898 -+ bfq_set_budget_timeout(bfqd);
9899 -+
9900 -+ if (bfq_bfqq_must_idle(bfqq)) {
9901 -+ bfq_arm_slice_timer(bfqd);
9902 -+ goto out;
9903 -+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
9904 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9905 -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
9906 -+ (bfqq->dispatched == 0 ||
9907 -+ !bfq_bfqq_must_not_expire(bfqq)))
9908 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
9909 -+ BFQ_BFQQ_NO_MORE_REQUESTS);
9910 -+ }
9911 -+
9912 -+ if (!bfqd->rq_in_driver)
9913 -+ bfq_schedule_dispatch(bfqd);
9914 -+
9915 -+out:
9916 -+ return;
9917 -+}
9918 -+
9919 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
9920 -+{
9921 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
9922 -+ bfq_clear_bfqq_must_alloc(bfqq);
9923 -+ return ELV_MQUEUE_MUST;
9924 -+ }
9925 -+
9926 -+ return ELV_MQUEUE_MAY;
9927 -+}
9928 -+
9929 -+static int bfq_may_queue(struct request_queue *q, int rw)
9930 -+{
9931 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9932 -+ struct task_struct *tsk = current;
9933 -+ struct bfq_io_cq *bic;
9934 -+ struct bfq_queue *bfqq;
9935 -+
9936 -+ /*
9937 -+ * Don't force setup of a queue from here, as a call to may_queue
9938 -+ * does not necessarily imply that a request actually will be queued.
9939 -+ * So just lookup a possibly existing queue, or return 'may queue'
9940 -+ * if that fails.
9941 -+ */
9942 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
9943 -+ if (bic == NULL)
9944 -+ return ELV_MQUEUE_MAY;
9945 -+
9946 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
9947 -+ if (bfqq != NULL) {
9948 -+ bfq_init_prio_data(bfqq, bic);
9949 -+
9950 -+ return __bfq_may_queue(bfqq);
9951 -+ }
9952 -+
9953 -+ return ELV_MQUEUE_MAY;
9954 -+}
9955 -+
9956 -+/*
9957 -+ * Queue lock held here.
9958 -+ */
9959 -+static void bfq_put_request(struct request *rq)
9960 -+{
9961 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9962 -+
9963 -+ if (bfqq != NULL) {
9964 -+ const int rw = rq_data_dir(rq);
9965 -+
9966 -+ BUG_ON(!bfqq->allocated[rw]);
9967 -+ bfqq->allocated[rw]--;
9968 -+
9969 -+ rq->elv.priv[0] = NULL;
9970 -+ rq->elv.priv[1] = NULL;
9971 -+
9972 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
9973 -+ bfqq, atomic_read(&bfqq->ref));
9974 -+ bfq_put_queue(bfqq);
9975 -+ }
9976 -+}
9977 -+
9978 -+static struct bfq_queue *
9979 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
9980 -+ struct bfq_queue *bfqq)
9981 -+{
9982 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
9983 -+ (long unsigned)bfqq->new_bfqq->pid);
9984 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
9985 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
9986 -+ bfq_put_queue(bfqq);
9987 -+ return bic_to_bfqq(bic, 1);
9988 -+}
9989 -+
9990 -+/*
9991 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
9992 -+ * was the last process referring to said bfqq.
9993 -+ */
9994 -+static struct bfq_queue *
9995 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
9996 -+{
9997 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
9998 -+ if (bfqq_process_refs(bfqq) == 1) {
9999 -+ bfqq->pid = current->pid;
10000 -+ bfq_clear_bfqq_coop(bfqq);
10001 -+ bfq_clear_bfqq_split_coop(bfqq);
10002 -+ return bfqq;
10003 -+ }
10004 -+
10005 -+ bic_set_bfqq(bic, NULL, 1);
10006 -+
10007 -+ bfq_put_cooperator(bfqq);
10008 -+
10009 -+ bfq_put_queue(bfqq);
10010 -+ return NULL;
10011 -+}
10012 -+
10013 -+/*
10014 -+ * Allocate bfq data structures associated with this request.
10015 -+ */
10016 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
10017 -+ struct bio *bio, gfp_t gfp_mask)
10018 -+{
10019 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
10020 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
10021 -+ const int rw = rq_data_dir(rq);
10022 -+ const int is_sync = rq_is_sync(rq);
10023 -+ struct bfq_queue *bfqq;
10024 -+ struct bfq_group *bfqg;
10025 -+ unsigned long flags;
10026 -+
10027 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
10028 -+
10029 -+ bfq_changed_ioprio(bic);
10030 -+
10031 -+ spin_lock_irqsave(q->queue_lock, flags);
10032 -+
10033 -+ if (bic == NULL)
10034 -+ goto queue_fail;
10035 -+
10036 -+ bfqg = bfq_bic_update_cgroup(bic);
10037 -+
10038 -+new_queue:
10039 -+ bfqq = bic_to_bfqq(bic, is_sync);
10040 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
10041 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
10042 -+ bic_set_bfqq(bic, bfqq, is_sync);
10043 -+ } else {
10044 -+ /*
10045 -+ * If the queue was seeky for too long, break it apart.
10046 -+ */
10047 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
10048 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
10049 -+ bfqq = bfq_split_bfqq(bic, bfqq);
10050 -+ if (!bfqq)
10051 -+ goto new_queue;
10052 -+ }
10053 -+
10054 -+ /*
10055 -+ * Check to see if this queue is scheduled to merge with
10056 -+ * another closely cooperating queue. The merging of queues
10057 -+ * happens here as it must be done in process context.
10058 -+ * The reference on new_bfqq was taken in merge_bfqqs.
10059 -+ */
10060 -+ if (bfqq->new_bfqq != NULL)
10061 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
10062 -+ }
10063 -+
10064 -+ bfqq->allocated[rw]++;
10065 -+ atomic_inc(&bfqq->ref);
10066 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
10067 -+ atomic_read(&bfqq->ref));
10068 -+
10069 -+ rq->elv.priv[0] = bic;
10070 -+ rq->elv.priv[1] = bfqq;
10071 -+
10072 -+ spin_unlock_irqrestore(q->queue_lock, flags);
10073 -+
10074 -+ return 0;
10075 -+
10076 -+queue_fail:
10077 -+ bfq_schedule_dispatch(bfqd);
10078 -+ spin_unlock_irqrestore(q->queue_lock, flags);
10079 -+
10080 -+ return 1;
10081 -+}
10082 -+
10083 -+static void bfq_kick_queue(struct work_struct *work)
10084 -+{
10085 -+ struct bfq_data *bfqd =
10086 -+ container_of(work, struct bfq_data, unplug_work);
10087 -+ struct request_queue *q = bfqd->queue;
10088 -+
10089 -+ spin_lock_irq(q->queue_lock);
10090 -+ __blk_run_queue(q);
10091 -+ spin_unlock_irq(q->queue_lock);
10092 -+}
10093 -+
10094 -+/*
10095 -+ * Handler of the expiration of the timer running if the in-service queue
10096 -+ * is idling inside its time slice.
10097 -+ */
10098 -+static void bfq_idle_slice_timer(unsigned long data)
10099 -+{
10100 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
10101 -+ struct bfq_queue *bfqq;
10102 -+ unsigned long flags;
10103 -+ enum bfqq_expiration reason;
10104 -+
10105 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
10106 -+
10107 -+ bfqq = bfqd->in_service_queue;
10108 -+ /*
10109 -+ * Theoretical race here: the in-service queue can be NULL or different
10110 -+ * from the queue that was idling if the timer handler spins on
10111 -+ * the queue_lock and a new request arrives for the current
10112 -+ * queue and there is a full dispatch cycle that changes the
10113 -+ * in-service queue. This can hardly happen, but in the worst case
10114 -+ * we just expire a queue too early.
10115 -+ */
10116 -+ if (bfqq != NULL) {
10117 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
10118 -+ if (bfq_bfqq_budget_timeout(bfqq))
10119 -+ /*
10120 -+ * Also here the queue can be safely expired
10121 -+ * for budget timeout without wasting
10122 -+ * guarantees
10123 -+ */
10124 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
10125 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
10126 -+ /*
10127 -+ * The queue may not be empty upon timer expiration,
10128 -+ * because we may not disable the timer when the first
10129 -+ * request of the in-service queue arrives during
10130 -+ * disk idling
10131 -+ */
10132 -+ reason = BFQ_BFQQ_TOO_IDLE;
10133 -+ else
10134 -+ goto schedule_dispatch;
10135 -+
10136 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
10137 -+ }
10138 -+
10139 -+schedule_dispatch:
10140 -+ bfq_schedule_dispatch(bfqd);
10141 -+
10142 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
10143 -+}
10144 -+
10145 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
10146 -+{
10147 -+ del_timer_sync(&bfqd->idle_slice_timer);
10148 -+ cancel_work_sync(&bfqd->unplug_work);
10149 -+}
10150 -+
10151 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
10152 -+ struct bfq_queue **bfqq_ptr)
10153 -+{
10154 -+ struct bfq_group *root_group = bfqd->root_group;
10155 -+ struct bfq_queue *bfqq = *bfqq_ptr;
10156 -+
10157 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
10158 -+ if (bfqq != NULL) {
10159 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
10160 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
10161 -+ bfqq, atomic_read(&bfqq->ref));
10162 -+ bfq_put_queue(bfqq);
10163 -+ *bfqq_ptr = NULL;
10164 -+ }
10165 -+}
10166 -+
10167 -+/*
10168 -+ * Release all the bfqg references to its async queues. If we are
10169 -+ * deallocating the group these queues may still contain requests, so
10170 -+ * we reparent them to the root cgroup (i.e., the only one that will
10171 -+ * exist for sure until all the requests on a device are gone).
10172 -+ */
10173 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
10174 -+{
10175 -+ int i, j;
10176 -+
10177 -+ for (i = 0; i < 2; i++)
10178 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
10179 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
10180 -+
10181 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
10182 -+}
10183 -+
10184 -+static void bfq_exit_queue(struct elevator_queue *e)
10185 -+{
10186 -+ struct bfq_data *bfqd = e->elevator_data;
10187 -+ struct request_queue *q = bfqd->queue;
10188 -+ struct bfq_queue *bfqq, *n;
10189 -+
10190 -+ bfq_shutdown_timer_wq(bfqd);
10191 -+
10192 -+ spin_lock_irq(q->queue_lock);
10193 -+
10194 -+ BUG_ON(bfqd->in_service_queue != NULL);
10195 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
10196 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
10197 -+
10198 -+ bfq_disconnect_groups(bfqd);
10199 -+ spin_unlock_irq(q->queue_lock);
10200 -+
10201 -+ bfq_shutdown_timer_wq(bfqd);
10202 -+
10203 -+ synchronize_rcu();
10204 -+
10205 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10206 -+
10207 -+ bfq_free_root_group(bfqd);
10208 -+ kfree(bfqd);
10209 -+}
10210 -+
10211 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
10212 -+{
10213 -+ struct bfq_group *bfqg;
10214 -+ struct bfq_data *bfqd;
10215 -+ struct elevator_queue *eq;
10216 -+
10217 -+ eq = elevator_alloc(q, e);
10218 -+ if (eq == NULL)
10219 -+ return -ENOMEM;
10220 -+
10221 -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
10222 -+ if (bfqd == NULL) {
10223 -+ kobject_put(&eq->kobj);
10224 -+ return -ENOMEM;
10225 -+ }
10226 -+ eq->elevator_data = bfqd;
10227 -+
10228 -+ /*
10229 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
10230 -+ * Grab a permanent reference to it, so that the normal code flow
10231 -+ * will not attempt to free it.
10232 -+ */
10233 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
10234 -+ atomic_inc(&bfqd->oom_bfqq.ref);
10235 -+
10236 -+ bfqd->queue = q;
10237 -+
10238 -+ spin_lock_irq(q->queue_lock);
10239 -+ q->elevator = eq;
10240 -+ spin_unlock_irq(q->queue_lock);
10241 -+
10242 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
10243 -+ if (bfqg == NULL) {
10244 -+ kfree(bfqd);
10245 -+ kobject_put(&eq->kobj);
10246 -+ return -ENOMEM;
10247 -+ }
10248 -+
10249 -+ bfqd->root_group = bfqg;
10250 -+
10251 -+ init_timer(&bfqd->idle_slice_timer);
10252 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10253 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10254 -+
10255 -+ bfqd->rq_pos_tree = RB_ROOT;
10256 -+
10257 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10258 -+
10259 -+ INIT_LIST_HEAD(&bfqd->active_list);
10260 -+ INIT_LIST_HEAD(&bfqd->idle_list);
10261 -+
10262 -+ bfqd->hw_tag = -1;
10263 -+
10264 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
10265 -+
10266 -+ bfqd->bfq_quantum = bfq_quantum;
10267 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10268 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10269 -+ bfqd->bfq_back_max = bfq_back_max;
10270 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
10271 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
10272 -+ bfqd->bfq_class_idle_last_service = 0;
10273 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10274 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10275 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10276 -+
10277 -+ bfqd->low_latency = true;
10278 -+
10279 -+ bfqd->bfq_raising_coeff = 20;
10280 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10281 -+ bfqd->bfq_raising_max_time = 0;
10282 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10283 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10284 -+ bfqd->bfq_raising_max_softrt_rate = 7000; /*
10285 -+ * Approximate rate required
10286 -+ * to playback or record a
10287 -+ * high-definition compressed
10288 -+ * video.
10289 -+ */
10290 -+ bfqd->raised_busy_queues = 0;
10291 -+
10292 -+ /* Initially estimate the device's peak rate as the reference rate */
10293 -+ if (blk_queue_nonrot(bfqd->queue)) {
10294 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
10295 -+ bfqd->peak_rate = R_nonrot;
10296 -+ } else {
10297 -+ bfqd->RT_prod = R_rot * T_rot;
10298 -+ bfqd->peak_rate = R_rot;
10299 -+ }
10300 -+
10301 -+ return 0;
10302 -+}
10303 -+
10304 -+static void bfq_slab_kill(void)
10305 -+{
10306 -+ if (bfq_pool != NULL)
10307 -+ kmem_cache_destroy(bfq_pool);
10308 -+}
10309 -+
10310 -+static int __init bfq_slab_setup(void)
10311 -+{
10312 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10313 -+ if (bfq_pool == NULL)
10314 -+ return -ENOMEM;
10315 -+ return 0;
10316 -+}
10317 -+
10318 -+static ssize_t bfq_var_show(unsigned int var, char *page)
10319 -+{
10320 -+ return sprintf(page, "%d\n", var);
10321 -+}
10322 -+
10323 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10324 -+{
10325 -+ unsigned long new_val;
10326 -+ int ret = kstrtoul(page, 10, &new_val);
10327 -+
10328 -+ if (ret == 0)
10329 -+ *var = new_val;
10330 -+
10331 -+ return count;
10332 -+}
10333 -+
10334 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10335 -+{
10336 -+ struct bfq_data *bfqd = e->elevator_data;
10337 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10338 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10339 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10340 -+}
10341 -+
10342 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10343 -+{
10344 -+ struct bfq_queue *bfqq;
10345 -+ struct bfq_data *bfqd = e->elevator_data;
10346 -+ ssize_t num_char = 0;
10347 -+
10348 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
10349 -+ bfqd->queued);
10350 -+
10351 -+ spin_lock_irq(bfqd->queue->queue_lock);
10352 -+
10353 -+ num_char += sprintf(page + num_char, "Active:\n");
10354 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
10355 -+ num_char += sprintf(page + num_char,
10356 -+ "pid%d: weight %hu, nr_queued %d %d,"
10357 -+ " dur %d/%u\n",
10358 -+ bfqq->pid,
10359 -+ bfqq->entity.weight,
10360 -+ bfqq->queued[0],
10361 -+ bfqq->queued[1],
10362 -+ jiffies_to_msecs(jiffies -
10363 -+ bfqq->last_rais_start_finish),
10364 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
10365 -+ }
10366 -+
10367 -+ num_char += sprintf(page + num_char, "Idle:\n");
10368 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
10369 -+ num_char += sprintf(page + num_char,
10370 -+ "pid%d: weight %hu, dur %d/%u\n",
10371 -+ bfqq->pid,
10372 -+ bfqq->entity.weight,
10373 -+ jiffies_to_msecs(jiffies -
10374 -+ bfqq->last_rais_start_finish),
10375 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
10376 -+ }
10377 -+
10378 -+ spin_unlock_irq(bfqd->queue->queue_lock);
10379 -+
10380 -+ return num_char;
10381 -+}
10382 -+
10383 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
10384 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
10385 -+{ \
10386 -+ struct bfq_data *bfqd = e->elevator_data; \
10387 -+ unsigned int __data = __VAR; \
10388 -+ if (__CONV) \
10389 -+ __data = jiffies_to_msecs(__data); \
10390 -+ return bfq_var_show(__data, (page)); \
10391 -+}
10392 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
10393 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
10394 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
10395 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
10396 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
10397 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
10398 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
10399 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
10400 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
10401 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
10402 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
10403 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
10404 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
10405 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
10406 -+ 1);
10407 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
10408 -+ bfqd->bfq_raising_min_inter_arr_async,
10409 -+ 1);
10410 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
10411 -+ bfqd->bfq_raising_max_softrt_rate, 0);
10412 -+#undef SHOW_FUNCTION
10413 -+
10414 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
10415 -+static ssize_t \
10416 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
10417 -+{ \
10418 -+ struct bfq_data *bfqd = e->elevator_data; \
10419 -+ unsigned long uninitialized_var(__data); \
10420 -+ int ret = bfq_var_store(&__data, (page), count); \
10421 -+ if (__data < (MIN)) \
10422 -+ __data = (MIN); \
10423 -+ else if (__data > (MAX)) \
10424 -+ __data = (MAX); \
10425 -+ if (__CONV) \
10426 -+ *(__PTR) = msecs_to_jiffies(__data); \
10427 -+ else \
10428 -+ *(__PTR) = __data; \
10429 -+ return ret; \
10430 -+}
10431 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
10432 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
10433 -+ INT_MAX, 1);
10434 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
10435 -+ INT_MAX, 1);
10436 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
10437 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
10438 -+ INT_MAX, 0);
10439 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
10440 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
10441 -+ 1, INT_MAX, 0);
10442 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
10443 -+ INT_MAX, 1);
10444 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
10445 -+ INT_MAX, 0);
10446 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
10447 -+ INT_MAX, 1);
10448 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
10449 -+ INT_MAX, 1);
10450 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
10451 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
10452 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
10453 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
10454 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
10455 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
10456 -+#undef STORE_FUNCTION
10457 -+
10458 -+/* do nothing for the moment */
10459 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
10460 -+ const char *page, size_t count)
10461 -+{
10462 -+ return count;
10463 -+}
10464 -+
10465 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
10466 -+{
10467 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
10468 -+
10469 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
10470 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
10471 -+ else
10472 -+ return bfq_default_max_budget;
10473 -+}
10474 -+
10475 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
10476 -+ const char *page, size_t count)
10477 -+{
10478 -+ struct bfq_data *bfqd = e->elevator_data;
10479 -+ unsigned long uninitialized_var(__data);
10480 -+ int ret = bfq_var_store(&__data, (page), count);
10481 -+
10482 -+ if (__data == 0)
10483 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10484 -+ else {
10485 -+ if (__data > INT_MAX)
10486 -+ __data = INT_MAX;
10487 -+ bfqd->bfq_max_budget = __data;
10488 -+ }
10489 -+
10490 -+ bfqd->bfq_user_max_budget = __data;
10491 -+
10492 -+ return ret;
10493 -+}
10494 -+
10495 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
10496 -+ const char *page, size_t count)
10497 -+{
10498 -+ struct bfq_data *bfqd = e->elevator_data;
10499 -+ unsigned long uninitialized_var(__data);
10500 -+ int ret = bfq_var_store(&__data, (page), count);
10501 -+
10502 -+ if (__data < 1)
10503 -+ __data = 1;
10504 -+ else if (__data > INT_MAX)
10505 -+ __data = INT_MAX;
10506 -+
10507 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
10508 -+ if (bfqd->bfq_user_max_budget == 0)
10509 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10510 -+
10511 -+ return ret;
10512 -+}
10513 -+
10514 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
10515 -+ const char *page, size_t count)
10516 -+{
10517 -+ struct bfq_data *bfqd = e->elevator_data;
10518 -+ unsigned long uninitialized_var(__data);
10519 -+ int ret = bfq_var_store(&__data, (page), count);
10520 -+
10521 -+ if (__data > 1)
10522 -+ __data = 1;
10523 -+ if (__data == 0 && bfqd->low_latency != 0)
10524 -+ bfq_end_raising(bfqd);
10525 -+ bfqd->low_latency = __data;
10526 -+
10527 -+ return ret;
10528 -+}
10529 -+
10530 -+#define BFQ_ATTR(name) \
10531 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
10532 -+
10533 -+static struct elv_fs_entry bfq_attrs[] = {
10534 -+ BFQ_ATTR(quantum),
10535 -+ BFQ_ATTR(fifo_expire_sync),
10536 -+ BFQ_ATTR(fifo_expire_async),
10537 -+ BFQ_ATTR(back_seek_max),
10538 -+ BFQ_ATTR(back_seek_penalty),
10539 -+ BFQ_ATTR(slice_idle),
10540 -+ BFQ_ATTR(max_budget),
10541 -+ BFQ_ATTR(max_budget_async_rq),
10542 -+ BFQ_ATTR(timeout_sync),
10543 -+ BFQ_ATTR(timeout_async),
10544 -+ BFQ_ATTR(low_latency),
10545 -+ BFQ_ATTR(raising_coeff),
10546 -+ BFQ_ATTR(raising_max_time),
10547 -+ BFQ_ATTR(raising_rt_max_time),
10548 -+ BFQ_ATTR(raising_min_idle_time),
10549 -+ BFQ_ATTR(raising_min_inter_arr_async),
10550 -+ BFQ_ATTR(raising_max_softrt_rate),
10551 -+ BFQ_ATTR(weights),
10552 -+ __ATTR_NULL
10553 -+};
10554 -+
10555 -+static struct elevator_type iosched_bfq = {
10556 -+ .ops = {
10557 -+ .elevator_merge_fn = bfq_merge,
10558 -+ .elevator_merged_fn = bfq_merged_request,
10559 -+ .elevator_merge_req_fn = bfq_merged_requests,
10560 -+ .elevator_allow_merge_fn = bfq_allow_merge,
10561 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
10562 -+ .elevator_add_req_fn = bfq_insert_request,
10563 -+ .elevator_activate_req_fn = bfq_activate_request,
10564 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
10565 -+ .elevator_completed_req_fn = bfq_completed_request,
10566 -+ .elevator_former_req_fn = elv_rb_former_request,
10567 -+ .elevator_latter_req_fn = elv_rb_latter_request,
10568 -+ .elevator_init_icq_fn = bfq_init_icq,
10569 -+ .elevator_exit_icq_fn = bfq_exit_icq,
10570 -+ .elevator_set_req_fn = bfq_set_request,
10571 -+ .elevator_put_req_fn = bfq_put_request,
10572 -+ .elevator_may_queue_fn = bfq_may_queue,
10573 -+ .elevator_init_fn = bfq_init_queue,
10574 -+ .elevator_exit_fn = bfq_exit_queue,
10575 -+ },
10576 -+ .icq_size = sizeof(struct bfq_io_cq),
10577 -+ .icq_align = __alignof__(struct bfq_io_cq),
10578 -+ .elevator_attrs = bfq_attrs,
10579 -+ .elevator_name = "bfq",
10580 -+ .elevator_owner = THIS_MODULE,
10581 -+};
10582 -+
10583 -+static int __init bfq_init(void)
10584 -+{
10585 -+ /*
10586 -+ * Can be 0 on HZ < 1000 setups.
10587 -+ */
10588 -+ if (bfq_slice_idle == 0)
10589 -+ bfq_slice_idle = 1;
10590 -+
10591 -+ if (bfq_timeout_async == 0)
10592 -+ bfq_timeout_async = 1;
10593 -+
10594 -+ if (bfq_slab_setup())
10595 -+ return -ENOMEM;
10596 -+
10597 -+ elv_register(&iosched_bfq);
10598 -+ printk(KERN_INFO "BFQ I/O-scheduler version: v7r1");
10599 -+
10600 -+ return 0;
10601 -+}
10602 -+
10603 -+static void __exit bfq_exit(void)
10604 -+{
10605 -+ elv_unregister(&iosched_bfq);
10606 -+ bfq_slab_kill();
10607 -+}
10608 -+
10609 -+module_init(bfq_init);
10610 -+module_exit(bfq_exit);
10611 -+
10612 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
10613 -+MODULE_LICENSE("GPL");
10614 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
10615 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
10616 -new file mode 100644
10617 -index 0000000..999b475
10618 ---- /dev/null
10619 -+++ b/block/bfq-sched.c
10620 -@@ -0,0 +1,1078 @@
10621 -+/*
10622 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
10623 -+ *
10624 -+ * Based on ideas and code from CFQ:
10625 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
10626 -+ *
10627 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
10628 -+ * Paolo Valente <paolo.valente@×××××××.it>
10629 -+ *
10630 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
10631 -+ */
10632 -+
10633 -+#ifdef CONFIG_CGROUP_BFQIO
10634 -+#define for_each_entity(entity) \
10635 -+ for (; entity != NULL; entity = entity->parent)
10636 -+
10637 -+#define for_each_entity_safe(entity, parent) \
10638 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
10639 -+
10640 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
10641 -+ int extract,
10642 -+ struct bfq_data *bfqd);
10643 -+
10644 -+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10645 -+{
10646 -+ struct bfq_entity *bfqg_entity;
10647 -+ struct bfq_group *bfqg;
10648 -+ struct bfq_sched_data *group_sd;
10649 -+
10650 -+ BUG_ON(next_in_service == NULL);
10651 -+
10652 -+ group_sd = next_in_service->sched_data;
10653 -+
10654 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
10655 -+ /*
10656 -+ * bfq_group's my_entity field is not NULL only if the group
10657 -+ * is not the root group. We must not touch the root entity
10658 -+ * as it must never become an in-service entity.
10659 -+ */
10660 -+ bfqg_entity = bfqg->my_entity;
10661 -+ if (bfqg_entity != NULL)
10662 -+ bfqg_entity->budget = next_in_service->budget;
10663 -+}
10664 -+
10665 -+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
10666 -+{
10667 -+ struct bfq_entity *next_in_service;
10668 -+
10669 -+ if (sd->in_service_entity != NULL)
10670 -+ /* will update/requeue at the end of service */
10671 -+ return 0;
10672 -+
10673 -+ /*
10674 -+ * NOTE: this can be improved in many ways, such as returning
10675 -+ * 1 (and thus propagating upwards the update) only when the
10676 -+ * budget changes, or caching the bfqq that will be scheduled
10677 -+ * next from this subtree. By now we worry more about
10678 -+ * correctness than about performance...
10679 -+ */
10680 -+ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
10681 -+ sd->next_in_service = next_in_service;
10682 -+
10683 -+ if (next_in_service != NULL)
10684 -+ bfq_update_budget(next_in_service);
10685 -+
10686 -+ return 1;
10687 -+}
10688 -+
10689 -+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10690 -+ struct bfq_entity *entity)
10691 -+{
10692 -+ BUG_ON(sd->next_in_service != entity);
10693 -+}
10694 -+#else
10695 -+#define for_each_entity(entity) \
10696 -+ for (; entity != NULL; entity = NULL)
10697 -+
10698 -+#define for_each_entity_safe(entity, parent) \
10699 -+ for (parent = NULL; entity != NULL; entity = parent)
10700 -+
10701 -+static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
10702 -+{
10703 -+ return 0;
10704 -+}
10705 -+
10706 -+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10707 -+ struct bfq_entity *entity)
10708 -+{
10709 -+}
10710 -+
10711 -+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10712 -+{
10713 -+}
10714 -+#endif
10715 -+
10716 -+/*
10717 -+ * Shift for timestamp calculations. This actually limits the maximum
10718 -+ * service allowed in one timestamp delta (small shift values increase it),
10719 -+ * the maximum total weight that can be used for the queues in the system
10720 -+ * (big shift values increase it), and the period of virtual time wraparounds.
10721 -+ */
10722 -+#define WFQ_SERVICE_SHIFT 22
10723 -+
10724 -+/**
10725 -+ * bfq_gt - compare two timestamps.
10726 -+ * @a: first ts.
10727 -+ * @b: second ts.
10728 -+ *
10729 -+ * Return @a > @b, dealing with wrapping correctly.
10730 -+ */
10731 -+static inline int bfq_gt(u64 a, u64 b)
10732 -+{
10733 -+ return (s64)(a - b) > 0;
10734 -+}
10735 -+
10736 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
10737 -+{
10738 -+ struct bfq_queue *bfqq = NULL;
10739 -+
10740 -+ BUG_ON(entity == NULL);
10741 -+
10742 -+ if (entity->my_sched_data == NULL)
10743 -+ bfqq = container_of(entity, struct bfq_queue, entity);
10744 -+
10745 -+ return bfqq;
10746 -+}
10747 -+
10748 -+
10749 -+/**
10750 -+ * bfq_delta - map service into the virtual time domain.
10751 -+ * @service: amount of service.
10752 -+ * @weight: scale factor (weight of an entity or weight sum).
10753 -+ */
10754 -+static inline u64 bfq_delta(unsigned long service,
10755 -+ unsigned long weight)
10756 -+{
10757 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
10758 -+
10759 -+ do_div(d, weight);
10760 -+ return d;
10761 -+}
10762 -+
10763 -+/**
10764 -+ * bfq_calc_finish - assign the finish time to an entity.
10765 -+ * @entity: the entity to act upon.
10766 -+ * @service: the service to be charged to the entity.
10767 -+ */
10768 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
10769 -+ unsigned long service)
10770 -+{
10771 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10772 -+
10773 -+ BUG_ON(entity->weight == 0);
10774 -+
10775 -+ entity->finish = entity->start +
10776 -+ bfq_delta(service, entity->weight);
10777 -+
10778 -+ if (bfqq != NULL) {
10779 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
10780 -+ "calc_finish: serv %lu, w %d",
10781 -+ service, entity->weight);
10782 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
10783 -+ "calc_finish: start %llu, finish %llu, delta %llu",
10784 -+ entity->start, entity->finish,
10785 -+ bfq_delta(service, entity->weight));
10786 -+ }
10787 -+}
10788 -+
10789 -+/**
10790 -+ * bfq_entity_of - get an entity from a node.
10791 -+ * @node: the node field of the entity.
10792 -+ *
10793 -+ * Convert a node pointer to the relative entity. This is used only
10794 -+ * to simplify the logic of some functions and not as the generic
10795 -+ * conversion mechanism because, e.g., in the tree walking functions,
10796 -+ * the check for a %NULL value would be redundant.
10797 -+ */
10798 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
10799 -+{
10800 -+ struct bfq_entity *entity = NULL;
10801 -+
10802 -+ if (node != NULL)
10803 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
10804 -+
10805 -+ return entity;
10806 -+}
10807 -+
10808 -+/**
10809 -+ * bfq_extract - remove an entity from a tree.
10810 -+ * @root: the tree root.
10811 -+ * @entity: the entity to remove.
10812 -+ */
10813 -+static inline void bfq_extract(struct rb_root *root,
10814 -+ struct bfq_entity *entity)
10815 -+{
10816 -+ BUG_ON(entity->tree != root);
10817 -+
10818 -+ entity->tree = NULL;
10819 -+ rb_erase(&entity->rb_node, root);
10820 -+}
10821 -+
10822 -+/**
10823 -+ * bfq_idle_extract - extract an entity from the idle tree.
10824 -+ * @st: the service tree of the owning @entity.
10825 -+ * @entity: the entity being removed.
10826 -+ */
10827 -+static void bfq_idle_extract(struct bfq_service_tree *st,
10828 -+ struct bfq_entity *entity)
10829 -+{
10830 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10831 -+ struct rb_node *next;
10832 -+
10833 -+ BUG_ON(entity->tree != &st->idle);
10834 -+
10835 -+ if (entity == st->first_idle) {
10836 -+ next = rb_next(&entity->rb_node);
10837 -+ st->first_idle = bfq_entity_of(next);
10838 -+ }
10839 -+
10840 -+ if (entity == st->last_idle) {
10841 -+ next = rb_prev(&entity->rb_node);
10842 -+ st->last_idle = bfq_entity_of(next);
10843 -+ }
10844 -+
10845 -+ bfq_extract(&st->idle, entity);
10846 -+
10847 -+ if (bfqq != NULL)
10848 -+ list_del(&bfqq->bfqq_list);
10849 -+}
10850 -+
10851 -+/**
10852 -+ * bfq_insert - generic tree insertion.
10853 -+ * @root: tree root.
10854 -+ * @entity: entity to insert.
10855 -+ *
10856 -+ * This is used for the idle and the active tree, since they are both
10857 -+ * ordered by finish time.
10858 -+ */
10859 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
10860 -+{
10861 -+ struct bfq_entity *entry;
10862 -+ struct rb_node **node = &root->rb_node;
10863 -+ struct rb_node *parent = NULL;
10864 -+
10865 -+ BUG_ON(entity->tree != NULL);
10866 -+
10867 -+ while (*node != NULL) {
10868 -+ parent = *node;
10869 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
10870 -+
10871 -+ if (bfq_gt(entry->finish, entity->finish))
10872 -+ node = &parent->rb_left;
10873 -+ else
10874 -+ node = &parent->rb_right;
10875 -+ }
10876 -+
10877 -+ rb_link_node(&entity->rb_node, parent, node);
10878 -+ rb_insert_color(&entity->rb_node, root);
10879 -+
10880 -+ entity->tree = root;
10881 -+}
10882 -+
10883 -+/**
10884 -+ * bfq_update_min - update the min_start field of an entity.
10885 -+ * @entity: the entity to update.
10886 -+ * @node: one of its children.
10887 -+ *
10888 -+ * This function is called when @entity may store an invalid value for
10889 -+ * min_start due to updates to the active tree. The function assumes
10890 -+ * that the subtree rooted at @node (which may be its left or its right
10891 -+ * child) has a valid min_start value.
10892 -+ */
10893 -+static inline void bfq_update_min(struct bfq_entity *entity,
10894 -+ struct rb_node *node)
10895 -+{
10896 -+ struct bfq_entity *child;
10897 -+
10898 -+ if (node != NULL) {
10899 -+ child = rb_entry(node, struct bfq_entity, rb_node);
10900 -+ if (bfq_gt(entity->min_start, child->min_start))
10901 -+ entity->min_start = child->min_start;
10902 -+ }
10903 -+}
10904 -+
10905 -+/**
10906 -+ * bfq_update_active_node - recalculate min_start.
10907 -+ * @node: the node to update.
10908 -+ *
10909 -+ * @node may have changed position or one of its children may have moved,
10910 -+ * this function updates its min_start value. The left and right subtrees
10911 -+ * are assumed to hold a correct min_start value.
10912 -+ */
10913 -+static inline void bfq_update_active_node(struct rb_node *node)
10914 -+{
10915 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
10916 -+
10917 -+ entity->min_start = entity->start;
10918 -+ bfq_update_min(entity, node->rb_right);
10919 -+ bfq_update_min(entity, node->rb_left);
10920 -+}
10921 -+
10922 -+/**
10923 -+ * bfq_update_active_tree - update min_start for the whole active tree.
10924 -+ * @node: the starting node.
10925 -+ *
10926 -+ * @node must be the deepest modified node after an update. This function
10927 -+ * updates its min_start using the values held by its children, assuming
10928 -+ * that they did not change, and then updates all the nodes that may have
10929 -+ * changed in the path to the root. The only nodes that may have changed
10930 -+ * are the ones in the path or their siblings.
10931 -+ */
10932 -+static void bfq_update_active_tree(struct rb_node *node)
10933 -+{
10934 -+ struct rb_node *parent;
10935 -+
10936 -+up:
10937 -+ bfq_update_active_node(node);
10938 -+
10939 -+ parent = rb_parent(node);
10940 -+ if (parent == NULL)
10941 -+ return;
10942 -+
10943 -+ if (node == parent->rb_left && parent->rb_right != NULL)
10944 -+ bfq_update_active_node(parent->rb_right);
10945 -+ else if (parent->rb_left != NULL)
10946 -+ bfq_update_active_node(parent->rb_left);
10947 -+
10948 -+ node = parent;
10949 -+ goto up;
10950 -+}
10951 -+
10952 -+/**
10953 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
10954 -+ * @st: the service tree of the entity.
10955 -+ * @entity: the entity being inserted.
10956 -+ *
10957 -+ * The active tree is ordered by finish time, but an extra key is kept
10958 -+ * for each node, containing the minimum value for the start times of
10959 -+ * its children (and the node itself), so it's possible to search for
10960 -+ * the eligible node with the lowest finish time in logarithmic time.
10961 -+ */
10962 -+static void bfq_active_insert(struct bfq_service_tree *st,
10963 -+ struct bfq_entity *entity)
10964 -+{
10965 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10966 -+ struct rb_node *node = &entity->rb_node;
10967 -+
10968 -+ bfq_insert(&st->active, entity);
10969 -+
10970 -+ if (node->rb_left != NULL)
10971 -+ node = node->rb_left;
10972 -+ else if (node->rb_right != NULL)
10973 -+ node = node->rb_right;
10974 -+
10975 -+ bfq_update_active_tree(node);
10976 -+
10977 -+ if (bfqq != NULL)
10978 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
10979 -+}
10980 -+
10981 -+/**
10982 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
10983 -+ * @ioprio: the ioprio value to convert.
10984 -+ */
10985 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
10986 -+{
10987 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
10988 -+ return IOPRIO_BE_NR - ioprio;
10989 -+}
10990 -+
10991 -+/**
10992 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
10993 -+ * @weight: the weight value to convert.
10994 -+ *
10995 -+ * To preserve as much as possible the old only-ioprio user interface,
10996 -+ * 0 is used as an escape ioprio value for weights (numerically) equal to or
10997 -+ * larger than IOPRIO_BE_NR
10998 -+ */
10999 -+static unsigned short bfq_weight_to_ioprio(int weight)
11000 -+{
11001 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
11002 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
11003 -+}
11004 -+
11005 -+static inline void bfq_get_entity(struct bfq_entity *entity)
11006 -+{
11007 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11008 -+ struct bfq_sched_data *sd;
11009 -+
11010 -+ if (bfqq != NULL) {
11011 -+ sd = entity->sched_data;
11012 -+ atomic_inc(&bfqq->ref);
11013 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
11014 -+ bfqq, atomic_read(&bfqq->ref));
11015 -+ }
11016 -+}
11017 -+
11018 -+/**
11019 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
11020 -+ * @node: the node being removed.
11021 -+ *
11022 -+ * Do the first step of an extraction in an rb tree, looking for the
11023 -+ * node that will replace @node, and returning the deepest node that
11024 -+ * the following modifications to the tree can touch. If @node is the
11025 -+ * last node in the tree return %NULL.
11026 -+ */
11027 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
11028 -+{
11029 -+ struct rb_node *deepest;
11030 -+
11031 -+ if (node->rb_right == NULL && node->rb_left == NULL)
11032 -+ deepest = rb_parent(node);
11033 -+ else if (node->rb_right == NULL)
11034 -+ deepest = node->rb_left;
11035 -+ else if (node->rb_left == NULL)
11036 -+ deepest = node->rb_right;
11037 -+ else {
11038 -+ deepest = rb_next(node);
11039 -+ if (deepest->rb_right != NULL)
11040 -+ deepest = deepest->rb_right;
11041 -+ else if (rb_parent(deepest) != node)
11042 -+ deepest = rb_parent(deepest);
11043 -+ }
11044 -+
11045 -+ return deepest;
11046 -+}
11047 -+
11048 -+/**
11049 -+ * bfq_active_extract - remove an entity from the active tree.
11050 -+ * @st: the service_tree containing the tree.
11051 -+ * @entity: the entity being removed.
11052 -+ */
11053 -+static void bfq_active_extract(struct bfq_service_tree *st,
11054 -+ struct bfq_entity *entity)
11055 -+{
11056 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11057 -+ struct rb_node *node;
11058 -+
11059 -+ node = bfq_find_deepest(&entity->rb_node);
11060 -+ bfq_extract(&st->active, entity);
11061 -+
11062 -+ if (node != NULL)
11063 -+ bfq_update_active_tree(node);
11064 -+
11065 -+ if (bfqq != NULL)
11066 -+ list_del(&bfqq->bfqq_list);
11067 -+}
11068 -+
11069 -+/**
11070 -+ * bfq_idle_insert - insert an entity into the idle tree.
11071 -+ * @st: the service tree containing the tree.
11072 -+ * @entity: the entity to insert.
11073 -+ */
11074 -+static void bfq_idle_insert(struct bfq_service_tree *st,
11075 -+ struct bfq_entity *entity)
11076 -+{
11077 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11078 -+ struct bfq_entity *first_idle = st->first_idle;
11079 -+ struct bfq_entity *last_idle = st->last_idle;
11080 -+
11081 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
11082 -+ st->first_idle = entity;
11083 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
11084 -+ st->last_idle = entity;
11085 -+
11086 -+ bfq_insert(&st->idle, entity);
11087 -+
11088 -+ if (bfqq != NULL)
11089 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
11090 -+}
11091 -+
11092 -+/**
11093 -+ * bfq_forget_entity - remove an entity from the wfq trees.
11094 -+ * @st: the service tree.
11095 -+ * @entity: the entity being removed.
11096 -+ *
11097 -+ * Update the device status and forget everything about @entity, putting
11098 -+ * the device reference to it, if it is a queue. Entities belonging to
11099 -+ * groups are not refcounted.
11100 -+ */
11101 -+static void bfq_forget_entity(struct bfq_service_tree *st,
11102 -+ struct bfq_entity *entity)
11103 -+{
11104 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11105 -+ struct bfq_sched_data *sd;
11106 -+
11107 -+ BUG_ON(!entity->on_st);
11108 -+
11109 -+ entity->on_st = 0;
11110 -+ st->wsum -= entity->weight;
11111 -+ if (bfqq != NULL) {
11112 -+ sd = entity->sched_data;
11113 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
11114 -+ bfqq, atomic_read(&bfqq->ref));
11115 -+ bfq_put_queue(bfqq);
11116 -+ }
11117 -+}
11118 -+
11119 -+/**
11120 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
11121 -+ * @st: service tree for the entity.
11122 -+ * @entity: the entity being released.
11123 -+ */
11124 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
11125 -+ struct bfq_entity *entity)
11126 -+{
11127 -+ bfq_idle_extract(st, entity);
11128 -+ bfq_forget_entity(st, entity);
11129 -+}
11130 -+
11131 -+/**
11132 -+ * bfq_forget_idle - update the idle tree if necessary.
11133 -+ * @st: the service tree to act upon.
11134 -+ *
11135 -+ * To preserve the global O(log N) complexity we only remove one entry here;
11136 -+ * as the idle tree will not grow indefinitely this can be done safely.
11137 -+ */
11138 -+static void bfq_forget_idle(struct bfq_service_tree *st)
11139 -+{
11140 -+ struct bfq_entity *first_idle = st->first_idle;
11141 -+ struct bfq_entity *last_idle = st->last_idle;
11142 -+
11143 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
11144 -+ !bfq_gt(last_idle->finish, st->vtime)) {
11145 -+ /*
11146 -+ * Forget the whole idle tree, increasing the vtime past
11147 -+ * the last finish time of idle entities.
11148 -+ */
11149 -+ st->vtime = last_idle->finish;
11150 -+ }
11151 -+
11152 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
11153 -+ bfq_put_idle_entity(st, first_idle);
11154 -+}
11155 -+
11156 -+static struct bfq_service_tree *
11157 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
11158 -+ struct bfq_entity *entity)
11159 -+{
11160 -+ struct bfq_service_tree *new_st = old_st;
11161 -+
11162 -+ if (entity->ioprio_changed) {
11163 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11164 -+
11165 -+ BUG_ON(old_st->wsum < entity->weight);
11166 -+ old_st->wsum -= entity->weight;
11167 -+
11168 -+ if (entity->new_weight != entity->orig_weight) {
11169 -+ entity->orig_weight = entity->new_weight;
11170 -+ entity->ioprio =
11171 -+ bfq_weight_to_ioprio(entity->orig_weight);
11172 -+ } else if (entity->new_ioprio != entity->ioprio) {
11173 -+ entity->ioprio = entity->new_ioprio;
11174 -+ entity->orig_weight =
11175 -+ bfq_ioprio_to_weight(entity->ioprio);
11176 -+ } else
11177 -+ entity->new_weight = entity->orig_weight =
11178 -+ bfq_ioprio_to_weight(entity->ioprio);
11179 -+
11180 -+ entity->ioprio_class = entity->new_ioprio_class;
11181 -+ entity->ioprio_changed = 0;
11182 -+
11183 -+ /*
11184 -+ * NOTE: here we may be changing the weight too early,
11185 -+ * this will cause unfairness. The correct approach
11186 -+ * would have required additional complexity to defer
11187 -+ * weight changes to the proper time instants (i.e.,
11188 -+ * when entity->finish <= old_st->vtime).
11189 -+ */
11190 -+ new_st = bfq_entity_service_tree(entity);
11191 -+ entity->weight = entity->orig_weight *
11192 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
11193 -+ new_st->wsum += entity->weight;
11194 -+
11195 -+ if (new_st != old_st)
11196 -+ entity->start = new_st->vtime;
11197 -+ }
11198 -+
11199 -+ return new_st;
11200 -+}
11201 -+
11202 -+/**
11203 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
11204 -+ * @bfqq: the queue being served.
11205 -+ * @served: bytes to transfer.
11206 -+ *
11207 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
11208 -+ * are synchronized every time a new bfqq is selected for service. By now,
11209 -+ * we keep it to better check consistency.
11210 -+ */
11211 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
11212 -+{
11213 -+ struct bfq_entity *entity = &bfqq->entity;
11214 -+ struct bfq_service_tree *st;
11215 -+
11216 -+ for_each_entity(entity) {
11217 -+ st = bfq_entity_service_tree(entity);
11218 -+
11219 -+ entity->service += served;
11220 -+ BUG_ON(entity->service > entity->budget);
11221 -+ BUG_ON(st->wsum == 0);
11222 -+
11223 -+ st->vtime += bfq_delta(served, st->wsum);
11224 -+ bfq_forget_idle(st);
11225 -+ }
11226 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
11227 -+}
11228 -+
11229 -+/**
11230 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
11231 -+ * @bfqq: the queue that needs a service update.
11232 -+ *
11233 -+ * When it's not possible to be fair in the service domain, because
11234 -+ * a queue is not consuming its budget fast enough (the meaning of
11235 -+ * fast depends on the timeout parameter), we charge it a full
11236 -+ * budget. In this way we should obtain a sort of time-domain
11237 -+ * fairness among all the seeky/slow queues.
11238 -+ */
11239 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
11240 -+{
11241 -+ struct bfq_entity *entity = &bfqq->entity;
11242 -+
11243 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
11244 -+
11245 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
11246 -+}
11247 -+
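For illustration (numbers purely hypothetical): if a seeky queue was granted a 16384-sector budget but has received only 1024 sectors of service when its timeout fires, bfq_bfqq_charge_full_budget() feeds the remaining 15360 sectors to bfq_bfqq_served(). The queue is then accounted exactly as if it had consumed its whole budget, so its next timestamps are computed over the full budget and its next selection is delayed accordingly; that delay is the time-domain fairness described in the comment above.
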
11248 -+/**
11249 -+ * __bfq_activate_entity - activate an entity.
11250 -+ * @entity: the entity being activated.
11251 -+ *
11252 -+ * Called whenever an entity is activated, i.e., it is not active and one
11253 -+ * of its children receives a new request, or has to be reactivated due to
11254 -+ * budget exhaustion. It uses the current budget of the entity (and the
11255 -+ * service received if @entity is active) of the queue to calculate its
11256 -+ * timestamps.
11257 -+ */
11258 -+static void __bfq_activate_entity(struct bfq_entity *entity)
11259 -+{
11260 -+ struct bfq_sched_data *sd = entity->sched_data;
11261 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11262 -+
11263 -+ if (entity == sd->in_service_entity) {
11264 -+ BUG_ON(entity->tree != NULL);
11265 -+ /*
11266 -+ * If we are requeueing the current entity we have
11267 -+ * to take care of not charging to it service it has
11268 -+ * not received.
11269 -+ */
11270 -+ bfq_calc_finish(entity, entity->service);
11271 -+ entity->start = entity->finish;
11272 -+ sd->in_service_entity = NULL;
11273 -+ } else if (entity->tree == &st->active) {
11274 -+ /*
11275 -+ * Requeueing an entity due to a change of some
11276 -+ * next_in_service entity below it. We reuse the
11277 -+ * old start time.
11278 -+ */
11279 -+ bfq_active_extract(st, entity);
11280 -+ } else if (entity->tree == &st->idle) {
11281 -+ /*
11282 -+ * Must be on the idle tree, bfq_idle_extract() will
11283 -+ * check for that.
11284 -+ */
11285 -+ bfq_idle_extract(st, entity);
11286 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
11287 -+ st->vtime : entity->finish;
11288 -+ } else {
11289 -+ /*
11290 -+ * The finish time of the entity may be invalid, and
11291 -+ * it is in the past for sure, otherwise the queue
11292 -+ * would have been on the idle tree.
11293 -+ */
11294 -+ entity->start = st->vtime;
11295 -+ st->wsum += entity->weight;
11296 -+ bfq_get_entity(entity);
11297 -+
11298 -+ BUG_ON(entity->on_st);
11299 -+ entity->on_st = 1;
11300 -+ }
11301 -+
11302 -+ st = __bfq_entity_update_weight_prio(st, entity);
11303 -+ bfq_calc_finish(entity, entity->budget);
11304 -+ bfq_active_insert(st, entity);
11305 -+}
11306 -+
11307 -+/**
11308 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11309 -+ * @entity: the entity to activate.
11310 -+ *
11311 -+ * Activate @entity and all the entities on the path from it to the root.
11312 -+ */
11313 -+static void bfq_activate_entity(struct bfq_entity *entity)
11314 -+{
11315 -+ struct bfq_sched_data *sd;
11316 -+
11317 -+ for_each_entity(entity) {
11318 -+ __bfq_activate_entity(entity);
11319 -+
11320 -+ sd = entity->sched_data;
11321 -+ if (!bfq_update_next_in_service(sd))
11322 -+ /*
11323 -+ * No need to propagate the activation to the
11324 -+ * upper entities, as they will be updated when
11325 -+ * the in-service entity is rescheduled.
11326 -+ */
11327 -+ break;
11328 -+ }
11329 -+}
11330 -+
11331 -+/**
11332 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11333 -+ * @entity: the entity to deactivate.
11334 -+ * @requeue: if false, the entity will not be put into the idle tree.
11335 -+ *
11336 -+ * Deactivate an entity, independently from its previous state. If the
11337 -+ * entity was not on a service tree just return, otherwise if it is on
11338 -+ * any scheduler tree, extract it from that tree, and if necessary
11339 -+ * and if the caller specified @requeue, put it on the idle tree.
11340 -+ *
11341 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
11342 -+ * if the entity was under service or if it was the next_in_service for
11343 -+ * its sched_data; return %0 otherwise.
11344 -+ */
11345 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11346 -+{
11347 -+ struct bfq_sched_data *sd = entity->sched_data;
11348 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11349 -+ int was_in_service = entity == sd->in_service_entity;
11350 -+ int ret = 0;
11351 -+
11352 -+ if (!entity->on_st)
11353 -+ return 0;
11354 -+
11355 -+ BUG_ON(was_in_service && entity->tree != NULL);
11356 -+
11357 -+ if (was_in_service) {
11358 -+ bfq_calc_finish(entity, entity->service);
11359 -+ sd->in_service_entity = NULL;
11360 -+ } else if (entity->tree == &st->active)
11361 -+ bfq_active_extract(st, entity);
11362 -+ else if (entity->tree == &st->idle)
11363 -+ bfq_idle_extract(st, entity);
11364 -+ else if (entity->tree != NULL)
11365 -+ BUG();
11366 -+
11367 -+ if (was_in_service || sd->next_in_service == entity)
11368 -+ ret = bfq_update_next_in_service(sd);
11369 -+
11370 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
11371 -+ bfq_forget_entity(st, entity);
11372 -+ else
11373 -+ bfq_idle_insert(st, entity);
11374 -+
11375 -+ BUG_ON(sd->in_service_entity == entity);
11376 -+ BUG_ON(sd->next_in_service == entity);
11377 -+
11378 -+ return ret;
11379 -+}
11380 -+
11381 -+/**
11382 -+ * bfq_deactivate_entity - deactivate an entity.
11383 -+ * @entity: the entity to deactivate.
11384 -+ * @requeue: true if the entity can be put on the idle tree
11385 -+ */
11386 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11387 -+{
11388 -+ struct bfq_sched_data *sd;
11389 -+ struct bfq_entity *parent;
11390 -+
11391 -+ for_each_entity_safe(entity, parent) {
11392 -+ sd = entity->sched_data;
11393 -+
11394 -+ if (!__bfq_deactivate_entity(entity, requeue))
11395 -+ /*
11396 -+ * The parent entity is still backlogged, and
11397 -+ * we don't need to update it as it is still
11398 -+ * under service.
11399 -+ */
11400 -+ break;
11401 -+
11402 -+ if (sd->next_in_service != NULL)
11403 -+ /*
11404 -+ * The parent entity is still backlogged and
11405 -+ * the budgets on the path towards the root
11406 -+ * need to be updated.
11407 -+ */
11408 -+ goto update;
11409 -+
11410 -+ /*
11411 -+ * If we reach this point the parent is no longer backlogged and
11412 -+ * we want to propagate the dequeue upwards.
11413 -+ */
11414 -+ requeue = 1;
11415 -+ }
11416 -+
11417 -+ return;
11418 -+
11419 -+update:
11420 -+ entity = parent;
11421 -+ for_each_entity(entity) {
11422 -+ __bfq_activate_entity(entity);
11423 -+
11424 -+ sd = entity->sched_data;
11425 -+ if (!bfq_update_next_in_service(sd))
11426 -+ break;
11427 -+ }
11428 -+}
11429 -+
11430 -+/**
11431 -+ * bfq_update_vtime - update vtime if necessary.
11432 -+ * @st: the service tree to act upon.
11433 -+ *
11434 -+ * If necessary update the service tree vtime to have at least one
11435 -+ * eligible entity, skipping to its start time. Assumes that the
11436 -+ * active tree of the device is not empty.
11437 -+ *
11438 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
11439 -+ * we may end up with reactivated tasks getting timestamps after a
11440 -+ * vtime skip done because we needed a ->first_active entity on some
11441 -+ * intermediate node.
11442 -+ */
11443 -+static void bfq_update_vtime(struct bfq_service_tree *st)
11444 -+{
11445 -+ struct bfq_entity *entry;
11446 -+ struct rb_node *node = st->active.rb_node;
11447 -+
11448 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
11449 -+ if (bfq_gt(entry->min_start, st->vtime)) {
11450 -+ st->vtime = entry->min_start;
11451 -+ bfq_forget_idle(st);
11452 -+ }
11453 -+}
11454 -+
11455 -+/**
11456 -+ * bfq_first_active_entity - find the eligible entity with
11457 -+ * the smallest finish time
11458 -+ * @st: the service tree to select from.
11459 -+ *
11460 -+ * This function searches the first schedulable entity, starting from the
11461 -+ * root of the tree and going on the left every time on this side there is
11462 -+ * a subtree with at least one eligible (start <= vtime) entity. The path
11463 -+ * on the right is followed only if a) the left subtree contains no eligible
11464 -+ * entities and b) no eligible entity has been found yet.
11465 -+ */
11466 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
11467 -+{
11468 -+ struct bfq_entity *entry, *first = NULL;
11469 -+ struct rb_node *node = st->active.rb_node;
11470 -+
11471 -+ while (node != NULL) {
11472 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
11473 -+left:
11474 -+ if (!bfq_gt(entry->start, st->vtime))
11475 -+ first = entry;
11476 -+
11477 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
11478 -+
11479 -+ if (node->rb_left != NULL) {
11480 -+ entry = rb_entry(node->rb_left,
11481 -+ struct bfq_entity, rb_node);
11482 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
11483 -+ node = node->rb_left;
11484 -+ goto left;
11485 -+ }
11486 -+ }
11487 -+ if (first != NULL)
11488 -+ break;
11489 -+ node = node->rb_right;
11490 -+ }
11491 -+
11492 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
11493 -+ return first;
11494 -+}
11495 -+
11496 -+/**
11497 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
11498 -+ * @st: the service tree.
11499 -+ *
11500 -+ * Update the virtual time in @st and return the first eligible entity
11501 -+ * it contains.
11502 -+ */
11503 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
11504 -+ bool force)
11505 -+{
11506 -+ struct bfq_entity *entity, *new_next_in_service = NULL;
11507 -+
11508 -+ if (RB_EMPTY_ROOT(&st->active))
11509 -+ return NULL;
11510 -+
11511 -+ bfq_update_vtime(st);
11512 -+ entity = bfq_first_active_entity(st);
11513 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
11514 -+
11515 -+ /*
11516 -+ * If the chosen entity does not match with the sched_data's
11517 -+ * next_in_service and we are forcedly serving the IDLE priority
11518 -+ * class tree, bubble up budget update.
11519 -+ */
11520 -+ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
11521 -+ new_next_in_service = entity;
11522 -+ for_each_entity(new_next_in_service)
11523 -+ bfq_update_budget(new_next_in_service);
11524 -+ }
11525 -+
11526 -+ return entity;
11527 -+}
11528 -+
11529 -+/**
11530 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
11531 -+ * @sd: the sched_data.
11532 -+ * @extract: if true the returned entity will be also extracted from @sd.
11533 -+ *
11534 -+ * NOTE: since we cache the next_in_service entity at each level of the
11535 -+ * hierarchy, the complexity of the lookup can be decreased with
11536 -+ * absolutely no effort just returning the cached next_in_service value;
11537 -+ * we prefer to do full lookups to test the consistency of the data
11538 -+ * structures.
11539 -+ */
11540 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11541 -+ int extract,
11542 -+ struct bfq_data *bfqd)
11543 -+{
11544 -+ struct bfq_service_tree *st = sd->service_tree;
11545 -+ struct bfq_entity *entity;
11546 -+ int i = 0;
11547 -+
11548 -+ BUG_ON(sd->in_service_entity != NULL);
11549 -+
11550 -+ if (bfqd != NULL &&
11551 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
11552 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
11553 -+ true);
11554 -+ if (entity != NULL) {
11555 -+ i = BFQ_IOPRIO_CLASSES - 1;
11556 -+ bfqd->bfq_class_idle_last_service = jiffies;
11557 -+ sd->next_in_service = entity;
11558 -+ }
11559 -+ }
11560 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
11561 -+ entity = __bfq_lookup_next_entity(st + i, false);
11562 -+ if (entity != NULL) {
11563 -+ if (extract) {
11564 -+ bfq_check_next_in_service(sd, entity);
11565 -+ bfq_active_extract(st + i, entity);
11566 -+ sd->in_service_entity = entity;
11567 -+ sd->next_in_service = NULL;
11568 -+ }
11569 -+ break;
11570 -+ }
11571 -+ }
11572 -+
11573 -+ return entity;
11574 -+}
11575 -+
11576 -+/*
11577 -+ * Get next queue for service.
11578 -+ */
11579 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
11580 -+{
11581 -+ struct bfq_entity *entity = NULL;
11582 -+ struct bfq_sched_data *sd;
11583 -+ struct bfq_queue *bfqq;
11584 -+
11585 -+ BUG_ON(bfqd->in_service_queue != NULL);
11586 -+
11587 -+ if (bfqd->busy_queues == 0)
11588 -+ return NULL;
11589 -+
11590 -+ sd = &bfqd->root_group->sched_data;
11591 -+ for (; sd != NULL; sd = entity->my_sched_data) {
11592 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
11593 -+ BUG_ON(entity == NULL);
11594 -+ entity->service = 0;
11595 -+ }
11596 -+
11597 -+ bfqq = bfq_entity_to_bfqq(entity);
11598 -+ BUG_ON(bfqq == NULL);
11599 -+
11600 -+ return bfqq;
11601 -+}
11602 -+
11603 -+/*
11604 -+ * Forced extraction of the given queue.
11605 -+ */
11606 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
11607 -+ struct bfq_queue *bfqq)
11608 -+{
11609 -+ struct bfq_entity *entity;
11610 -+ struct bfq_sched_data *sd;
11611 -+
11612 -+ BUG_ON(bfqd->in_service_queue != NULL);
11613 -+
11614 -+ entity = &bfqq->entity;
11615 -+ /*
11616 -+ * Bubble up extraction/update from the leaf to the root.
11617 -+ */
11618 -+ for_each_entity(entity) {
11619 -+ sd = entity->sched_data;
11620 -+ bfq_update_budget(entity);
11621 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
11622 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
11623 -+ sd->active_entity = entity;
11624 -+ sd->next_active = NULL;
11625 -+ entity->service = 0;
11626 -+ }
11627 -+
11628 -+ return;
11629 -+}
11630 -+
11631 -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
11632 -+{
11633 -+ if (bfqd->in_service_bic != NULL) {
11634 -+ put_io_context(bfqd->in_service_bic->icq.ioc);
11635 -+ bfqd->in_service_bic = NULL;
11636 -+ }
11637 -+
11638 -+ bfqd->in_service_queue = NULL;
11639 -+ del_timer(&bfqd->idle_slice_timer);
11640 -+}
11641 -+
11642 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11643 -+ int requeue)
11644 -+{
11645 -+ struct bfq_entity *entity = &bfqq->entity;
11646 -+
11647 -+ if (bfqq == bfqd->in_service_queue)
11648 -+ __bfq_bfqd_reset_in_service(bfqd);
11649 -+
11650 -+ bfq_deactivate_entity(entity, requeue);
11651 -+}
11652 -+
11653 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11654 -+{
11655 -+ struct bfq_entity *entity = &bfqq->entity;
11656 -+
11657 -+ bfq_activate_entity(entity);
11658 -+}
11659 -+
11660 -+/*
11661 -+ * Called when the bfqq no longer has requests pending, remove it from
11662 -+ * the service tree.
11663 -+ */
11664 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11665 -+ int requeue)
11666 -+{
11667 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
11668 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
11669 -+
11670 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
11671 -+
11672 -+ bfq_clear_bfqq_busy(bfqq);
11673 -+
11674 -+ BUG_ON(bfqd->busy_queues == 0);
11675 -+ bfqd->busy_queues--;
11676 -+ if (bfqq->raising_coeff > 1)
11677 -+ bfqd->raised_busy_queues--;
11678 -+
11679 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
11680 -+}
11681 -+
11682 -+/*
11683 -+ * Called when an inactive queue receives a new request.
11684 -+ */
11685 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11686 -+{
11687 -+ BUG_ON(bfq_bfqq_busy(bfqq));
11688 -+ BUG_ON(bfqq == bfqd->in_service_queue);
11689 -+
11690 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
11691 -+
11692 -+ bfq_activate_bfqq(bfqd, bfqq);
11693 -+
11694 -+ bfq_mark_bfqq_busy(bfqq);
11695 -+ bfqd->busy_queues++;
11696 -+ if (bfqq->raising_coeff > 1)
11697 -+ bfqd->raised_busy_queues++;
11698 -+}
11699 -diff --git a/block/bfq.h b/block/bfq.h
11700 -new file mode 100644
11701 -index 0000000..f9b5881
11702 ---- /dev/null
11703 -+++ b/block/bfq.h
11704 -@@ -0,0 +1,614 @@
11705 -+/*
11706 -+ * BFQ-v7r1 for 3.13.0: data structures and common functions prototypes.
11707 -+ *
11708 -+ * Based on ideas and code from CFQ:
11709 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11710 -+ *
11711 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11712 -+ * Paolo Valente <paolo.valente@×××××××.it>
11713 -+ *
11714 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11715 -+ */
11716 -+
11717 -+#ifndef _BFQ_H
11718 -+#define _BFQ_H
11719 -+
11720 -+#include <linux/blktrace_api.h>
11721 -+#include <linux/hrtimer.h>
11722 -+#include <linux/ioprio.h>
11723 -+#include <linux/rbtree.h>
11724 -+
11725 -+#define BFQ_IOPRIO_CLASSES 3
11726 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
11727 -+
11728 -+#define BFQ_MIN_WEIGHT 1
11729 -+#define BFQ_MAX_WEIGHT 1000
11730 -+
11731 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
11732 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
11733 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
11734 -+
11735 -+struct bfq_entity;
11736 -+
11737 -+/**
11738 -+ * struct bfq_service_tree - per ioprio_class service tree.
11739 -+ * @active: tree for active entities (i.e., those backlogged).
11740 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
11741 -+ * @first_idle: idle entity with minimum F_i.
11742 -+ * @last_idle: idle entity with maximum F_i.
11743 -+ * @vtime: scheduler virtual time.
11744 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
11745 -+ *
11746 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
11747 -+ * ioprio_class has its own independent scheduler, and so its own
11748 -+ * bfq_service_tree. All the fields are protected by the queue lock
11749 -+ * of the containing bfqd.
11750 -+ */
11751 -+struct bfq_service_tree {
11752 -+ struct rb_root active;
11753 -+ struct rb_root idle;
11754 -+
11755 -+ struct bfq_entity *first_idle;
11756 -+ struct bfq_entity *last_idle;
11757 -+
11758 -+ u64 vtime;
11759 -+ unsigned long wsum;
11760 -+};
11761 -+
11762 -+/**
11763 -+ * struct bfq_sched_data - multi-class scheduler.
11764 -+ * @in_service_entity: entity under service.
11765 -+ * @next_in_service: head-of-the-line entity in the scheduler.
11766 -+ * @service_tree: array of service trees, one per ioprio_class.
11767 -+ *
11768 -+ * bfq_sched_data is the basic scheduler queue. It supports three
11769 -+ * ioprio_classes, and can be used either as a toplevel queue or as
11770 -+ * an intermediate queue on a hierarchical setup.
11771 -+ * @next_in_service points to the active entity of the sched_data
11772 -+ * service trees that will be scheduled next.
11773 -+ *
11774 -+ * The supported ioprio_classes are the same as in CFQ, in descending
11775 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
11776 -+ * Requests from higher priority queues are served before all the
11777 -+ * requests from lower priority queues; among requests of the same
11778 -+ * queue requests are served according to B-WF2Q+.
11779 -+ * All the fields are protected by the queue lock of the containing bfqd.
11780 -+ */
11781 -+struct bfq_sched_data {
11782 -+ struct bfq_entity *in_service_entity;
11783 -+ struct bfq_entity *next_in_service;
11784 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
11785 -+};
11786 -+
11787 -+/**
11788 -+ * struct bfq_entity - schedulable entity.
11789 -+ * @rb_node: service_tree member.
11790 -+ * @on_st: flag, true if the entity is on a tree (either the active or
11791 -+ * the idle one of its service_tree).
11792 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
11793 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
11794 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
11795 -+ * @min_start: minimum start time of the (active) subtree rooted at
11796 -+ * this entity; used for O(log N) lookups into active trees.
11797 -+ * @service: service received during the last round of service.
11798 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
11799 -+ * @weight: weight of the queue
11800 -+ * @parent: parent entity, for hierarchical scheduling.
11801 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
11802 -+ * associated scheduler queue, %NULL on leaf nodes.
11803 -+ * @sched_data: the scheduler queue this entity belongs to.
11804 -+ * @ioprio: the ioprio in use.
11805 -+ * @new_weight: when a weight change is requested, the new weight value.
11806 -+ * @orig_weight: original weight, used to implement weight boosting
11807 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
11808 -+ * @ioprio_class: the ioprio_class in use.
11809 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
11810 -+ * ioprio_class value.
11811 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
11812 -+ * ioprio_class change.
11813 -+ *
11814 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
11815 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
11816 -+ * entity belongs to the sched_data of the parent group in the cgroup
11817 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
11818 -+ * in @my_sched_data.
11819 -+ *
11820 -+ * Each entity stores independently its priority values; this would
11821 -+ * allow different weights on different devices, but this
11822 -+ * functionality is not exported to userspace by now. Priorities and
11823 -+ * weights are updated lazily, first storing the new values into the
11824 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
11825 -+ * there is a transition in the entity state that allows the priority
11826 -+ * update to take place the effective and the requested priority
11827 -+ * values are synchronized.
11828 -+ *
11829 -+ * Unless cgroups are used, the weight value is calculated from the
11830 -+ * ioprio to export the same interface as CFQ. When dealing with
11831 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
11832 -+ * time to consume their budget and have true sequential behavior, and
11833 -+ * when there are no external factors breaking anticipation) the
11834 -+ * relative weights at each level of the cgroups hierarchy should be
11835 -+ * guaranteed. All the fields are protected by the queue lock of the
11836 -+ * containing bfqd.
11837 -+ */
11838 -+struct bfq_entity {
11839 -+ struct rb_node rb_node;
11840 -+
11841 -+ int on_st;
11842 -+
11843 -+ u64 finish;
11844 -+ u64 start;
11845 -+
11846 -+ struct rb_root *tree;
11847 -+
11848 -+ u64 min_start;
11849 -+
11850 -+ unsigned long service, budget;
11851 -+ unsigned short weight, new_weight;
11852 -+ unsigned short orig_weight;
11853 -+
11854 -+ struct bfq_entity *parent;
11855 -+
11856 -+ struct bfq_sched_data *my_sched_data;
11857 -+ struct bfq_sched_data *sched_data;
11858 -+
11859 -+ unsigned short ioprio, new_ioprio;
11860 -+ unsigned short ioprio_class, new_ioprio_class;
11861 -+
11862 -+ int ioprio_changed;
11863 -+};
11864 -+
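As documented above, @budget ties into the B-WF2Q+ timestamps through F_i = S_i + @budget / @weight. A deliberately simplified sketch of that relation, using only the fields of struct bfq_entity (the real bfq_calc_finish() helper is not shown in this hunk and may apply additional internal scaling):

	/* Sketch only: computes a finish time from the documented
	 * relation F_i = S_i + budget / weight; illustrative, not the
	 * in-tree implementation. */
	static inline u64 bfq_sketch_finish_time(const struct bfq_entity *entity)
	{
		return entity->start + entity->budget / entity->weight;
	}
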
11865 -+struct bfq_group;
11866 -+
11867 -+/**
11868 -+ * struct bfq_queue - leaf schedulable entity.
11869 -+ * @ref: reference counter.
11870 -+ * @bfqd: parent bfq_data.
11871 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
11872 -+ * one or more other queues.
11873 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
11874 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
11875 -+ * @sort_list: sorted list of pending requests.
11876 -+ * @next_rq: if fifo isn't expired, next request to serve.
11877 -+ * @queued: nr of requests queued in @sort_list.
11878 -+ * @allocated: currently allocated requests.
11879 -+ * @meta_pending: pending metadata requests.
11880 -+ * @fifo: fifo list of requests in sort_list.
11881 -+ * @entity: entity representing this queue in the scheduler.
11882 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
11883 -+ * @budget_timeout: budget expiration (in jiffies).
11884 -+ * @dispatched: number of requests on the dispatch list or inside driver.
11885 -+ * @org_ioprio: saved ioprio during boosted periods.
11886 -+ * @flags: status flags.
11887 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
11888 -+ * @seek_samples: number of seeks sampled
11889 -+ * @seek_total: sum of the distances of the seeks sampled
11890 -+ * @seek_mean: mean seek distance
11891 -+ * @last_request_pos: position of the last request enqueued
11892 -+ * @pid: pid of the process owning the queue, used for logging purposes.
11893 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
11894 -+ * @raising_cur_max_time: current max raising time for this queue
11895 -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
11896 -+ * idle to backlogged
11897 -+ * @service_from_backlogged: cumulative service received from the @bfq_queue
11898 -+ * since the last transition from idle to backlogged
11899 -+ *
11900 -+ * A bfq_queue is a leaf request queue; it can be associated to an io_context
11901 -+ * or more (if it is an async one). @cgroup holds a reference to the
11902 -+ * cgroup, to be sure that it does not disappear while a bfqq still
11903 -+ * references it (mostly to avoid races between request issuing and task
11904 -+ * migration followed by cgroup destruction).
11905 -+ * All the fields are protected by the queue lock of the containing bfqd.
11906 -+ */
11907 -+struct bfq_queue {
11908 -+ atomic_t ref;
11909 -+ struct bfq_data *bfqd;
11910 -+
11911 -+ /* fields for cooperating queues handling */
11912 -+ struct bfq_queue *new_bfqq;
11913 -+ struct rb_node pos_node;
11914 -+ struct rb_root *pos_root;
11915 -+
11916 -+ struct rb_root sort_list;
11917 -+ struct request *next_rq;
11918 -+ int queued[2];
11919 -+ int allocated[2];
11920 -+ int meta_pending;
11921 -+ struct list_head fifo;
11922 -+
11923 -+ struct bfq_entity entity;
11924 -+
11925 -+ unsigned long max_budget;
11926 -+ unsigned long budget_timeout;
11927 -+
11928 -+ int dispatched;
11929 -+
11930 -+ unsigned short org_ioprio;
11931 -+
11932 -+ unsigned int flags;
11933 -+
11934 -+ struct list_head bfqq_list;
11935 -+
11936 -+ unsigned int seek_samples;
11937 -+ u64 seek_total;
11938 -+ sector_t seek_mean;
11939 -+ sector_t last_request_pos;
11940 -+
11941 -+ pid_t pid;
11942 -+
11943 -+ /* weight-raising fields */
11944 -+ unsigned long raising_cur_max_time;
11945 -+ unsigned long soft_rt_next_start;
11946 -+ unsigned long last_rais_start_finish;
11947 -+ unsigned int raising_coeff;
11948 -+ unsigned long last_idle_bklogged;
11949 -+ unsigned long service_from_backlogged;
11950 -+};
11951 -+
11952 -+/**
11953 -+ * struct bfq_ttime - per process thinktime stats.
11954 -+ * @ttime_total: total process thinktime
11955 -+ * @ttime_samples: number of thinktime samples
11956 -+ * @ttime_mean: average process thinktime
11957 -+ */
11958 -+struct bfq_ttime {
11959 -+ unsigned long last_end_request;
11960 -+
11961 -+ unsigned long ttime_total;
11962 -+ unsigned long ttime_samples;
11963 -+ unsigned long ttime_mean;
11964 -+};
11965 -+
11966 -+/**
11967 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
11968 -+ * @icq: associated io_cq structure
11969 -+ * @bfqq: array of two process queues, the sync and the async
11970 -+ * @ttime: associated @bfq_ttime struct
11971 -+ */
11972 -+struct bfq_io_cq {
11973 -+ struct io_cq icq; /* must be the first member */
11974 -+ struct bfq_queue *bfqq[2];
11975 -+ struct bfq_ttime ttime;
11976 -+ int ioprio;
11977 -+};
11978 -+
11979 -+/**
11980 -+ * struct bfq_data - per device data structure.
11981 -+ * @queue: request queue for the managed device.
11982 -+ * @root_group: root bfq_group for the device.
11983 -+ * @rq_pos_tree: rbtree sorted by next_request position,
11984 -+ * used when determining if two or more queues
11985 -+ * have interleaving requests (see bfq_close_cooperator).
11986 -+ * @busy_queues: number of bfq_queues containing requests (including the
11987 -+ * queue under service, even if it is idling).
11988 -+ * @raised_busy_queues: number of weight-raised busy bfq_queues.
11989 -+ * @queued: number of queued requests.
11990 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
11991 -+ * @sync_flight: number of sync requests in the driver.
11992 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
11993 -+ * completed requests.
11994 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
11995 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
11996 -+ * @budgets_assigned: number of budgets assigned.
11997 -+ * @idle_slice_timer: timer set when idling for the next sequential request
11998 -+ * from the queue under service.
11999 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
12000 -+ * @in_service_queue: bfq_queue under service.
12001 -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
12002 -+ * @last_position: on-disk position of the last served request.
12003 -+ * @last_budget_start: beginning of the last budget.
12004 -+ * @last_idling_start: beginning of the last idle slice.
12005 -+ * @peak_rate: peak transfer rate observed for a budget.
12006 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
12007 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
12008 -+ * @group_list: list of all the bfq_groups active on the device.
12009 -+ * @active_list: list of all the bfq_queues active on the device.
12010 -+ * @idle_list: list of all the bfq_queues idle on the device.
12011 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
12012 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
12013 -+ * requests are served in fifo order.
12014 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
12015 -+ * @bfq_back_max: maximum allowed backward seek.
12016 -+ * @bfq_slice_idle: maximum idling time.
12017 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
12018 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
12019 -+ * async queues.
12020 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
12021 -+ * prevent seeky queues from imposing long latencies on well
12022 -+ * behaved ones (this also implies that seeky queues cannot
12023 -+ * receive guarantees in the service domain; after a timeout
12024 -+ * they are charged for the whole allocated budget, to try
12025 -+ * to preserve a behavior reasonably fair among them, but
12026 -+ * without service-domain guarantees).
12027 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
12028 -+ * queue is multiplied
12029 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
12030 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
12031 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
12032 -+ * may be reactivated for a queue (in jiffies)
12033 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
12034 -+ * after which weight-raising may be
12035 -+ * reactivated for an already busy queue
12036 -+ * (in jiffies)
12037 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
12038 -+ * in sectors per second
12039 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
12040 -+ * duration of the weight raising automatically
12041 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
12042 -+ *
12043 -+ * All the fields are protected by the @queue lock.
12044 -+ */
12045 -+struct bfq_data {
12046 -+ struct request_queue *queue;
12047 -+
12048 -+ struct bfq_group *root_group;
12049 -+
12050 -+ struct rb_root rq_pos_tree;
12051 -+
12052 -+ int busy_queues;
12053 -+ int raised_busy_queues;
12054 -+ int queued;
12055 -+ int rq_in_driver;
12056 -+ int sync_flight;
12057 -+
12058 -+ int max_rq_in_driver;
12059 -+ int hw_tag_samples;
12060 -+ int hw_tag;
12061 -+
12062 -+ int budgets_assigned;
12063 -+
12064 -+ struct timer_list idle_slice_timer;
12065 -+ struct work_struct unplug_work;
12066 -+
12067 -+ struct bfq_queue *in_service_queue;
12068 -+ struct bfq_io_cq *in_service_bic;
12069 -+
12070 -+ sector_t last_position;
12071 -+
12072 -+ ktime_t last_budget_start;
12073 -+ ktime_t last_idling_start;
12074 -+ int peak_rate_samples;
12075 -+ u64 peak_rate;
12076 -+ unsigned long bfq_max_budget;
12077 -+
12078 -+ struct hlist_head group_list;
12079 -+ struct list_head active_list;
12080 -+ struct list_head idle_list;
12081 -+
12082 -+ unsigned int bfq_quantum;
12083 -+ unsigned int bfq_fifo_expire[2];
12084 -+ unsigned int bfq_back_penalty;
12085 -+ unsigned int bfq_back_max;
12086 -+ unsigned int bfq_slice_idle;
12087 -+ u64 bfq_class_idle_last_service;
12088 -+
12089 -+ unsigned int bfq_user_max_budget;
12090 -+ unsigned int bfq_max_budget_async_rq;
12091 -+ unsigned int bfq_timeout[2];
12092 -+
12093 -+ bool low_latency;
12094 -+
12095 -+ /* parameters of the low_latency heuristics */
12096 -+ unsigned int bfq_raising_coeff;
12097 -+ unsigned int bfq_raising_max_time;
12098 -+ unsigned int bfq_raising_rt_max_time;
12099 -+ unsigned int bfq_raising_min_idle_time;
12100 -+ unsigned long bfq_raising_min_inter_arr_async;
12101 -+ unsigned int bfq_raising_max_softrt_rate;
12102 -+ u64 RT_prod;
12103 -+
12104 -+ struct bfq_queue oom_bfqq;
12105 -+};
12106 -+
12107 -+enum bfqq_state_flags {
12108 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
12109 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
12110 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
12111 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
12112 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
12113 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
12114 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
12115 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
12116 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
12117 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
12118 -+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
12119 -+};
12120 -+
12121 -+#define BFQ_BFQQ_FNS(name) \
12122 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
12123 -+{ \
12124 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
12125 -+} \
12126 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
12127 -+{ \
12128 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
12129 -+} \
12130 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
12131 -+{ \
12132 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
12133 -+}
12134 -+
12135 -+BFQ_BFQQ_FNS(busy);
12136 -+BFQ_BFQQ_FNS(wait_request);
12137 -+BFQ_BFQQ_FNS(must_alloc);
12138 -+BFQ_BFQQ_FNS(fifo_expire);
12139 -+BFQ_BFQQ_FNS(idle_window);
12140 -+BFQ_BFQQ_FNS(prio_changed);
12141 -+BFQ_BFQQ_FNS(sync);
12142 -+BFQ_BFQQ_FNS(budget_new);
12143 -+BFQ_BFQQ_FNS(coop);
12144 -+BFQ_BFQQ_FNS(split_coop);
12145 -+BFQ_BFQQ_FNS(softrt_update);
12146 -+#undef BFQ_BFQQ_FNS
12147 -+
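For reference, BFQ_BFQQ_FNS(busy) above expands to the three helpers below; every other flag gets the same trio, differing only in the flag name:

	static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
	{
		(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
	}
	static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
	{
		(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
	}
	static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
	{
		return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
	}
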
12148 -+/* Logging facilities. */
12149 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
12150 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
12151 -+
12152 -+#define bfq_log(bfqd, fmt, args...) \
12153 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
12154 -+
12155 -+/* Expiration reasons. */
12156 -+enum bfqq_expiration {
12157 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
12158 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
12159 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
12160 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
12161 -+};
12162 -+
12163 -+#ifdef CONFIG_CGROUP_BFQIO
12164 -+/**
12165 -+ * struct bfq_group - per (device, cgroup) data structure.
12166 -+ * @entity: schedulable entity to insert into the parent group sched_data.
12167 -+ * @sched_data: own sched_data, to contain child entities (they may be
12168 -+ * both bfq_queues and bfq_groups).
12169 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
12170 -+ * list of the containing cgroup's bfqio_cgroup.
12171 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
12172 -+ * of the groups active on the same device; used for cleanup.
12173 -+ * @bfqd: the bfq_data for the device this group acts upon.
12174 -+ * @async_bfqq: array of async queues for all the tasks belonging to
12175 -+ * the group, one queue per ioprio value per ioprio_class,
12176 -+ * except for the idle class that has only one queue.
12177 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
12178 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
12179 -+ * to avoid too many special cases during group creation/migration.
12180 -+ *
12181 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
12182 -+ * there is a set of bfq_groups, each one collecting the lower-level
12183 -+ * entities belonging to the group that are acting on the same device.
12184 -+ *
12185 -+ * Locking works as follows:
12186 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
12187 -+ * via RCU from its readers.
12188 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
12189 -+ * from the readers.
12190 -+ * o All the other fields are protected by the @bfqd queue lock.
12191 -+ */
12192 -+struct bfq_group {
12193 -+ struct bfq_entity entity;
12194 -+ struct bfq_sched_data sched_data;
12195 -+
12196 -+ struct hlist_node group_node;
12197 -+ struct hlist_node bfqd_node;
12198 -+
12199 -+ void *bfqd;
12200 -+
12201 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12202 -+ struct bfq_queue *async_idle_bfqq;
12203 -+
12204 -+ struct bfq_entity *my_entity;
12205 -+};
12206 -+
12207 -+/**
12208 -+ * struct bfqio_cgroup - bfq cgroup data structure.
12209 -+ * @css: subsystem state for bfq in the containing cgroup.
12210 -+ * @online: flag marked when the subsystem is inserted.
12211 -+ * @weight: cgroup weight.
12212 -+ * @ioprio: cgroup ioprio.
12213 -+ * @ioprio_class: cgroup ioprio_class.
12214 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
12215 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
12216 -+ *
12217 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
12218 -+ * @ioprio and @ioprio_class are protected by @lock.
12219 -+ */
12220 -+struct bfqio_cgroup {
12221 -+ struct cgroup_subsys_state css;
12222 -+ bool online;
12223 -+
12224 -+ unsigned short weight, ioprio, ioprio_class;
12225 -+
12226 -+ spinlock_t lock;
12227 -+ struct hlist_head group_data;
12228 -+};
12229 -+#else
12230 -+struct bfq_group {
12231 -+ struct bfq_sched_data sched_data;
12232 -+
12233 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12234 -+ struct bfq_queue *async_idle_bfqq;
12235 -+};
12236 -+#endif
12237 -+
12238 -+static inline struct bfq_service_tree *
12239 -+bfq_entity_service_tree(struct bfq_entity *entity)
12240 -+{
12241 -+ struct bfq_sched_data *sched_data = entity->sched_data;
12242 -+ unsigned int idx = entity->ioprio_class - 1;
12243 -+
12244 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
12245 -+ BUG_ON(sched_data == NULL);
12246 -+
12247 -+ return sched_data->service_tree + idx;
12248 -+}
12249 -+
12250 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
12251 -+ int is_sync)
12252 -+{
12253 -+ return bic->bfqq[!!is_sync];
12254 -+}
12255 -+
12256 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
12257 -+ struct bfq_queue *bfqq, int is_sync)
12258 -+{
12259 -+ bic->bfqq[!!is_sync] = bfqq;
12260 -+}
12261 -+
12262 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
12263 -+{
12264 -+ return bic->icq.q->elevator->elevator_data;
12265 -+}
12266 -+
12267 -+/**
12268 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
12269 -+ * @ptr: a pointer to a bfqd.
12270 -+ * @flags: storage for the flags to be saved.
12271 -+ *
12272 -+ * This function allows bfqg->bfqd to be protected by the
12273 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
12274 -+ * under RCU, so the storage for bfqd is assured to be safe as long
12275 -+ * as the RCU read side critical section does not end. After the
12276 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12277 -+ * sure that no other writer accessed it. If we raced with a writer,
12278 -+ * the function returns NULL, with the queue unlocked, otherwise it
12279 -+ * returns the dereferenced pointer, with the queue locked.
12280 -+ */
12281 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12282 -+ unsigned long *flags)
12283 -+{
12284 -+ struct bfq_data *bfqd;
12285 -+
12286 -+ rcu_read_lock();
12287 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12288 -+
12289 -+ if (bfqd != NULL) {
12290 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12291 -+ if (*ptr == bfqd)
12292 -+ goto out;
12293 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12294 -+ }
12295 -+
12296 -+ bfqd = NULL;
12297 -+out:
12298 -+ rcu_read_unlock();
12299 -+ return bfqd;
12300 -+}
12301 -+
12302 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12303 -+ unsigned long *flags)
12304 -+{
12305 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12306 -+}
12307 -+
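A sketch of how a caller would typically use the pair above; the function name and surrounding group code are hypothetical, and the void *bfqd being dereferenced is the member declared in the CONFIG_CGROUP_BFQIO variant of struct bfq_group:

	/* Hypothetical caller: look up the bfqd a group points to and,
	 * if the RCU/lock dance succeeds, work under its queue lock. */
	static void bfq_example_touch_group(struct bfq_group *bfqg)
	{
		unsigned long flags;
		struct bfq_data *bfqd;

		bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
		if (bfqd != NULL) {
			/* bfqd->queue->queue_lock is held here. */
			bfq_put_bfqd_unlock(bfqd, &flags);
		}
	}
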
12308 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12309 -+static void bfq_put_queue(struct bfq_queue *bfqq);
12310 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12311 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12312 -+ struct bfq_group *bfqg, int is_sync,
12313 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
12314 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12315 -+ struct bfq_group *bfqg);
12316 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12317 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12318 -+#endif
12319 ---
12320 -1.8.5.2
12321 -
12322
12323 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
12324 ===================================================================
12325 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-03-26 23:50:52 UTC (rev 2715)
12326 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-03-31 12:03:14 UTC (rev 2716)
12327 @@ -1,1034 +0,0 @@
12328 -From 3cd9e2ea29c3ba9e420556e8ecf161d166186b63 Mon Sep 17 00:00:00 2001
12329 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
12330 -Date: Thu, 23 Jan 2014 16:54:44 +0100
12331 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7 for
12332 - 3.13.0
12333 -
12334 -A set of processes may happen to perform interleaved reads, i.e., requests
12335 -whose union would give rise to a sequential read pattern. There are two
12336 -typical cases: in the first case, processes read fixed-size chunks of
12337 -data at a fixed distance from each other, while in the second case processes
12338 -may read variable-size chunks at variable distances. The latter case occurs
12339 -for example with KVM, which splits the I/O generated by the guest into
12340 -multiple chunks, and lets these chunks be served by a pool of cooperating
12341 -processes, iteratively assigning the next chunk of I/O to the first
12342 -available process. CFQ uses actual queue merging for the first type of
12343 -processes, whereas it uses preemption to get a sequential read pattern out
12344 -of the read requests performed by the second type of processes. In the end
12345 -it uses two different mechanisms to achieve the same goal: boosting the
12346 -throughput with interleaved I/O.
12347 -
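As an illustration of the first case (offsets purely made up): process A reads 128-sector chunks at sectors 0, 256, 512, ... while process B reads the chunks in between at sectors 128, 384, 640, ...; neither stream is sequential on its own, but their union covers 0, 128, 256, 384, ... without gaps. In the KVM-like second case the chunk sizes and distances vary from request to request, yet the union of the cooperating workers' requests is again a sequential pattern.
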
12348 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
12349 -sequential read pattern with both types of processes. The main idea is
12350 -checking newly arrived requests against the next request of the active queue
12351 -both in case of actual request insert and in case of request merge. By doing
12352 -so, both the types of processes can be handled by just merging their queues.
12353 -EQM is then simpler and more compact than the pair of mechanisms used in
12354 -CFQ.
12355 -
12356 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
12357 -properly restoring the weight-raising state of a queue when it gets back to
12358 -a non-merged state.
12359 -
12360 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
12361 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
12362 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
12363 ----
12364 - block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
12365 - block/bfq-sched.c | 28 ---
12366 - block/bfq.h | 16 ++
12367 - 3 files changed, 474 insertions(+), 227 deletions(-)
12368 -
12369 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
12370 -index 7670400..295236e 100644
12371 ---- a/block/bfq-iosched.c
12372 -+++ b/block/bfq-iosched.c
12373 -@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
12374 - return dur;
12375 - }
12376 -
12377 -+static inline void
12378 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
12379 -+{
12380 -+ if (bic->saved_idle_window)
12381 -+ bfq_mark_bfqq_idle_window(bfqq);
12382 -+ else
12383 -+ bfq_clear_bfqq_idle_window(bfqq);
12384 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
12385 -+ /*
12386 -+ * Start a weight raising period with the duration given by
12387 -+ * the raising_time_left snapshot.
12388 -+ */
12389 -+ if (bfq_bfqq_busy(bfqq))
12390 -+ bfqq->bfqd->raised_busy_queues++;
12391 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
12392 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
12393 -+ bfqq->last_rais_start_finish = jiffies;
12394 -+ bfqq->entity.ioprio_changed = 1;
12395 -+ }
12396 -+ /*
12397 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
12398 -+ * getting confused about the queue's need of a weight-raising
12399 -+ * period.
12400 -+ */
12401 -+ bic->raising_time_left = 0;
12402 -+}
12403 -+
12404 -+/*
12405 -+ * Must be called with the queue_lock held.
12406 -+ */
12407 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
12408 -+{
12409 -+ int process_refs, io_refs;
12410 -+
12411 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12412 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12413 -+ BUG_ON(process_refs < 0);
12414 -+ return process_refs;
12415 -+}
12416 -+
12417 - static void bfq_add_rq_rb(struct request *rq)
12418 - {
12419 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
12420 -@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
12421 - if (!bfqd->low_latency)
12422 - goto add_bfqq_busy;
12423 -
12424 -+ if (bfq_bfqq_just_split(bfqq))
12425 -+ goto set_ioprio_changed;
12426 -+
12427 - /*
12428 -- * If the queue is not being boosted and has been idle
12429 -- * for enough time, start a weight-raising period
12430 -+ * If the queue:
12431 -+ * - is not being boosted,
12432 -+ * - has been idle for enough time,
12433 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
12434 -+ * shared "for its nature" or it is not shared and its
12435 -+ * requests have not been redirected to a shared queue)
12436 -+ * start a weight-raising period.
12437 - */
12438 -- if (old_raising_coeff == 1 &&
12439 -- (idle_for_long_time || soft_rt)) {
12440 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
12441 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
12442 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
12443 - if (idle_for_long_time)
12444 - bfqq->raising_cur_max_time =
12445 -@@ -572,6 +620,7 @@ static void bfq_add_rq_rb(struct request *rq)
12446 - bfqd->bfq_raising_rt_max_time;
12447 - }
12448 - }
12449 -+set_ioprio_changed:
12450 - if (old_raising_coeff != bfqq->raising_coeff)
12451 - entity->ioprio_changed = 1;
12452 - add_bfqq_busy:
12453 -@@ -754,90 +803,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
12454 - spin_unlock_irq(bfqd->queue->queue_lock);
12455 - }
12456 -
12457 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12458 -- struct bio *bio)
12459 --{
12460 -- struct bfq_data *bfqd = q->elevator->elevator_data;
12461 -- struct bfq_io_cq *bic;
12462 -- struct bfq_queue *bfqq;
12463 --
12464 -- /*
12465 -- * Disallow merge of a sync bio into an async request.
12466 -- */
12467 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12468 -- return 0;
12469 --
12470 -- /*
12471 -- * Lookup the bfqq that this bio will be queued with. Allow
12472 -- * merge only if rq is queued there.
12473 -- * Queue lock is held here.
12474 -- */
12475 -- bic = bfq_bic_lookup(bfqd, current->io_context);
12476 -- if (bic == NULL)
12477 -- return 0;
12478 --
12479 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12480 -- return bfqq == RQ_BFQQ(rq);
12481 --}
12482 --
12483 --static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
12484 -- struct bfq_queue *bfqq)
12485 --{
12486 -- if (bfqq != NULL) {
12487 -- bfq_mark_bfqq_must_alloc(bfqq);
12488 -- bfq_mark_bfqq_budget_new(bfqq);
12489 -- bfq_clear_bfqq_fifo_expire(bfqq);
12490 --
12491 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12492 --
12493 -- bfq_log_bfqq(bfqd, bfqq,
12494 -- "set_in_service_queue, cur-budget = %lu",
12495 -- bfqq->entity.budget);
12496 -- }
12497 --
12498 -- bfqd->in_service_queue = bfqq;
12499 --}
12500 --
12501 --/*
12502 -- * Get and set a new queue for service.
12503 -- */
12504 --static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
12505 -- struct bfq_queue *bfqq)
12506 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
12507 - {
12508 -- if (!bfqq)
12509 -- bfqq = bfq_get_next_queue(bfqd);
12510 -+ if (request)
12511 -+ return blk_rq_pos(io_struct);
12512 - else
12513 -- bfq_get_next_queue_forced(bfqd, bfqq);
12514 --
12515 -- __bfq_set_in_service_queue(bfqd, bfqq);
12516 -- return bfqq;
12517 -+ return ((struct bio *)io_struct)->bi_sector;
12518 - }
12519 -
12520 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
12521 -- struct request *rq)
12522 -+static inline sector_t bfq_dist_from(sector_t pos1,
12523 -+ sector_t pos2)
12524 - {
12525 -- if (blk_rq_pos(rq) >= bfqd->last_position)
12526 -- return blk_rq_pos(rq) - bfqd->last_position;
12527 -+ if (pos1 >= pos2)
12528 -+ return pos1 - pos2;
12529 - else
12530 -- return bfqd->last_position - blk_rq_pos(rq);
12531 -+ return pos2 - pos1;
12532 - }
12533 -
12534 --/*
12535 -- * Return true if bfqq has no request pending and rq is close enough to
12536 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
12537 -- * bfqq->next_rq
12538 -- */
12539 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
12540 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
12541 -+ sector_t sector)
12542 - {
12543 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
12544 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
12545 -+ BFQQ_SEEK_THR;
12546 - }
12547 -
12548 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12549 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
12550 - {
12551 - struct rb_root *root = &bfqd->rq_pos_tree;
12552 - struct rb_node *parent, *node;
12553 - struct bfq_queue *__bfqq;
12554 -- sector_t sector = bfqd->last_position;
12555 -
12556 - if (RB_EMPTY_ROOT(root))
12557 - return NULL;
12558 -@@ -856,7 +850,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12559 - * position).
12560 - */
12561 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
12562 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12563 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12564 - return __bfqq;
12565 -
12566 - if (blk_rq_pos(__bfqq->next_rq) < sector)
12567 -@@ -867,7 +861,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12568 - return NULL;
12569 -
12570 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
12571 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12572 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12573 - return __bfqq;
12574 -
12575 - return NULL;
12576 -@@ -876,14 +870,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12577 - /*
12578 - * bfqd - obvious
12579 - * cur_bfqq - passed in so that we don't decide that the current queue
12580 -- * is closely cooperating with itself.
12581 -- *
12582 -- * We are assuming that cur_bfqq has dispatched at least one request,
12583 -- * and that bfqd->last_position reflects a position on the disk associated
12584 -- * with the I/O issued by cur_bfqq.
12585 -+ * is closely cooperating with itself
12586 -+ * sector - used as a reference point to search for a close queue
12587 - */
12588 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12589 -- struct bfq_queue *cur_bfqq)
12590 -+ struct bfq_queue *cur_bfqq,
12591 -+ sector_t sector)
12592 - {
12593 - struct bfq_queue *bfqq;
12594 -
12595 -@@ -903,7 +895,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12596 - * working closely on the same area of the disk. In that case,
12597 - * we can group them together and don't waste time idling.
12598 - */
12599 -- bfqq = bfqq_close(bfqd);
12600 -+ bfqq = bfqq_close(bfqd, sector);
12601 - if (bfqq == NULL || bfqq == cur_bfqq)
12602 - return NULL;
12603 -
12604 -@@ -930,6 +922,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12605 - return bfqq;
12606 - }
12607 -
12608 -+static struct bfq_queue *
12609 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12610 -+{
12611 -+ int process_refs, new_process_refs;
12612 -+ struct bfq_queue *__bfqq;
12613 -+
12614 -+ /*
12615 -+ * If there are no process references on the new_bfqq, then it is
12616 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12617 -+ * may have dropped their last reference (not just their last process
12618 -+ * reference).
12619 -+ */
12620 -+ if (!bfqq_process_refs(new_bfqq))
12621 -+ return NULL;
12622 -+
12623 -+ /* Avoid a circular list and skip interim queue merges. */
12624 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
12625 -+ if (__bfqq == bfqq)
12626 -+ return NULL;
12627 -+ new_bfqq = __bfqq;
12628 -+ }
12629 -+
12630 -+ process_refs = bfqq_process_refs(bfqq);
12631 -+ new_process_refs = bfqq_process_refs(new_bfqq);
12632 -+ /*
12633 -+ * If the process for the bfqq has gone away, there is no
12634 -+ * sense in merging the queues.
12635 -+ */
12636 -+ if (process_refs == 0 || new_process_refs == 0)
12637 -+ return NULL;
12638 -+
12639 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12640 -+ new_bfqq->pid);
12641 -+
12642 -+ /*
12643 -+ * Merging is just a redirection: the requests of the process owning
12644 -+ * one of the two queues are redirected to the other queue. The latter
12645 -+ * queue, in its turn, is set as shared if this is the first time that
12646 -+ * the requests of some process are redirected to it.
12647 -+ *
12648 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
12649 -+ * are in the context of the process owning bfqq, hence we have the
12650 -+ * io_cq of this process. So we can immediately configure this io_cq
12651 -+ * to redirect the requests of the process to new_bfqq.
12652 -+ *
12653 -+ * NOTE, even if new_bfqq coincides with the in-service queue, the
12654 -+ * io_cq of new_bfqq is not available, because, if the in-service queue
12655 -+ * is shared, bfqd->in_service_bic may not point to the io_cq of the
12656 -+ * in-service queue.
12657 -+ * Redirecting the requests of the process owning bfqq to the currently
12658 -+ * in-service queue is in any case the best option, as we feed the
12659 -+ * in-service queue with new requests close to the last request served
12660 -+ * and, by doing so, hopefully increase the throughput.
12661 -+ */
12662 -+ bfqq->new_bfqq = new_bfqq;
12663 -+ atomic_add(process_refs, &new_bfqq->ref);
12664 -+ return new_bfqq;
12665 -+}
12666 -+
12667 -+/*
12668 -+ * Attempt to schedule a merge of bfqq with the currently in-service queue or
12669 -+ * with a close queue among the scheduled queues.
12670 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
12671 -+ * structure otherwise.
12672 -+ */
12673 -+static struct bfq_queue *
12674 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12675 -+ void *io_struct, bool request)
12676 -+{
12677 -+ struct bfq_queue *in_service_bfqq, *new_bfqq;
12678 -+
12679 -+ if (bfqq->new_bfqq)
12680 -+ return bfqq->new_bfqq;
12681 -+
12682 -+ if (!io_struct)
12683 -+ return NULL;
12684 -+
12685 -+ in_service_bfqq = bfqd->in_service_queue;
12686 -+
12687 -+ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
12688 -+ !bfqd->in_service_bic)
12689 -+ goto check_scheduled;
12690 -+
12691 -+ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
12692 -+ goto check_scheduled;
12693 -+
12694 -+ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
12695 -+ goto check_scheduled;
12696 -+
12697 -+ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
12698 -+ goto check_scheduled;
12699 -+
12700 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
12701 -+ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
12702 -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
12703 -+ if (new_bfqq != NULL)
12704 -+ return new_bfqq; /* Merge with the in-service queue */
12705 -+ }
12706 -+
12707 -+ /*
12708 -+ * Check whether there is a cooperator among currently scheduled
12709 -+ * queues. The only thing we need is that the bio/request is not
12710 -+ * NULL, as we need it to establish whether a cooperator exists.
12711 -+ */
12712 -+check_scheduled:
12713 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
12714 -+ bfq_io_struct_pos(io_struct, request));
12715 -+ if (new_bfqq)
12716 -+ return bfq_setup_merge(bfqq, new_bfqq);
12717 -+
12718 -+ return NULL;
12719 -+}
12720 -+
12721 -+static inline void
12722 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
12723 -+{
12724 -+ /*
12725 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
12726 -+ * have already been redirected to a shared queue; both idle window
12727 -+ * and weight raising state have already been saved. Do nothing.
12728 -+ */
12729 -+ if (bfqq->bic == NULL)
12730 -+ return;
12731 -+ if (bfqq->bic->raising_time_left)
12732 -+ /*
12733 -+ * This is the queue of a just-started process, and would
12734 -+ * deserve weight raising: we set raising_time_left to the full
12735 -+ * weight-raising duration to trigger weight-raising when and
12736 -+ * if the queue is split and the first request of the queue
12737 -+ * is enqueued.
12738 -+ */
12739 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
12740 -+ else if (bfqq->raising_coeff > 1) {
12741 -+ unsigned long wrais_duration =
12742 -+ jiffies - bfqq->last_rais_start_finish;
12743 -+ /*
12744 -+ * It may happen that a queue's weight raising period lasts
12745 -+ * longer than its raising_cur_max_time, as weight raising is
12746 -+ * handled only when a request is enqueued or dispatched (it
12747 -+ * does not use any timer). If the weight raising period is
12748 -+ * about to end, don't save it.
12749 -+ */
12750 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
12751 -+ bfqq->bic->raising_time_left = 0;
12752 -+ else
12753 -+ bfqq->bic->raising_time_left =
12754 -+ bfqq->raising_cur_max_time - wrais_duration;
12755 -+ /*
12756 -+ * The bfq_queue is becoming shared or the requests of the
12757 -+ * process owning the queue are being redirected to a shared
12758 -+ * queue. Stop the weight raising period of the queue, as in
12759 -+ * both cases it should not be owned by an interactive or soft
12760 -+ * real-time application.
12761 -+ */
12762 -+ bfq_bfqq_end_raising(bfqq);
12763 -+ } else
12764 -+ bfqq->bic->raising_time_left = 0;
12765 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
12766 -+}
12767 -+
12768 -+static inline void
12769 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
12770 -+{
12771 -+ /*
12772 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
12773 -+ * is about to begin using a shared bfq_queue.
12774 -+ */
12775 -+ if (bfqq->bic)
12776 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
12777 -+}
12778 -+
12779 -+static void
12780 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
12781 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12782 -+{
12783 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
12784 -+ (long unsigned)new_bfqq->pid);
12785 -+ /* Save weight raising and idle window of the merged queues */
12786 -+ bfq_bfqq_save_state(bfqq);
12787 -+ bfq_bfqq_save_state(new_bfqq);
12788 -+ /*
12789 -+ * Grab a reference to the bic, to prevent it from being destroyed
12790 -+ * before being possibly touched by a bfq_split_bfqq().
12791 -+ */
12792 -+ bfq_get_bic_reference(bfqq);
12793 -+ bfq_get_bic_reference(new_bfqq);
12794 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
12795 -+ bic_set_bfqq(bic, new_bfqq, 1);
12796 -+ bfq_mark_bfqq_coop(new_bfqq);
12797 -+ /*
12798 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
12799 -+ * new_bfqq->bic to NULL. bfqq either:
12800 -+ * - does not belong to any bic any more, and hence bfqq->bic must
12801 -+ * be set to NULL, or
12802 -+ * - is a queue whose owning bics have already been redirected to a
12803 -+ * different queue, hence the queue is destined to not belong to any
12804 -+ * bic soon and bfqq->bic is already NULL (therefore the next
12805 -+ * assignment causes no harm).
12806 -+ */
12807 -+ new_bfqq->bic = NULL;
12808 -+ bfqq->bic = NULL;
12809 -+ bfq_put_queue(bfqq);
12810 -+}
12811 -+
12812 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12813 -+ struct bio *bio)
12814 -+{
12815 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
12816 -+ struct bfq_io_cq *bic;
12817 -+ struct bfq_queue *bfqq, *new_bfqq;
12818 -+
12819 -+ /*
12820 -+ * Disallow merge of a sync bio into an async request.
12821 -+ */
12822 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12823 -+ return 0;
12824 -+
12825 -+ /*
12826 -+ * Lookup the bfqq that this bio will be queued with. Allow
12827 -+ * merge only if rq is queued there.
12828 -+ * Queue lock is held here.
12829 -+ */
12830 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
12831 -+ if (bic == NULL)
12832 -+ return 0;
12833 -+
12834 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12835 -+ /*
12836 -+ * We take advantage of this function to perform an early merge
12837 -+ * of the queues of possible cooperating processes.
12838 -+ */
12839 -+ if (bfqq != NULL) {
12840 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
12841 -+ if (new_bfqq != NULL) {
12842 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
12843 -+ /*
12844 -+ * If we get here, the bio will be queued in the shared queue,
12845 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
12846 -+ * rq can be merged.
12847 -+ */
12848 -+ bfqq = new_bfqq;
12849 -+ }
12850 -+ }
12851 -+
12852 -+ return bfqq == RQ_BFQQ(rq);
12853 -+}
12854 -+
12855 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
12856 -+ struct bfq_queue *bfqq)
12857 -+{
12858 -+ if (bfqq != NULL) {
12859 -+ bfq_mark_bfqq_must_alloc(bfqq);
12860 -+ bfq_mark_bfqq_budget_new(bfqq);
12861 -+ bfq_clear_bfqq_fifo_expire(bfqq);
12862 -+
12863 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12864 -+
12865 -+ bfq_log_bfqq(bfqd, bfqq,
12866 -+ "set_in_service_queue, cur-budget = %lu",
12867 -+ bfqq->entity.budget);
12868 -+ }
12869 -+
12870 -+ bfqd->in_service_queue = bfqq;
12871 -+}
12872 -+
12873 -+/*
12874 -+ * Get and set a new queue for service.
12875 -+ */
12876 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
12877 -+{
12878 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
12879 -+
12880 -+ __bfq_set_in_service_queue(bfqd, bfqq);
12881 -+ return bfqq;
12882 -+}
12883 -+
12884 - /*
12885 - * If enough samples have been computed, return the current max budget
12886 - * stored in bfqd, which is dynamically updated according to the
12887 -@@ -1077,63 +1345,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
12888 - return rq;
12889 - }
12890 -
12891 --/*
12892 -- * Must be called with the queue_lock held.
12893 -- */
12894 --static int bfqq_process_refs(struct bfq_queue *bfqq)
12895 --{
12896 -- int process_refs, io_refs;
12897 --
12898 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12899 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12900 -- BUG_ON(process_refs < 0);
12901 -- return process_refs;
12902 --}
12903 --
12904 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12905 --{
12906 -- int process_refs, new_process_refs;
12907 -- struct bfq_queue *__bfqq;
12908 --
12909 -- /*
12910 -- * If there are no process references on the new_bfqq, then it is
12911 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12912 -- * may have dropped their last reference (not just their last process
12913 -- * reference).
12914 -- */
12915 -- if (!bfqq_process_refs(new_bfqq))
12916 -- return;
12917 --
12918 -- /* Avoid a circular list and skip interim queue merges. */
12919 -- while ((__bfqq = new_bfqq->new_bfqq)) {
12920 -- if (__bfqq == bfqq)
12921 -- return;
12922 -- new_bfqq = __bfqq;
12923 -- }
12924 --
12925 -- process_refs = bfqq_process_refs(bfqq);
12926 -- new_process_refs = bfqq_process_refs(new_bfqq);
12927 -- /*
12928 -- * If the process for the bfqq has gone away, there is no
12929 -- * sense in merging the queues.
12930 -- */
12931 -- if (process_refs == 0 || new_process_refs == 0)
12932 -- return;
12933 --
12934 -- /*
12935 -- * Merge in the direction of the lesser amount of work.
12936 -- */
12937 -- if (new_process_refs >= process_refs) {
12938 -- bfqq->new_bfqq = new_bfqq;
12939 -- atomic_add(process_refs, &new_bfqq->ref);
12940 -- } else {
12941 -- new_bfqq->new_bfqq = bfqq;
12942 -- atomic_add(new_process_refs, &bfqq->ref);
12943 -- }
12944 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12945 -- new_bfqq->pid);
12946 --}
12947 --
12948 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
12949 - {
12950 - struct bfq_entity *entity = &bfqq->entity;
12951 -@@ -1703,7 +1914,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
12952 - */
12953 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12954 - {
12955 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
12956 -+ struct bfq_queue *bfqq;
12957 - struct request *next_rq;
12958 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
12959 -
12960 -@@ -1713,17 +1924,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12961 -
12962 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
12963 -
12964 -- /*
12965 -- * If another queue has a request waiting within our mean seek
12966 -- * distance, let it run. The expire code will check for close
12967 -- * cooperators and put the close queue at the front of the
12968 -- * service tree. If possible, merge the expiring queue with the
12969 -- * new bfqq.
12970 -- */
12971 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
12972 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
12973 -- bfq_setup_merge(bfqq, new_bfqq);
12974 --
12975 - if (bfq_may_expire_for_budg_timeout(bfqq) &&
12976 - !timer_pending(&bfqd->idle_slice_timer) &&
12977 - !bfq_bfqq_must_idle(bfqq))
12978 -@@ -1760,36 +1960,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12979 - bfq_clear_bfqq_wait_request(bfqq);
12980 - del_timer(&bfqd->idle_slice_timer);
12981 - }
12982 -- if (new_bfqq == NULL)
12983 -- goto keep_queue;
12984 -- else
12985 -- goto expire;
12986 -+ goto keep_queue;
12987 - }
12988 - }
12989 -
12990 - /*
12991 -- * No requests pending. If the in-service queue has no cooperator and
12992 -- * still has requests in flight (possibly waiting for a completion)
12993 -- * or is idling for a new request, then keep it.
12994 -+ * No requests pending. If the in-service queue still has requests in
12995 -+ * flight (possibly waiting for a completion) or is idling for a new
12996 -+ * request, then keep it.
12997 - */
12998 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
12999 -- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
13000 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
13001 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
13002 - bfqq = NULL;
13003 - goto keep_queue;
13004 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
13005 -- /*
13006 -- * Expiring the queue because there is a close cooperator,
13007 -- * cancel timer.
13008 -- */
13009 -- bfq_clear_bfqq_wait_request(bfqq);
13010 -- del_timer(&bfqd->idle_slice_timer);
13011 - }
13012 -
13013 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
13014 - expire:
13015 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
13016 - new_queue:
13017 -- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
13018 -+ bfqq = bfq_set_in_service_queue(bfqd);
13019 - bfq_log(bfqd, "select_queue: new queue %d returned",
13020 - bfqq != NULL ? bfqq->pid : 0);
13021 - keep_queue:
13022 -@@ -1799,9 +1989,8 @@ keep_queue:
13023 - static void bfq_update_raising_data(struct bfq_data *bfqd,
13024 - struct bfq_queue *bfqq)
13025 - {
13026 -+ struct bfq_entity *entity = &bfqq->entity;
13027 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
13028 -- struct bfq_entity *entity = &bfqq->entity;
13029 --
13030 - bfq_log_bfqq(bfqd, bfqq,
13031 - "raising period dur %u/%u msec, "
13032 - "old raising coeff %u, w %d(%d)",
13033 -@@ -1818,7 +2007,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13034 - "WARN: pending prio change");
13035 - /*
13036 - * If too much time has elapsed from the beginning
13037 -- * of this weight-raising, stop it.
13038 -+ * of this weight-raising period, stop it.
13039 - */
13040 - if (jiffies - bfqq->last_rais_start_finish >
13041 - bfqq->raising_cur_max_time) {
13042 -@@ -1830,11 +2019,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13043 - jiffies_to_msecs(bfqq->
13044 - raising_cur_max_time));
13045 - bfq_bfqq_end_raising(bfqq);
13046 -- __bfq_entity_update_weight_prio(
13047 -- bfq_entity_service_tree(entity),
13048 -- entity);
13049 - }
13050 - }
13051 -+ /* Update weight both if it must be raised and if it must be lowered */
13052 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
13053 -+ __bfq_entity_update_weight_prio(
13054 -+ bfq_entity_service_tree(entity),
13055 -+ entity);
13056 - }
13057 -
13058 - /*
13059 -@@ -2075,6 +2266,25 @@ static void bfq_init_icq(struct io_cq *icq)
13060 - struct bfq_io_cq *bic = icq_to_bic(icq);
13061 -
13062 - bic->ttime.last_end_request = jiffies;
13063 -+ /*
13064 -+ * A newly created bic indicates that the process has just
13065 -+ * started doing I/O, and is probably mapping into memory its
13066 -+ * executable and libraries: it definitely needs weight raising.
13067 -+ * There is however the possibility that the process performs,
13068 -+ * for a while, I/O close to some other process. EQM intercepts
13069 -+ * this behavior and may merge the queue corresponding to the
13070 -+ * process with some other queue, BEFORE the weight of the queue
13071 -+ * is raised. Merged queues are not weight-raised (they are assumed
13072 -+ * to belong to processes that benefit only from high throughput).
13073 -+ * If the merge is basically the consequence of an accident, then
13074 -+ * the queue will be split soon and will get back its old weight.
13075 -+ * It is then important to write down somewhere that this queue
13076 -+ * does need weight raising, even if it did not make it to get its
13077 -+ * weight raised before being merged. To this purpose, we overload
13078 -+ * the field raising_time_left and assign 1 to it, to mark the queue
13079 -+ * as needing weight raising.
13080 -+ */
13081 -+ bic->raising_time_left = 1;
13082 - }
13083 -
13084 - static void bfq_exit_icq(struct io_cq *icq)
13085 -@@ -2088,6 +2298,13 @@ static void bfq_exit_icq(struct io_cq *icq)
13086 - }
13087 -
13088 - if (bic->bfqq[BLK_RW_SYNC]) {
13089 -+ /*
13090 -+ * If the bic is using a shared queue, put the reference
13091 -+ * taken on the io_context when the bic started using a
13092 -+ * shared bfq_queue.
13093 -+ */
13094 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
13095 -+ put_io_context(icq->ioc);
13096 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
13097 - bic->bfqq[BLK_RW_SYNC] = NULL;
13098 - }
13099 -@@ -2375,6 +2592,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
13100 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
13101 - return;
13102 -
13103 -+ /* Idle window just restored, statistics are meaningless. */
13104 -+ if (bfq_bfqq_just_split(bfqq))
13105 -+ return;
13106 -+
13107 - enable_idle = bfq_bfqq_idle_window(bfqq);
13108 -
13109 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
13110 -@@ -2415,6 +2636,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13111 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
13112 - !BFQQ_SEEKY(bfqq))
13113 - bfq_update_idle_window(bfqd, bfqq, bic);
13114 -+ bfq_clear_bfqq_just_split(bfqq);
13115 -
13116 - bfq_log_bfqq(bfqd, bfqq,
13117 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
13118 -@@ -2475,13 +2697,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13119 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
13120 - {
13121 - struct bfq_data *bfqd = q->elevator->elevator_data;
13122 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
13123 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
13124 -
13125 - assert_spin_locked(bfqd->queue->queue_lock);
13126 -+
13127 -+ /*
13128 -+ * An unplug may trigger a requeue of a request from the device
13129 -+ * driver: make sure we are in process context while trying to
13130 -+ * merge two bfq_queues.
13131 -+ */
13132 -+ if (!in_interrupt()) {
13133 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
13134 -+ if (new_bfqq != NULL) {
13135 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
13136 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
13137 -+ /*
13138 -+ * Release the request's reference to the old bfqq
13139 -+ * and make sure one is taken to the shared queue.
13140 -+ */
13141 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
13142 -+ bfqq->allocated[rq_data_dir(rq)]--;
13143 -+ atomic_inc(&new_bfqq->ref);
13144 -+ bfq_put_queue(bfqq);
13145 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
13146 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
13147 -+ bfqq, new_bfqq);
13148 -+ rq->elv.priv[1] = new_bfqq;
13149 -+ bfqq = new_bfqq;
13150 -+ }
13151 -+ }
13152 -+
13153 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
13154 -
13155 - bfq_add_rq_rb(rq);
13156 -
13157 -+ /*
13158 -+ * Here a newly-created bfq_queue has already started a weight-raising
13159 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
13160 -+ * from assigning it a full weight-raising period. See the detailed
13161 -+ * comments about this field in bfq_init_icq().
13162 -+ */
13163 -+ if (bfqq->bic != NULL)
13164 -+ bfqq->bic->raising_time_left = 0;
13165 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
13166 - list_add_tail(&rq->queuelist, &bfqq->fifo);
13167 -
13168 -@@ -2629,18 +2886,6 @@ static void bfq_put_request(struct request *rq)
13169 - }
13170 - }
13171 -
13172 --static struct bfq_queue *
13173 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13174 -- struct bfq_queue *bfqq)
13175 --{
13176 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13177 -- (long unsigned)bfqq->new_bfqq->pid);
13178 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
13179 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
13180 -- bfq_put_queue(bfqq);
13181 -- return bic_to_bfqq(bic, 1);
13182 --}
13183 --
13184 - /*
13185 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
13186 - * was the last process referring to said bfqq.
13187 -@@ -2649,6 +2894,9 @@ static struct bfq_queue *
13188 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
13189 - {
13190 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
13191 -+
13192 -+ put_io_context(bic->icq.ioc);
13193 -+
13194 - if (bfqq_process_refs(bfqq) == 1) {
13195 - bfqq->pid = current->pid;
13196 - bfq_clear_bfqq_coop(bfqq);
13197 -@@ -2677,6 +2925,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
13198 - struct bfq_queue *bfqq;
13199 - struct bfq_group *bfqg;
13200 - unsigned long flags;
13201 -+ bool split = false;
13202 -
13203 - might_sleep_if(gfp_mask & __GFP_WAIT);
13204 -
13205 -@@ -2695,24 +2944,14 @@ new_queue:
13206 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
13207 - bic_set_bfqq(bic, bfqq, is_sync);
13208 - } else {
13209 -- /*
13210 -- * If the queue was seeky for too long, break it apart.
13211 -- */
13212 -+ /* If the queue was seeky for too long, break it apart. */
13213 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
13214 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
13215 - bfqq = bfq_split_bfqq(bic, bfqq);
13216 -+ split = true;
13217 - if (!bfqq)
13218 - goto new_queue;
13219 - }
13220 --
13221 -- /*
13222 -- * Check to see if this queue is scheduled to merge with
13223 -- * another closely cooperating queue. The merging of queues
13224 -- * happens here as it must be done in process context.
13225 -- * The reference on new_bfqq was taken in merge_bfqqs.
13226 -- */
13227 -- if (bfqq->new_bfqq != NULL)
13228 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
13229 - }
13230 -
13231 - bfqq->allocated[rw]++;
13232 -@@ -2723,6 +2962,26 @@ new_queue:
13233 - rq->elv.priv[0] = bic;
13234 - rq->elv.priv[1] = bfqq;
13235 -
13236 -+ /*
13237 -+ * If a bfq_queue has only one process reference, it is owned
13238 -+ * by only one bfq_io_cq: we can set the bic field of the
13239 -+ * bfq_queue to the address of that structure. Also, if the
13240 -+ * queue has just been split, mark a flag so that the
13241 -+ * information is available to the other scheduler hooks.
13242 -+ */
13243 -+ if (bfqq_process_refs(bfqq) == 1) {
13244 -+ bfqq->bic = bic;
13245 -+ if (split) {
13246 -+ bfq_mark_bfqq_just_split(bfqq);
13247 -+ /*
13248 -+ * If the queue has just been split from a shared queue,
13249 -+ * restore the idle window and the possible weight
13250 -+ * raising period.
13251 -+ */
13252 -+ bfq_bfqq_resume_state(bfqq, bic);
13253 -+ }
13254 -+ }
13255 -+
13256 - spin_unlock_irqrestore(q->queue_lock, flags);
13257 -
13258 - return 0;
13259 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
13260 -index 30df81c..47e66a8 100644
13261 ---- a/block/bfq-sched.c
13262 -+++ b/block/bfq-sched.c
13263 -@@ -979,34 +979,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
13264 - return bfqq;
13265 - }
13266 -
13267 --/*
13268 -- * Forced extraction of the given queue.
13269 -- */
13270 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
13271 -- struct bfq_queue *bfqq)
13272 --{
13273 -- struct bfq_entity *entity;
13274 -- struct bfq_sched_data *sd;
13275 --
13276 -- BUG_ON(bfqd->in_service_queue != NULL);
13277 --
13278 -- entity = &bfqq->entity;
13279 -- /*
13280 -- * Bubble up extraction/update from the leaf to the root.
13281 -- */
13282 -- for_each_entity(entity) {
13283 -- sd = entity->sched_data;
13284 -- bfq_update_budget(entity);
13285 -- bfq_update_vtime(bfq_entity_service_tree(entity));
13286 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
13287 -- sd->active_entity = entity;
13288 -- sd->next_active = NULL;
13289 -- entity->service = 0;
13290 -- }
13291 --
13292 -- return;
13293 --}
13294 --
13295 - static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
13296 - {
13297 - if (bfqd->in_service_bic != NULL) {
13298 -diff --git a/block/bfq.h b/block/bfq.h
13299 -index 68b28e3..438f560 100644
13300 ---- a/block/bfq.h
13301 -+++ b/block/bfq.h
13302 -@@ -192,6 +192,8 @@ struct bfq_group;
13303 - * idle to backlogged
13304 - * @service_from_backlogged: cumulative service received from the @bfq_queue
13305 - * since the last transition from idle to backlogged
13306 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
13307 -+ * queue is shared
13308 - *
13309 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
13310 - * or more (if it is an async one). @cgroup holds a reference to the
13311 -@@ -235,6 +237,7 @@ struct bfq_queue {
13312 - sector_t last_request_pos;
13313 -
13314 - pid_t pid;
13315 -+ struct bfq_io_cq *bic;
13316 -
13317 - /* weight-raising fields */
13318 - unsigned int raising_cur_max_time;
13319 -@@ -264,12 +267,23 @@ struct bfq_ttime {
13320 - * @icq: associated io_cq structure
13321 - * @bfqq: array of two process queues, the sync and the async
13322 - * @ttime: associated @bfq_ttime struct
13323 -+ * @raising_time_left: snapshot of the time left before weight raising ends
13324 -+ * for the sync queue associated to this process; this
13325 -+ * snapshot is taken to remember this value while the weight
13326 -+ * raising is suspended because the queue is merged with a
13327 -+ * shared queue, and is used to set @raising_cur_max_time
13328 -+ * when the queue is split from the shared queue and its
13329 -+ * weight is raised again
13330 -+ * @saved_idle_window: same purpose as the previous field for the idle window
13331 - */
13332 - struct bfq_io_cq {
13333 - struct io_cq icq; /* must be the first member */
13334 - struct bfq_queue *bfqq[2];
13335 - struct bfq_ttime ttime;
13336 - int ioprio;
13337 -+
13338 -+ unsigned int raising_time_left;
13339 -+ unsigned int saved_idle_window;
13340 - };
13341 -
13342 - /**
13343 -@@ -411,6 +425,7 @@ enum bfqq_state_flags {
13344 - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
13345 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
13346 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
13347 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
13348 - BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
13349 - };
13350 -
13351 -@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
13352 - BFQ_BFQQ_FNS(budget_new);
13353 - BFQ_BFQQ_FNS(coop);
13354 - BFQ_BFQQ_FNS(split_coop);
13355 -+BFQ_BFQQ_FNS(just_split);
13356 - BFQ_BFQQ_FNS(softrt_update);
13357 - #undef BFQ_BFQQ_FNS
13358 -
13359 ---
13360 -1.8.5.2
13361 -
13362
13363 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
13364 ===================================================================
13365 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-03-26 23:50:52 UTC (rev 2715)
13366 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-03-31 12:03:14 UTC (rev 2716)
13367 @@ -1,1034 +0,0 @@
13368 -From 39b1dba58b2562ba0d93a33a4f9af662d3c790c5 Mon Sep 17 00:00:00 2001
13369 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
13370 -Date: Thu, 23 Jan 2014 16:54:44 +0100
13371 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r1 for
13372 - 3.13.0
13373 -
13374 -A set of processes may happen to perform interleaved reads, i.e., requests
13375 -whose union would give rise to a sequential read pattern. There are two
13376 -typical cases: in the first case, processes read fixed-size chunks of
13377 -data at a fixed distance from each other, while in the second case processes
13378 -may read variable-size chunks at variable distances. The latter case occurs
13379 -for example with KVM, which splits the I/O generated by the guest into
13380 -multiple chunks, and lets these chunks be served by a pool of cooperating
13381 -processes, iteratively assigning the next chunk of I/O to the first
13382 -available process. CFQ uses actual queue merging for the first type of
13383 -processes, whereas it uses preemption to get a sequential read pattern out
13384 -of the read requests performed by the second type of processes. In the end
13385 -it uses two different mechanisms to achieve the same goal: boosting the
13386 -throughput with interleaved I/O.
13387 -
13388 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
13389 -sequential read pattern with both types of processes. The main idea is
13390 -checking newly arrived requests against the next request of the active queue
13391 -both in case of actual request insert and in case of request merge. By doing
13392 -so, both the types of processes can be handled by just merging their queues.
13393 -EQM is then simpler and more compact than the pair of mechanisms used in
13394 -CFQ.
13395 -
13396 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
13397 -properly restoring the weight-raising state of a queue when it gets back to
13398 -a non-merged state.
13399 -
13400 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
13401 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
13402 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
13403 ----
13404 - block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
13405 - block/bfq-sched.c | 28 ---
13406 - block/bfq.h | 16 ++
13407 - 3 files changed, 474 insertions(+), 227 deletions(-)
13408 -
13409 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
13410 -index eb760de..06ee844 100644
13411 ---- a/block/bfq-iosched.c
13412 -+++ b/block/bfq-iosched.c
13413 -@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
13414 - return dur;
13415 - }
13416 -
13417 -+static inline void
13418 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
13419 -+{
13420 -+ if (bic->saved_idle_window)
13421 -+ bfq_mark_bfqq_idle_window(bfqq);
13422 -+ else
13423 -+ bfq_clear_bfqq_idle_window(bfqq);
13424 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
13425 -+ /*
13426 -+ * Start a weight raising period with the duration given by
13427 -+ * the raising_time_left snapshot.
13428 -+ */
13429 -+ if (bfq_bfqq_busy(bfqq))
13430 -+ bfqq->bfqd->raised_busy_queues++;
13431 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
13432 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
13433 -+ bfqq->last_rais_start_finish = jiffies;
13434 -+ bfqq->entity.ioprio_changed = 1;
13435 -+ }
13436 -+ /*
13437 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
13438 -+ * getting confused about the queue's need of a weight-raising
13439 -+ * period.
13440 -+ */
13441 -+ bic->raising_time_left = 0;
13442 -+}
13443 -+
13444 -+/*
13445 -+ * Must be called with the queue_lock held.
13446 -+ */
13447 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
13448 -+{
13449 -+ int process_refs, io_refs;
13450 -+
13451 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13452 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13453 -+ BUG_ON(process_refs < 0);
13454 -+ return process_refs;
13455 -+}
13456 -+
13457 - static void bfq_add_rq_rb(struct request *rq)
13458 - {
13459 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
13460 -@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
13461 - if (!bfqd->low_latency)
13462 - goto add_bfqq_busy;
13463 -
13464 -+ if (bfq_bfqq_just_split(bfqq))
13465 -+ goto set_ioprio_changed;
13466 -+
13467 - /*
13468 -- * If the queue is not being boosted and has been idle
13469 -- * for enough time, start a weight-raising period
13470 -+ * If the queue:
13471 -+ * - is not being boosted,
13472 -+ * - has been idle for enough time,
13473 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
13474 -+ * shared "for its nature" or it is not shared and its
13475 -+ * requests have not been redirected to a shared queue)
13476 -+ * start a weight-raising period.
13477 - */
13478 -- if (old_raising_coeff == 1 &&
13479 -- (idle_for_long_time || soft_rt)) {
13480 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
13481 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
13482 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
13483 - if (idle_for_long_time)
13484 - bfqq->raising_cur_max_time =
13485 -@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq)
13486 - bfqd->bfq_raising_rt_max_time;
13487 - }
13488 - }
13489 -+set_ioprio_changed:
13490 - if (old_raising_coeff != bfqq->raising_coeff)
13491 - entity->ioprio_changed = 1;
13492 - add_bfqq_busy:
13493 -@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
13494 - spin_unlock_irq(bfqd->queue->queue_lock);
13495 - }
13496 -
13497 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13498 -- struct bio *bio)
13499 --{
13500 -- struct bfq_data *bfqd = q->elevator->elevator_data;
13501 -- struct bfq_io_cq *bic;
13502 -- struct bfq_queue *bfqq;
13503 --
13504 -- /*
13505 -- * Disallow merge of a sync bio into an async request.
13506 -- */
13507 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13508 -- return 0;
13509 --
13510 -- /*
13511 -- * Lookup the bfqq that this bio will be queued with. Allow
13512 -- * merge only if rq is queued there.
13513 -- * Queue lock is held here.
13514 -- */
13515 -- bic = bfq_bic_lookup(bfqd, current->io_context);
13516 -- if (bic == NULL)
13517 -- return 0;
13518 --
13519 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13520 -- return bfqq == RQ_BFQQ(rq);
13521 --}
13522 --
13523 --static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13524 -- struct bfq_queue *bfqq)
13525 --{
13526 -- if (bfqq != NULL) {
13527 -- bfq_mark_bfqq_must_alloc(bfqq);
13528 -- bfq_mark_bfqq_budget_new(bfqq);
13529 -- bfq_clear_bfqq_fifo_expire(bfqq);
13530 --
13531 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13532 --
13533 -- bfq_log_bfqq(bfqd, bfqq,
13534 -- "set_in_service_queue, cur-budget = %lu",
13535 -- bfqq->entity.budget);
13536 -- }
13537 --
13538 -- bfqd->in_service_queue = bfqq;
13539 --}
13540 --
13541 --/*
13542 -- * Get and set a new queue for service.
13543 -- */
13544 --static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
13545 -- struct bfq_queue *bfqq)
13546 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
13547 - {
13548 -- if (!bfqq)
13549 -- bfqq = bfq_get_next_queue(bfqd);
13550 -+ if (request)
13551 -+ return blk_rq_pos(io_struct);
13552 - else
13553 -- bfq_get_next_queue_forced(bfqd, bfqq);
13554 --
13555 -- __bfq_set_in_service_queue(bfqd, bfqq);
13556 -- return bfqq;
13557 -+ return ((struct bio *)io_struct)->bi_sector;
13558 - }
13559 -
13560 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
13561 -- struct request *rq)
13562 -+static inline sector_t bfq_dist_from(sector_t pos1,
13563 -+ sector_t pos2)
13564 - {
13565 -- if (blk_rq_pos(rq) >= bfqd->last_position)
13566 -- return blk_rq_pos(rq) - bfqd->last_position;
13567 -+ if (pos1 >= pos2)
13568 -+ return pos1 - pos2;
13569 - else
13570 -- return bfqd->last_position - blk_rq_pos(rq);
13571 -+ return pos2 - pos1;
13572 - }
13573 -
13574 --/*
13575 -- * Return true if bfqq has no request pending and rq is close enough to
13576 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
13577 -- * bfqq->next_rq
13578 -- */
13579 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
13580 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
13581 -+ sector_t sector)
13582 - {
13583 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
13584 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
13585 -+ BFQQ_SEEK_THR;
13586 - }
13587 -
13588 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13589 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
13590 - {
13591 - struct rb_root *root = &bfqd->rq_pos_tree;
13592 - struct rb_node *parent, *node;
13593 - struct bfq_queue *__bfqq;
13594 -- sector_t sector = bfqd->last_position;
13595 -
13596 - if (RB_EMPTY_ROOT(root))
13597 - return NULL;
13598 -@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13599 - * position).
13600 - */
13601 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
13602 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13603 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13604 - return __bfqq;
13605 -
13606 - if (blk_rq_pos(__bfqq->next_rq) < sector)
13607 -@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13608 - return NULL;
13609 -
13610 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
13611 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13612 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13613 - return __bfqq;
13614 -
13615 - return NULL;
13616 -@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13617 - /*
13618 - * bfqd - obvious
13619 - * cur_bfqq - passed in so that we don't decide that the current queue
13620 -- * is closely cooperating with itself.
13621 -- *
13622 -- * We are assuming that cur_bfqq has dispatched at least one request,
13623 -- * and that bfqd->last_position reflects a position on the disk associated
13624 -- * with the I/O issued by cur_bfqq.
13625 -+ * is closely cooperating with itself
13626 -+ * sector - used as a reference point to search for a close queue
13627 - */
13628 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13629 -- struct bfq_queue *cur_bfqq)
13630 -+ struct bfq_queue *cur_bfqq,
13631 -+ sector_t sector)
13632 - {
13633 - struct bfq_queue *bfqq;
13634 -
13635 -@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13636 - * working closely on the same area of the disk. In that case,
13637 - * we can group them together and don't waste time idling.
13638 - */
13639 -- bfqq = bfqq_close(bfqd);
13640 -+ bfqq = bfqq_close(bfqd, sector);
13641 - if (bfqq == NULL || bfqq == cur_bfqq)
13642 - return NULL;
13643 -
13644 -@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13645 - return bfqq;
13646 - }
13647 -
13648 -+static struct bfq_queue *
13649 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13650 -+{
13651 -+ int process_refs, new_process_refs;
13652 -+ struct bfq_queue *__bfqq;
13653 -+
13654 -+ /*
13655 -+ * If there are no process references on the new_bfqq, then it is
13656 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13657 -+ * may have dropped their last reference (not just their last process
13658 -+ * reference).
13659 -+ */
13660 -+ if (!bfqq_process_refs(new_bfqq))
13661 -+ return NULL;
13662 -+
13663 -+ /* Avoid a circular list and skip interim queue merges. */
13664 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
13665 -+ if (__bfqq == bfqq)
13666 -+ return NULL;
13667 -+ new_bfqq = __bfqq;
13668 -+ }
13669 -+
13670 -+ process_refs = bfqq_process_refs(bfqq);
13671 -+ new_process_refs = bfqq_process_refs(new_bfqq);
13672 -+ /*
13673 -+ * If the process for the bfqq has gone away, there is no
13674 -+ * sense in merging the queues.
13675 -+ */
13676 -+ if (process_refs == 0 || new_process_refs == 0)
13677 -+ return NULL;
13678 -+
13679 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13680 -+ new_bfqq->pid);
13681 -+
13682 -+ /*
13683 -+ * Merging is just a redirection: the requests of the process owning
13684 -+ * one of the two queues are redirected to the other queue. The latter
13685 -+ * queue, in its turn, is set as shared if this is the first time that
13686 -+ * the requests of some process are redirected to it.
13687 -+ *
13688 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
13689 -+ * are in the context of the process owning bfqq, hence we have the
13690 -+ * io_cq of this process. So we can immediately configure this io_cq
13691 -+ * to redirect the requests of the process to new_bfqq.
13692 -+ *
13693 -+ * NOTE, even if new_bfqq coincides with the in-service queue, the
13694 -+ * io_cq of new_bfqq is not available, because, if the in-service queue
13695 -+ * is shared, bfqd->in_service_bic may not point to the io_cq of the
13696 -+ * in-service queue.
13697 -+ * Redirecting the requests of the process owning bfqq to the currently
13698 -+ * in-service queue is in any case the best option, as we feed the
13699 -+ * in-service queue with new requests close to the last request served
13700 -+ * and, by doing so, hopefully increase the throughput.
13701 -+ */
13702 -+ bfqq->new_bfqq = new_bfqq;
13703 -+ atomic_add(process_refs, &new_bfqq->ref);
13704 -+ return new_bfqq;
13705 -+}
13706 -+
13707 -+/*
13708 -+ * Attempt to schedule a merge of bfqq with the currently in-service queue or
13709 -+ * with a close queue among the scheduled queues.
13710 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
13711 -+ * structure otherwise.
13712 -+ */
13713 -+static struct bfq_queue *
13714 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13715 -+ void *io_struct, bool request)
13716 -+{
13717 -+ struct bfq_queue *in_service_bfqq, *new_bfqq;
13718 -+
13719 -+ if (bfqq->new_bfqq)
13720 -+ return bfqq->new_bfqq;
13721 -+
13722 -+ if (!io_struct)
13723 -+ return NULL;
13724 -+
13725 -+ in_service_bfqq = bfqd->in_service_queue;
13726 -+
13727 -+ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
13728 -+ !bfqd->in_service_bic)
13729 -+ goto check_scheduled;
13730 -+
13731 -+ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
13732 -+ goto check_scheduled;
13733 -+
13734 -+ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
13735 -+ goto check_scheduled;
13736 -+
13737 -+ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
13738 -+ goto check_scheduled;
13739 -+
13740 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
13741 -+ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
13742 -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
13743 -+ if (new_bfqq != NULL)
13744 -+ return new_bfqq; /* Merge with the in-service queue */
13745 -+ }
13746 -+
13747 -+ /*
13748 -+ * Check whether there is a cooperator among currently scheduled
13749 -+ * queues. The only thing we need is that the bio/request is not
13750 -+ * NULL, as we need it to establish whether a cooperator exists.
13751 -+ */
13752 -+check_scheduled:
13753 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
13754 -+ bfq_io_struct_pos(io_struct, request));
13755 -+ if (new_bfqq)
13756 -+ return bfq_setup_merge(bfqq, new_bfqq);
13757 -+
13758 -+ return NULL;
13759 -+}
13760 -+
13761 -+static inline void
13762 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
13763 -+{
13764 -+ /*
13765 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
13766 -+ * have already been redirected to a shared queue; both idle window
13767 -+ * and weight raising state have already been saved. Do nothing.
13768 -+ */
13769 -+ if (bfqq->bic == NULL)
13770 -+ return;
13771 -+ if (bfqq->bic->raising_time_left)
13772 -+ /*
13773 -+ * This is the queue of a just-started process, and would
13774 -+ * deserve weight raising: we set raising_time_left to the full
13775 -+ * weight-raising duration to trigger weight-raising when and
13776 -+ * if the queue is split and the first request of the queue
13777 -+ * is enqueued.
13778 -+ */
13779 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
13780 -+ else if (bfqq->raising_coeff > 1) {
13781 -+ unsigned long wrais_duration =
13782 -+ jiffies - bfqq->last_rais_start_finish;
13783 -+ /*
13784 -+ * It may happen that a queue's weight raising period lasts
13785 -+ * longer than its raising_cur_max_time, as weight raising is
13786 -+ * handled only when a request is enqueued or dispatched (it
13787 -+ * does not use any timer). If the weight raising period is
13788 -+ * about to end, don't save it.
13789 -+ */
13790 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
13791 -+ bfqq->bic->raising_time_left = 0;
13792 -+ else
13793 -+ bfqq->bic->raising_time_left =
13794 -+ bfqq->raising_cur_max_time - wrais_duration;
13795 -+ /*
13796 -+ * The bfq_queue is becoming shared or the requests of the
13797 -+ * process owning the queue are being redirected to a shared
13798 -+ * queue. Stop the weight raising period of the queue, as in
13799 -+ * both cases it should not be owned by an interactive or soft
13800 -+ * real-time application.
13801 -+ */
13802 -+ bfq_bfqq_end_raising(bfqq);
13803 -+ } else
13804 -+ bfqq->bic->raising_time_left = 0;
13805 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
13806 -+}
13807 -+
13808 -+static inline void
13809 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
13810 -+{
13811 -+ /*
13812 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
13813 -+ * is about to begin using a shared bfq_queue.
13814 -+ */
13815 -+ if (bfqq->bic)
13816 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
13817 -+}
13818 -+
13819 -+static void
13820 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13821 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13822 -+{
13823 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13824 -+ (long unsigned)new_bfqq->pid);
13825 -+ /* Save weight raising and idle window of the merged queues */
13826 -+ bfq_bfqq_save_state(bfqq);
13827 -+ bfq_bfqq_save_state(new_bfqq);
13828 -+ /*
13829 -+ * Grab a reference to the bic, to prevent it from being destroyed
13830 -+ * before being possibly touched by a bfq_split_bfqq().
13831 -+ */
13832 -+ bfq_get_bic_reference(bfqq);
13833 -+ bfq_get_bic_reference(new_bfqq);
13834 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
13835 -+ bic_set_bfqq(bic, new_bfqq, 1);
13836 -+ bfq_mark_bfqq_coop(new_bfqq);
13837 -+ /*
13838 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
13839 -+ * new_bfqq->bic to NULL. bfqq either:
13840 -+ * - does not belong to any bic any more, and hence bfqq->bic must
13841 -+ * be set to NULL, or
13842 -+ * - is a queue whose owning bics have already been redirected to a
13843 -+ * different queue, hence the queue is destined to not belong to any
13844 -+ * bic soon and bfqq->bic is already NULL (therefore the next
13845 -+ * assignment causes no harm).
13846 -+ */
13847 -+ new_bfqq->bic = NULL;
13848 -+ bfqq->bic = NULL;
13849 -+ bfq_put_queue(bfqq);
13850 -+}
13851 -+
13852 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13853 -+ struct bio *bio)
13854 -+{
13855 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
13856 -+ struct bfq_io_cq *bic;
13857 -+ struct bfq_queue *bfqq, *new_bfqq;
13858 -+
13859 -+ /*
13860 -+ * Disallow merge of a sync bio into an async request.
13861 -+ */
13862 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13863 -+ return 0;
13864 -+
13865 -+ /*
13866 -+ * Lookup the bfqq that this bio will be queued with. Allow
13867 -+ * merge only if rq is queued there.
13868 -+ * Queue lock is held here.
13869 -+ */
13870 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
13871 -+ if (bic == NULL)
13872 -+ return 0;
13873 -+
13874 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13875 -+ /*
13876 -+ * We take advantage of this function to perform an early merge
13877 -+ * of the queues of possible cooperating processes.
13878 -+ */
13879 -+ if (bfqq != NULL) {
13880 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
13881 -+ if (new_bfqq != NULL) {
13882 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
13883 -+ /*
13884 -+ * If we get here, the bio will be queued in the shared queue,
13885 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
13886 -+ * rq can be merged.
13887 -+ */
13888 -+ bfqq = new_bfqq;
13889 -+ }
13890 -+ }
13891 -+
13892 -+ return bfqq == RQ_BFQQ(rq);
13893 -+}
13894 -+
13895 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13896 -+ struct bfq_queue *bfqq)
13897 -+{
13898 -+ if (bfqq != NULL) {
13899 -+ bfq_mark_bfqq_must_alloc(bfqq);
13900 -+ bfq_mark_bfqq_budget_new(bfqq);
13901 -+ bfq_clear_bfqq_fifo_expire(bfqq);
13902 -+
13903 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13904 -+
13905 -+ bfq_log_bfqq(bfqd, bfqq,
13906 -+ "set_in_service_queue, cur-budget = %lu",
13907 -+ bfqq->entity.budget);
13908 -+ }
13909 -+
13910 -+ bfqd->in_service_queue = bfqq;
13911 -+}
13912 -+
13913 -+/*
13914 -+ * Get and set a new queue for service.
13915 -+ */
13916 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
13917 -+{
13918 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
13919 -+
13920 -+ __bfq_set_in_service_queue(bfqd, bfqq);
13921 -+ return bfqq;
13922 -+}
13923 -+
13924 - /*
13925 - * If enough samples have been computed, return the current max budget
13926 - * stored in bfqd, which is dynamically updated according to the
13927 -@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
13928 - return rq;
13929 - }
13930 -
13931 --/*
13932 -- * Must be called with the queue_lock held.
13933 -- */
13934 --static int bfqq_process_refs(struct bfq_queue *bfqq)
13935 --{
13936 -- int process_refs, io_refs;
13937 --
13938 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13939 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13940 -- BUG_ON(process_refs < 0);
13941 -- return process_refs;
13942 --}
13943 --
13944 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13945 --{
13946 -- int process_refs, new_process_refs;
13947 -- struct bfq_queue *__bfqq;
13948 --
13949 -- /*
13950 -- * If there are no process references on the new_bfqq, then it is
13951 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13952 -- * may have dropped their last reference (not just their last process
13953 -- * reference).
13954 -- */
13955 -- if (!bfqq_process_refs(new_bfqq))
13956 -- return;
13957 --
13958 -- /* Avoid a circular list and skip interim queue merges. */
13959 -- while ((__bfqq = new_bfqq->new_bfqq)) {
13960 -- if (__bfqq == bfqq)
13961 -- return;
13962 -- new_bfqq = __bfqq;
13963 -- }
13964 --
13965 -- process_refs = bfqq_process_refs(bfqq);
13966 -- new_process_refs = bfqq_process_refs(new_bfqq);
13967 -- /*
13968 -- * If the process for the bfqq has gone away, there is no
13969 -- * sense in merging the queues.
13970 -- */
13971 -- if (process_refs == 0 || new_process_refs == 0)
13972 -- return;
13973 --
13974 -- /*
13975 -- * Merge in the direction of the lesser amount of work.
13976 -- */
13977 -- if (new_process_refs >= process_refs) {
13978 -- bfqq->new_bfqq = new_bfqq;
13979 -- atomic_add(process_refs, &new_bfqq->ref);
13980 -- } else {
13981 -- new_bfqq->new_bfqq = bfqq;
13982 -- atomic_add(new_process_refs, &bfqq->ref);
13983 -- }
13984 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13985 -- new_bfqq->pid);
13986 --}
13987 --
13988 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
13989 - {
13990 - struct bfq_entity *entity = &bfqq->entity;
13991 -@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
13992 - */
13993 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13994 - {
13995 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
13996 -+ struct bfq_queue *bfqq;
13997 - struct request *next_rq;
13998 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
13999 -
14000 -@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14001 -
14002 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
14003 -
14004 -- /*
14005 -- * If another queue has a request waiting within our mean seek
14006 -- * distance, let it run. The expire code will check for close
14007 -- * cooperators and put the close queue at the front of the
14008 -- * service tree. If possible, merge the expiring queue with the
14009 -- * new bfqq.
14010 -- */
14011 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
14012 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
14013 -- bfq_setup_merge(bfqq, new_bfqq);
14014 --
14015 - if (bfq_may_expire_for_budg_timeout(bfqq) &&
14016 - !timer_pending(&bfqd->idle_slice_timer) &&
14017 - !bfq_bfqq_must_idle(bfqq))
14018 -@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14019 - bfq_clear_bfqq_wait_request(bfqq);
14020 - del_timer(&bfqd->idle_slice_timer);
14021 - }
14022 -- if (new_bfqq == NULL)
14023 -- goto keep_queue;
14024 -- else
14025 -- goto expire;
14026 -+ goto keep_queue;
14027 - }
14028 - }
14029 -
14030 - /*
14031 -- * No requests pending. If the in-service queue has no cooperator and
14032 -- * still has requests in flight (possibly waiting for a completion)
14033 -- * or is idling for a new request, then keep it.
14034 -+ * No requests pending. If the in-service queue still has requests in
14035 -+ * flight (possibly waiting for a completion) or is idling for a new
14036 -+ * request, then keep it.
14037 - */
14038 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
14039 -- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
14040 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
14041 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
14042 - bfqq = NULL;
14043 - goto keep_queue;
14044 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
14045 -- /*
14046 -- * Expiring the queue because there is a close cooperator,
14047 -- * cancel timer.
14048 -- */
14049 -- bfq_clear_bfqq_wait_request(bfqq);
14050 -- del_timer(&bfqd->idle_slice_timer);
14051 - }
14052 -
14053 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
14054 - expire:
14055 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
14056 - new_queue:
14057 -- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
14058 -+ bfqq = bfq_set_in_service_queue(bfqd);
14059 - bfq_log(bfqd, "select_queue: new queue %d returned",
14060 - bfqq != NULL ? bfqq->pid : 0);
14061 - keep_queue:
14062 -@@ -1825,9 +2015,8 @@ keep_queue:
14063 - static void bfq_update_raising_data(struct bfq_data *bfqd,
14064 - struct bfq_queue *bfqq)
14065 - {
14066 -+ struct bfq_entity *entity = &bfqq->entity;
14067 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
14068 -- struct bfq_entity *entity = &bfqq->entity;
14069 --
14070 - bfq_log_bfqq(bfqd, bfqq,
14071 - "raising period dur %u/%u msec, "
14072 - "old raising coeff %u, w %d(%d)",
14073 -@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
14074 - "WARN: pending prio change");
14075 - /*
14076 - * If too much time has elapsed from the beginning
14077 -- * of this weight-raising, stop it.
14078 -+ * of this weight-raising period, stop it.
14079 - */
14080 - if (time_is_before_jiffies(bfqq->last_rais_start_finish +
14081 - bfqq->raising_cur_max_time)) {
14082 -@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
14083 - jiffies_to_msecs(bfqq->
14084 - raising_cur_max_time));
14085 - bfq_bfqq_end_raising(bfqq);
14086 -- __bfq_entity_update_weight_prio(
14087 -- bfq_entity_service_tree(entity),
14088 -- entity);
14089 - }
14090 - }
14091 -+ /* Update weight both if it must be raised and if it must be lowered */
14092 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
14093 -+ __bfq_entity_update_weight_prio(
14094 -+ bfq_entity_service_tree(entity),
14095 -+ entity);
14096 - }
14097 -
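The new condition above updates the entity weight whenever the current weight and the raising coefficient disagree, i.e. both when the weight still has to be raised and when it has to be lowered back. A tiny sketch of that comparison, with hypothetical plain-integer parameters, is:

    #include <stdbool.h>

    /* Hypothetical helper mirroring the check above: the two booleans
     * differing means the stored weight is out of sync with the
     * raising state, so an update is needed in either direction. */
    static bool weight_needs_update(int weight, int orig_weight, int raising_coeff)
    {
    	bool currently_raised = weight > orig_weight;
    	bool should_be_raised = raising_coeff > 1;

    	return currently_raised != should_be_raised;
    }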
14098 - /*
14099 -@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq)
14100 - struct bfq_io_cq *bic = icq_to_bic(icq);
14101 -
14102 - bic->ttime.last_end_request = jiffies;
14103 -+ /*
14104 -+ * A newly created bic indicates that the process has just
14105 -+ * started doing I/O, and is probably mapping into memory its
14106 -+ * executable and libraries: it definitely needs weight raising.
14107 -+ * There is however the possibility that the process performs,
14108 -+ * for a while, I/O close to some other process. EQM intercepts
14109 -+ * this behavior and may merge the queue corresponding to the
14110 -+ * process with some other queue, BEFORE the weight of the queue
14111 -+ * is raised. Merged queues are not weight-raised (they are assumed
14112 -+ * to belong to processes that benefit only from high throughput).
14113 -+ * If the merge is basically the consequence of an accident, then
14114 -+ * the queue will be split soon and will get back its old weight.
14115 -+	 * It is then important to record somewhere that this queue
14116 -+	 * does need weight raising, even if it never managed to get its
14117 -+	 * weight raised before being merged. To this end, we overload
14118 -+ * the field raising_time_left and assign 1 to it, to mark the queue
14119 -+ * as needing weight raising.
14120 -+ */
14121 -+ bic->raising_time_left = 1;
14122 - }

14123 -
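The comment above explains that a freshly created bfq_io_cq is tagged as deserving weight raising by storing the sentinel value 1 in raising_time_left. A minimal sketch of that marker idea, with a hypothetical stripped-down structure, might be:

    /* Hypothetical sketch of the sentinel used above: a nonzero
     * raising_time_left on a brand-new io context records that the
     * queue should (re)gain weight raising once it is split again. */
    struct toy_bic {
    	unsigned int raising_time_left;	/* 0 = none, >0 = pending or saved */
    };

    static void toy_init_bic(struct toy_bic *bic)
    {
    	bic->raising_time_left = 1;	/* mark as needing weight raising */
    }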
14124 - static void bfq_exit_icq(struct io_cq *icq)
14125 -@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq)
14126 - }
14127 -
14128 - if (bic->bfqq[BLK_RW_SYNC]) {
14129 -+ /*
14130 -+ * If the bic is using a shared queue, put the reference
14131 -+ * taken on the io_context when the bic started using a
14132 -+ * shared bfq_queue.
14133 -+ */
14134 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
14135 -+ put_io_context(icq->ioc);
14136 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
14137 - bic->bfqq[BLK_RW_SYNC] = NULL;
14138 - }
14139 -@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
14140 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
14141 - return;
14142 -
14143 -+ /* Idle window just restored, statistics are meaningless. */
14144 -+ if (bfq_bfqq_just_split(bfqq))
14145 -+ return;
14146 -+
14147 - enable_idle = bfq_bfqq_idle_window(bfqq);
14148 -
14149 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
14150 -@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14151 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
14152 - !BFQQ_SEEKY(bfqq))
14153 - bfq_update_idle_window(bfqd, bfqq, bic);
14154 -+ bfq_clear_bfqq_just_split(bfqq);
14155 -
14156 - bfq_log_bfqq(bfqd, bfqq,
14157 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
14158 -@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14159 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
14160 - {
14161 - struct bfq_data *bfqd = q->elevator->elevator_data;
14162 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
14163 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
14164 -
14165 - assert_spin_locked(bfqd->queue->queue_lock);
14166 -+
14167 -+ /*
14168 -+ * An unplug may trigger a requeue of a request from the device
14169 -+ * driver: make sure we are in process context while trying to
14170 -+ * merge two bfq_queues.
14171 -+ */
14172 -+ if (!in_interrupt()) {
14173 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
14174 -+ if (new_bfqq != NULL) {
14175 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
14176 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
14177 -+ /*
14178 -+ * Release the request's reference to the old bfqq
14179 -+ * and make sure one is taken to the shared queue.
14180 -+ */
14181 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
14182 -+ bfqq->allocated[rq_data_dir(rq)]--;
14183 -+ atomic_inc(&new_bfqq->ref);
14184 -+ bfq_put_queue(bfqq);
14185 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
14186 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
14187 -+ bfqq, new_bfqq);
14188 -+ rq->elv.priv[1] = new_bfqq;
14189 -+ bfqq = new_bfqq;
14190 -+ }
14191 -+ }
14192 -+
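The inserted block above re-homes the request from its old queue to the shared queue, keeping the per-queue allocation counters and reference counts balanced before the old queue's reference is dropped. A sketch of that hand-over pattern, with hypothetical simplified types, could be:

    /* Hypothetical sketch of the hand-over performed above: charge the
     * request to the new queue and pin it before releasing the old
     * queue, so neither queue is freed while the request still uses it. */
    struct toy_queue {
    	int allocated;		/* requests currently charged to this queue */
    	int refcount;
    };

    static struct toy_queue *toy_move_request(struct toy_queue *old_q,
    					      struct toy_queue *new_q)
    {
    	new_q->allocated++;
    	new_q->refcount++;	/* the request now pins the shared queue */
    	old_q->allocated--;
    	old_q->refcount--;	/* ...and no longer pins the old one */
    	return new_q;		/* caller records the new owner */
    }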
14193 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
14194 -
14195 - bfq_add_rq_rb(rq);
14196 -
14197 -+ /*
14198 -+ * Here a newly-created bfq_queue has already started a weight-raising
14199 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
14200 -+ * from assigning it a full weight-raising period. See the detailed
14201 -+ * comments about this field in bfq_init_icq().
14202 -+ */
14203 -+ if (bfqq->bic != NULL)
14204 -+ bfqq->bic->raising_time_left = 0;
14205 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
14206 - list_add_tail(&rq->queuelist, &bfqq->fifo);
14207 -
14208 -@@ -2659,18 +2916,6 @@ static void bfq_put_request(struct request *rq)
14209 - }
14210 - }
14211 -
14212 --static struct bfq_queue *
14213 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
14214 -- struct bfq_queue *bfqq)
14215 --{
14216 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
14217 -- (long unsigned)bfqq->new_bfqq->pid);
14218 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
14219 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
14220 -- bfq_put_queue(bfqq);
14221 -- return bic_to_bfqq(bic, 1);
14222 --}
14223 --
14224 - /*
14225 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
14226 - * was the last process referring to said bfqq.
14227 -@@ -2679,6 +2924,9 @@ static struct bfq_queue *
14228 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
14229 - {
14230 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
14231 -+
14232 -+ put_io_context(bic->icq.ioc);
14233 -+
14234 - if (bfqq_process_refs(bfqq) == 1) {
14235 - bfqq->pid = current->pid;
14236 - bfq_clear_bfqq_coop(bfqq);
14237 -@@ -2707,6 +2955,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
14238 - struct bfq_queue *bfqq;
14239 - struct bfq_group *bfqg;
14240 - unsigned long flags;
14241 -+ bool split = false;
14242 -
14243 - might_sleep_if(gfp_mask & __GFP_WAIT);
14244 -
14245 -@@ -2725,24 +2974,14 @@ new_queue:
14246 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
14247 - bic_set_bfqq(bic, bfqq, is_sync);
14248 - } else {
14249 -- /*
14250 -- * If the queue was seeky for too long, break it apart.
14251 -- */
14252 -+ /* If the queue was seeky for too long, break it apart. */
14253 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
14254 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
14255 - bfqq = bfq_split_bfqq(bic, bfqq);
14256 -+ split = true;
14257 - if (!bfqq)
14258 - goto new_queue;
14259 - }
14260 --
14261 -- /*
14262 -- * Check to see if this queue is scheduled to merge with
14263 -- * another closely cooperating queue. The merging of queues
14264 -- * happens here as it must be done in process context.
14265 -- * The reference on new_bfqq was taken in merge_bfqqs.
14266 -- */
14267 -- if (bfqq->new_bfqq != NULL)
14268 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
14269 - }
14270 -
14271 - bfqq->allocated[rw]++;
14272 -@@ -2753,6 +2992,26 @@ new_queue:
14273 - rq->elv.priv[0] = bic;
14274 - rq->elv.priv[1] = bfqq;
14275 -
14276 -+ /*
14277 -+ * If a bfq_queue has only one process reference, it is owned
14278 -+ * by only one bfq_io_cq: we can set the bic field of the
14279 -+ * bfq_queue to the address of that structure. Also, if the
14280 -+ * queue has just been split, mark a flag so that the
14281 -+ * information is available to the other scheduler hooks.
14282 -+ */
14283 -+ if (bfqq_process_refs(bfqq) == 1) {
14284 -+ bfqq->bic = bic;
14285 -+ if (split) {
14286 -+ bfq_mark_bfqq_just_split(bfqq);
14287 -+ /*
14288 -+ * If the queue has just been split from a shared queue,
14289 -+ * restore the idle window and the possible weight
14290 -+ * raising period.
14291 -+ */
14292 -+ bfq_bfqq_resume_state(bfqq, bic);
14293 -+ }
14294 -+ }
14295 -+
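The block above attaches the owning bfq_io_cq to queues that have a single process reference and, when the queue has just been split from a shared one, restores the idle window and weight-raising state saved in that bfq_io_cq. A minimal sketch of the restore step, assuming only the simplified fields documented later in this patch (raising_time_left, saved_idle_window), might be:

    /* Hypothetical sketch: state parked in the io-context wrapper while
     * the queue was shared is copied back once the process owns a
     * private queue again after a split. */
    struct toy_bic {
    	unsigned int raising_time_left;
    	unsigned int saved_idle_window;
    };

    struct toy_queue {
    	struct toy_bic *bic;		/* owner, valid only for private queues */
    	unsigned int raising_left;
    	unsigned int idle_window;
    };

    static void toy_resume_state(struct toy_queue *q, struct toy_bic *bic)
    {
    	q->bic = bic;
    	q->raising_left = bic->raising_time_left;
    	q->idle_window = bic->saved_idle_window;
    }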
14296 - spin_unlock_irqrestore(q->queue_lock, flags);
14297 -
14298 - return 0;
14299 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
14300 -index 999b475..e54ea33 100644
14301 ---- a/block/bfq-sched.c
14302 -+++ b/block/bfq-sched.c
14303 -@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
14304 - return bfqq;
14305 - }
14306 -
14307 --/*
14308 -- * Forced extraction of the given queue.
14309 -- */
14310 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
14311 -- struct bfq_queue *bfqq)
14312 --{
14313 -- struct bfq_entity *entity;
14314 -- struct bfq_sched_data *sd;
14315 --
14316 -- BUG_ON(bfqd->in_service_queue != NULL);
14317 --
14318 -- entity = &bfqq->entity;
14319 -- /*
14320 -- * Bubble up extraction/update from the leaf to the root.
14321 -- */
14322 -- for_each_entity(entity) {
14323 -- sd = entity->sched_data;
14324 -- bfq_update_budget(entity);
14325 -- bfq_update_vtime(bfq_entity_service_tree(entity));
14326 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
14327 -- sd->active_entity = entity;
14328 -- sd->next_active = NULL;
14329 -- entity->service = 0;
14330 -- }
14331 --
14332 -- return;
14333 --}
14334 --
14335 - static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
14336 - {
14337 - if (bfqd->in_service_bic != NULL) {
14338 -diff --git a/block/bfq.h b/block/bfq.h
14339 -index f9b5881..0bfad40 100644
14340 ---- a/block/bfq.h
14341 -+++ b/block/bfq.h
14342 -@@ -192,6 +192,8 @@ struct bfq_group;
14343 - * idle to backlogged
14344 - * @service_from_backlogged: cumulative service received from the @bfq_queue
14345 - * since the last transition from idle to backlogged
14346 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
14347 -+ * queue is shared
14348 - *
14349 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
14350 - * or more (if it is an async one). @cgroup holds a reference to the
14351 -@@ -235,6 +237,7 @@ struct bfq_queue {
14352 - sector_t last_request_pos;
14353 -
14354 - pid_t pid;
14355 -+ struct bfq_io_cq *bic;
14356 -
14357 - /* weight-raising fields */
14358 - unsigned long raising_cur_max_time;
14359 -@@ -264,12 +267,23 @@ struct bfq_ttime {
14360 - * @icq: associated io_cq structure
14361 - * @bfqq: array of two process queues, the sync and the async
14362 - * @ttime: associated @bfq_ttime struct
14363 -+ * @raising_time_left: snapshot of the time left before weight raising ends
14364 -+ * for the sync queue associated to this process; this
14365 -+ * snapshot is taken to remember this value while the weight
14366 -+ * raising is suspended because the queue is merged with a
14367 -+ * shared queue, and is used to set @raising_cur_max_time
14368 -+ * when the queue is split from the shared queue and its
14369 -+ * weight is raised again
14370 -+ * @saved_idle_window: same purpose as the previous field for the idle window
14371 - */
14372 - struct bfq_io_cq {
14373 - struct io_cq icq; /* must be the first member */
14374 - struct bfq_queue *bfqq[2];
14375 - struct bfq_ttime ttime;
14376 - int ioprio;
14377 -+
14378 -+ unsigned int raising_time_left;
14379 -+ unsigned int saved_idle_window;
14380 - };
14381 -
14382 - /**
14383 -@@ -411,6 +425,7 @@ enum bfqq_state_flags {
14384 - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
14385 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
14386 - 	BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
14387 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
14388 - BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
14389 - };
14390 -
14391 -@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
14392 - BFQ_BFQQ_FNS(budget_new);
14393 - BFQ_BFQQ_FNS(coop);
14394 - BFQ_BFQQ_FNS(split_coop);
14395 -+BFQ_BFQQ_FNS(just_split);
14396 - BFQ_BFQQ_FNS(softrt_update);
14397 - #undef BFQ_BFQQ_FNS
14398 -
14399 ---
14400 -1.8.5.2
14401 -
14402
14403 Deleted: genpatches-2.6/trunk/3.14/5000_enable-additional-cpu-optimizations-for-gcc.patch
14404 ===================================================================
14405 --- genpatches-2.6/trunk/3.14/5000_enable-additional-cpu-optimizations-for-gcc.patch 2014-03-26 23:50:52 UTC (rev 2715)
14406 +++ genpatches-2.6/trunk/3.14/5000_enable-additional-cpu-optimizations-for-gcc.patch 2014-03-31 12:03:14 UTC (rev 2716)
14407 @@ -1,325 +0,0 @@
14408 -This patch has been tested on and known to work with kernel versions from 3.2
14409 -up to the latest git version (pulled on 12/14/2013).
14410 -
14411 -This patch will expand the number of microarchitectures to include new
14412 -processors including: AMD K10-family, AMD Family 10h (Barcelona), AMD Family
14413 -14h (Bobcat), AMD Family 15h (Bulldozer), AMD Family 15h (Piledriver), AMD
14414 -Family 16h (Jaguar), Intel 1st Gen Core i3/i5/i7 (Nehalem), Intel 2nd Gen Core
14415 -i3/i5/i7 (Sandybridge), Intel 3rd Gen Core i3/i5/i7 (Ivybridge), and Intel 4th
14416 -Gen Core i3/i5/i7 (Haswell). It also offers the compiler the 'native' flag.
14417 -
14418 -Small but real speed increases are measurable using a make-based benchmark comparing
14419 -a generic kernel to one built with one of the respective microarchitectures.
14420 -
14421 -See the following experimental evidence supporting this statement:
14422 -https://github.com/graysky2/kernel_gcc_patch
14423 -
14424 ----
14425 -diff -uprN a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
14426 ---- a/arch/x86/include/asm/module.h 2013-11-03 18:41:51.000000000 -0500
14427 -+++ b/arch/x86/include/asm/module.h 2013-12-15 06:21:24.351122516 -0500
14428 -@@ -15,6 +15,16 @@
14429 - #define MODULE_PROC_FAMILY "586MMX "
14430 - #elif defined CONFIG_MCORE2
14431 - #define MODULE_PROC_FAMILY "CORE2 "
14432 -+#elif defined CONFIG_MNATIVE
14433 -+#define MODULE_PROC_FAMILY "NATIVE "
14434 -+#elif defined CONFIG_MCOREI7
14435 -+#define MODULE_PROC_FAMILY "COREI7 "
14436 -+#elif defined CONFIG_MCOREI7AVX
14437 -+#define MODULE_PROC_FAMILY "COREI7AVX "
14438 -+#elif defined CONFIG_MCOREAVXI
14439 -+#define MODULE_PROC_FAMILY "COREAVXI "
14440 -+#elif defined CONFIG_MCOREAVX2
14441 -+#define MODULE_PROC_FAMILY "COREAVX2 "
14442 - #elif defined CONFIG_MATOM
14443 - #define MODULE_PROC_FAMILY "ATOM "
14444 - #elif defined CONFIG_M686
14445 -@@ -33,6 +43,18 @@
14446 - #define MODULE_PROC_FAMILY "K7 "
14447 - #elif defined CONFIG_MK8
14448 - #define MODULE_PROC_FAMILY "K8 "
14449 -+#elif defined CONFIG_MK10
14450 -+#define MODULE_PROC_FAMILY "K10 "
14451 -+#elif defined CONFIG_MBARCELONA
14452 -+#define MODULE_PROC_FAMILY "BARCELONA "
14453 -+#elif defined CONFIG_MBOBCAT
14454 -+#define MODULE_PROC_FAMILY "BOBCAT "
14455 -+#elif defined CONFIG_MBULLDOZER
14456 -+#define MODULE_PROC_FAMILY "BULLDOZER "
14457 -+#elif defined CONFIG_MPILEDRIVER
14458 -+#define MODULE_PROC_FAMILY "PILEDRIVER "
14459 -+#elif defined CONFIG_MJAGUAR
14460 -+#define MODULE_PROC_FAMILY "JAGUAR "
14461 - #elif defined CONFIG_MELAN
14462 - #define MODULE_PROC_FAMILY "ELAN "
14463 - #elif defined CONFIG_MCRUSOE
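The module.h hunk above simply extends a compile-time #elif dispatch that picks exactly one processor-family string, depending on which CONFIG_* option Kconfig selected. A self-contained illustration of that pattern, using a hypothetical hard-coded CONFIG_MCOREI7 define in place of a real Kconfig-generated one, is:

    #include <stdio.h>

    #define CONFIG_MCOREI7 1	/* stand-in for a Kconfig-selected option */

    /* Same dispatch pattern as above: exactly one branch of the chain
     * defines MODULE_PROC_FAMILY, according to which CONFIG_* is set. */
    #if defined(CONFIG_MNATIVE)
    #define MODULE_PROC_FAMILY "NATIVE "
    #elif defined(CONFIG_MCOREI7)
    #define MODULE_PROC_FAMILY "COREI7 "
    #else
    #define MODULE_PROC_FAMILY "GENERIC "
    #endif

    int main(void)
    {
    	printf("MODULE_PROC_FAMILY = %s\n", MODULE_PROC_FAMILY);
    	return 0;
    }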
14464 -diff -uprN a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
14465 ---- a/arch/x86/Kconfig.cpu 2013-11-03 18:41:51.000000000 -0500
14466 -+++ b/arch/x86/Kconfig.cpu 2013-12-15 06:21:24.351122516 -0500
14467 -@@ -139,7 +139,7 @@ config MPENTIUM4
14468 -
14469 -
14470 - config MK6
14471 -- bool "K6/K6-II/K6-III"
14472 -+ bool "AMD K6/K6-II/K6-III"
14473 - depends on X86_32
14474 - ---help---
14475 - Select this for an AMD K6-family processor. Enables use of
14476 -@@ -147,7 +147,7 @@ config MK6
14477 - flags to GCC.
14478 -
14479 - config MK7
14480 -- bool "Athlon/Duron/K7"
14481 -+ bool "AMD Athlon/Duron/K7"
14482 - depends on X86_32
14483 - ---help---
14484 - Select this for an AMD Athlon K7-family processor. Enables use of
14485 -@@ -155,12 +155,55 @@ config MK7
14486 - flags to GCC.
14487 -
14488 - config MK8
14489 -- bool "Opteron/Athlon64/Hammer/K8"
14490 -+ bool "AMD Opteron/Athlon64/Hammer/K8"
14491 - ---help---
14492 - Select this for an AMD Opteron or Athlon64 Hammer-family processor.
14493 - Enables use of some extended instructions, and passes appropriate
14494 - optimization flags to GCC.
14495 -
14496 -+config MK10
14497 -+ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
14498 -+ ---help---
14499 -+ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
14500 -+ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
14501 -+ Enables use of some extended instructions, and passes appropriate
14502 -+ optimization flags to GCC.
14503 -+
14504 -+config MBARCELONA
14505 -+ bool "AMD Barcelona"
14506 -+ ---help---
14507 -+ Select this for AMD Barcelona and newer processors.
14508 -+
14509 -+ Enables -march=barcelona
14510 -+
14511 -+config MBOBCAT
14512 -+ bool "AMD Bobcat"
14513 -+ ---help---
14514 -+ Select this for AMD Bobcat processors.
14515 -+
14516 -+ Enables -march=btver1
14517 -+
14518 -+config MBULLDOZER
14519 -+ bool "AMD Bulldozer"
14520 -+ ---help---
14521 -+ Select this for AMD Bulldozer processors.
14522 -+
14523 -+ Enables -march=bdver1
14524 -+
14525 -+config MPILEDRIVER
14526 -+ bool "AMD Piledriver"
14527 -+ ---help---
14528 -+ Select this for AMD Piledriver processors.
14529 -+
14530 -+ Enables -march=bdver2
14531 -+
14532 -+config MJAGUAR
14533 -+ bool "AMD Jaguar"
14534 -+ ---help---
14535 -+ Select this for AMD Jaguar processors.
14536 -+
14537 -+ Enables -march=btver2
14538 -+
14539 - config MCRUSOE
14540 - bool "Crusoe"
14541 - depends on X86_32
14542 -@@ -251,8 +294,17 @@ config MPSC
14543 - using the cpu family field
14544 - in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
14545 -
14546 -+config MATOM
14547 -+ bool "Intel Atom"
14548 -+ ---help---
14549 -+
14550 -+ Select this for the Intel Atom platform. Intel Atom CPUs have an
14551 -+ in-order pipelining architecture and thus can benefit from
14552 -+ accordingly optimized code. Use a recent GCC with specific Atom
14553 -+ support in order to fully benefit from selecting this option.
14554 -+
14555 - config MCORE2
14556 -- bool "Core 2/newer Xeon"
14557 -+ bool "Intel Core 2"
14558 - ---help---
14559 -
14560 - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
14561 -@@ -260,14 +312,40 @@ config MCORE2
14562 - family in /proc/cpuinfo. Newer ones have 6 and older ones 15
14563 - (not a typo)
14564 -
14565 --config MATOM
14566 -- bool "Intel Atom"
14567 -+ Enables -march=core2
14568 -+
14569 -+config MCOREI7
14570 -+ bool "Intel Core i7"
14571 - ---help---
14572 -
14573 -- Select this for the Intel Atom platform. Intel Atom CPUs have an
14574 -- in-order pipelining architecture and thus can benefit from
14575 -- accordingly optimized code. Use a recent GCC with specific Atom
14576 -- support in order to fully benefit from selecting this option.
14577 -+	Select this for the Intel Nehalem platform. Intel Nehalem processors
14578 -+ include Core i3, i5, i7, Xeon: 34xx, 35xx, 55xx, 56xx, 75xx processors.
14579 -+
14580 -+ Enables -march=corei7
14581 -+
14582 -+config MCOREI7AVX
14583 -+ bool "Intel Core 2nd Gen AVX"
14584 -+ ---help---
14585 -+
14586 -+ Select this for 2nd Gen Core processors including Sandy Bridge.
14587 -+
14588 -+ Enables -march=corei7-avx
14589 -+
14590 -+config MCOREAVXI
14591 -+ bool "Intel Core 3rd Gen AVX"
14592 -+ ---help---
14593 -+
14594 -+ Select this for 3rd Gen Core processors including Ivy Bridge.
14595 -+
14596 -+ Enables -march=core-avx-i
14597 -+
14598 -+config MCOREAVX2
14599 -+ bool "Intel Core AVX2"
14600 -+ ---help---
14601 -+
14602 -+ Select this for AVX2 enabled processors including Haswell.
14603 -+
14604 -+ Enables -march=core-avx2
14605 -
14606 - config GENERIC_CPU
14607 - bool "Generic-x86-64"
14608 -@@ -276,6 +354,19 @@ config GENERIC_CPU
14609 - Generic x86-64 CPU.
14610 - Run equally well on all x86-64 CPUs.
14611 -
14612 -+config MNATIVE
14613 -+ bool "Native optimizations autodetected by GCC"
14614 -+ ---help---
14615 -+
14616 -+ GCC 4.2 and above support -march=native, which automatically detects
14617 -+ the optimum settings to use based on your processor. -march=native
14618 -+ also detects and applies additional settings beyond -march specific
14619 -+	to your CPU (e.g. -msse4). Unless you have a specific reason not to
14620 -+ (e.g. distcc cross-compiling), you should probably be using
14621 -+ -march=native rather than anything listed below.
14622 -+
14623 -+ Enables -march=native
14624 -+
14625 - endchoice
14626 -
14627 - config X86_GENERIC
14628 -@@ -300,7 +391,7 @@ config X86_INTERNODE_CACHE_SHIFT
14629 - config X86_L1_CACHE_SHIFT
14630 - int
14631 - default "7" if MPENTIUM4 || MPSC
14632 -- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
14633 -+ default "6" if MK7 || MK8 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MPENTIUMM || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2 || MATOM || MVIAC7 || X86_GENERIC || MNATIVE || GENERIC_CPU
14634 - default "4" if MELAN || M486 || MGEODEGX1
14635 - default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
14636 -
14637 -@@ -331,11 +422,11 @@ config X86_ALIGNMENT_16
14638 -
14639 - config X86_INTEL_USERCOPY
14640 - def_bool y
14641 -- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
14642 -+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || MNATIVE || X86_GENERIC || MK8 || MK7 || MK10 || MBARCELONA || MEFFICEON || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2
14643 -
14644 - config X86_USE_PPRO_CHECKSUM
14645 - def_bool y
14646 -- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
14647 -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2 || MATOM || MNATIVE
14648 -
14649 - config X86_USE_3DNOW
14650 - def_bool y
14651 -@@ -363,17 +454,17 @@ config X86_P6_NOP
14652 -
14653 - config X86_TSC
14654 - def_bool y
14655 -- depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64
14656 -+	depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MCOREI7 || MCOREI7AVX || MATOM) && !X86_NUMAQ) || X86_64 || MNATIVE
14657 -
14658 - config X86_CMPXCHG64
14659 - def_bool y
14660 -- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
14661 -+ depends on X86_PAE || X86_64 || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE
14662 -
14663 - # this should be set for all -march=.. options where the compiler
14664 - # generates cmov.
14665 - config X86_CMOV
14666 - def_bool y
14667 -- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
14668 -+ depends on (MK8 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MK7 || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX)
14669 -
14670 - config X86_MINIMUM_CPU_FAMILY
14671 - int
14672 -diff -uprN a/arch/x86/Makefile b/arch/x86/Makefile
14673 ---- a/arch/x86/Makefile 2013-11-03 18:41:51.000000000 -0500
14674 -+++ b/arch/x86/Makefile 2013-12-15 06:21:24.354455723 -0500
14675 -@@ -61,11 +61,26 @@ else
14676 - KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3)
14677 -
14678 - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
14679 -+ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
14680 - cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
14681 -+ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10)
14682 -+ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona)
14683 -+ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1)
14684 -+ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1)
14685 -+ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2)
14686 -+ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2)
14687 - cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
14688 -
14689 - cflags-$(CONFIG_MCORE2) += \
14690 -- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
14691 -+ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2))
14692 -+ cflags-$(CONFIG_MCOREI7) += \
14693 -+ $(call cc-option,-march=corei7,$(call cc-option,-mtune=corei7))
14694 -+ cflags-$(CONFIG_MCOREI7AVX) += \
14695 -+ $(call cc-option,-march=corei7-avx,$(call cc-option,-mtune=corei7-avx))
14696 -+ cflags-$(CONFIG_MCOREAVXI) += \
14697 -+ $(call cc-option,-march=core-avx-i,$(call cc-option,-mtune=core-avx-i))
14698 -+ cflags-$(CONFIG_MCOREAVX2) += \
14699 -+ $(call cc-option,-march=core-avx2,$(call cc-option,-mtune=core-avx2))
14700 - cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
14701 - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
14702 - cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
14703 -diff -uprN a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
14704 ---- a/arch/x86/Makefile_32.cpu 2013-11-03 18:41:51.000000000 -0500
14705 -+++ b/arch/x86/Makefile_32.cpu 2013-12-15 06:21:24.354455723 -0500
14706 -@@ -23,7 +23,14 @@ cflags-$(CONFIG_MK6) += -march=k6
14707 - # Please note that patches that add -march=athlon-xp and friends are pointless.
14708 - # They make zero difference whatsoever to performance at this time.
14709 - cflags-$(CONFIG_MK7) += -march=athlon
14710 -+cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
14711 - cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
14712 -+cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon)
14713 -+cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon)
14714 -+cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon)
14715 -+cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon)
14716 -+cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon)
14717 -+cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon)
14718 - cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
14719 - cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
14720 - cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
14721 -@@ -32,6 +39,10 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-
14722 - cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
14723 - cflags-$(CONFIG_MVIAC7) += -march=i686
14724 - cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
14725 -+cflags-$(CONFIG_MCOREI7) += -march=i686 $(call tune,corei7)
14726 -+cflags-$(CONFIG_MCOREI7AVX) += -march=i686 $(call tune,corei7-avx)
14727 -+cflags-$(CONFIG_MCOREAVXI) += -march=i686 $(call tune,core-avx-i)
14728 -+cflags-$(CONFIG_MCOREAVX2) += -march=i686 $(call tune,core-avx2)
14729 - cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
14730 - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
14731 -
14732 -