Gentoo Archives: gentoo-commits

From: "Tom Wijsman (tomwij)" <tomwij@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2565 - genpatches-2.6/trunk/3.12
Date: Mon, 04 Nov 2013 10:09:40
Message-Id: 20131104100931.E34C52004B@flycatcher.gentoo.org
1 Author: tomwij
2 Date: 2013-11-04 10:09:31 +0000 (Mon, 04 Nov 2013)
3 New Revision: 2565
4
5 Added:
6 genpatches-2.6/trunk/3.12/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
7 genpatches-2.6/trunk/3.12/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
8 genpatches-2.6/trunk/3.12/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
9 genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
10 Removed:
11 genpatches-2.6/trunk/3.12/5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
12 genpatches-2.6/trunk/3.12/5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
13 genpatches-2.6/trunk/3.12/5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
14 Modified:
15 genpatches-2.6/trunk/3.12/0000_README
16 genpatches-2.6/trunk/3.12/2700_ThinkPad-30-brightness-control-fix.patch
17 genpatches-2.6/trunk/3.12/4500_support-for-pogoplug-e02.patch
18 genpatches-2.6/trunk/3.12/4567_distro-Gentoo-Kconfig.patch
19 Log:
20 Updated patches to work with 3.12, updated Kconfig to remove HOTPLUG for systemd.
21
22 Modified: genpatches-2.6/trunk/3.12/0000_README
23 ===================================================================
24 --- genpatches-2.6/trunk/3.12/0000_README 2013-11-04 00:52:35 UTC (rev 2564)
25 +++ genpatches-2.6/trunk/3.12/0000_README 2013-11-04 10:09:31 UTC (rev 2565)
26 @@ -82,14 +82,18 @@
27 From: Tom Wijsman <TomWij@g.o>
28 Desc: Add Gentoo Linux support config settings and defaults.
29
30 -Patch: 5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
31 +Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
32 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
33 Desc: BFQ v6r2 patch 1 for 3.11: Build, cgroups and kconfig bits
34
35 -Patch: 5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
36 +Patch: 5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
37 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
38 Desc: BFQ v6r2 patch 2 for 3.10: BFQ Scheduler
39
40 -Patch: 5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
41 +Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
42 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
43 Desc: BFQ v6r2 patch 3 for 3.10: Early Queue Merge (EQM)
44 +
45 +Patch: 5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
46 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
47 +Desc: BFQ v6r2 for 3.11.0 to BFQ v6r2 for 3.12.0.
48
49 Modified: genpatches-2.6/trunk/3.12/2700_ThinkPad-30-brightness-control-fix.patch
50 ===================================================================
51 --- genpatches-2.6/trunk/3.12/2700_ThinkPad-30-brightness-control-fix.patch 2013-11-04 00:52:35 UTC (rev 2564)
52 +++ genpatches-2.6/trunk/3.12/2700_ThinkPad-30-brightness-control-fix.patch 2013-11-04 10:09:31 UTC (rev 2565)
53 @@ -2,20 +2,6 @@
54 index cb96296..6c242ed 100644
55 --- a/drivers/acpi/blacklist.c
56 +++ b/drivers/acpi/blacklist.c
57 -@@ -193,6 +193,13 @@ static int __init dmi_disable_osi_win7(const struct dmi_system_id *d)
58 - return 0;
59 - }
60 -
61 -+static int __init dmi_disable_osi_win8(const struct dmi_system_id *d)
62 -+{
63 -+ printk(KERN_NOTICE PREFIX "DMI detected: %s\n", d->ident);
64 -+ acpi_osi_setup("!Windows 2012");
65 -+ return 0;
66 -+}
67 -+
68 - static struct dmi_system_id acpi_osi_dmi_table[] __initdata = {
69 - {
70 - .callback = dmi_disable_osi_vista,
71 @@ -269,6 +276,61 @@ static struct dmi_system_id acpi_osi_dmi_table[] __initdata = {
72 },
73
74
75 Modified: genpatches-2.6/trunk/3.12/4500_support-for-pogoplug-e02.patch
76 ===================================================================
77 --- genpatches-2.6/trunk/3.12/4500_support-for-pogoplug-e02.patch 2013-11-04 00:52:35 UTC (rev 2564)
78 +++ genpatches-2.6/trunk/3.12/4500_support-for-pogoplug-e02.patch 2013-11-04 10:09:31 UTC (rev 2565)
79 @@ -3,9 +3,9 @@
80 --- a/arch/arm/configs/kirkwood_defconfig
81 +++ b/arch/arm/configs/kirkwood_defconfig
82 @@ -20,6 +20,7 @@ CONFIG_MACH_NET2BIG_V2=y
83 + CONFIG_MACH_D2NET_V2=y
84 + CONFIG_MACH_NET2BIG_V2=y
85 CONFIG_MACH_NET5BIG_V2=y
86 - CONFIG_MACH_NETSPACE_MAX_V2=y
87 - CONFIG_MACH_NETSPACE_V2=y
88 +CONFIG_MACH_POGO_E02=n
89 CONFIG_MACH_OPENRD_BASE=y
90 CONFIG_MACH_OPENRD_CLIENT=y
91 @@ -35,13 +35,13 @@
92 --- a/arch/arm/mach-kirkwood/Makefile
93 +++ b/arch/arm/mach-kirkwood/Makefile
94 @@ -2,6 +2,7 @@ obj-y += common.o irq.o pcie.o mpp.o
95 -
96 obj-$(CONFIG_MACH_D2NET_V2) += d2net_v2-setup.o lacie_v2-common.o
97 - obj-$(CONFIG_MACH_DOCKSTAR) += dockstar-setup.o
98 + obj-$(CONFIG_MACH_NET2BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o
99 + obj-$(CONFIG_MACH_NET5BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o
100 +obj-$(CONFIG_MACH_POGO_E02) += pogo_e02-setup.o
101 - obj-$(CONFIG_MACH_ESATA_SHEEVAPLUG) += sheevaplug-setup.o
102 - obj-$(CONFIG_MACH_GURUPLUG) += guruplug-setup.o
103 - obj-$(CONFIG_MACH_INETSPACE_V2) += netspace_v2-setup.o lacie_v2-common.o
104 + obj-$(CONFIG_MACH_OPENRD) += openrd-setup.o
105 + obj-$(CONFIG_MACH_RD88F6192_NAS) += rd88f6192-nas-setup.o
106 + obj-$(CONFIG_MACH_RD88F6281) += rd88f6281-setup.o
107 diff --git a/arch/arm/mach-kirkwood/pogo_e02-setup.c b/arch/arm/mach-kirkwood/pogo_e02-setup.c
108 new file mode 100644
109 index 0000000..f57e8f7
110
111 Modified: genpatches-2.6/trunk/3.12/4567_distro-Gentoo-Kconfig.patch
112 ===================================================================
113 --- genpatches-2.6/trunk/3.12/4567_distro-Gentoo-Kconfig.patch 2013-11-04 00:52:35 UTC (rev 2564)
114 +++ genpatches-2.6/trunk/3.12/4567_distro-Gentoo-Kconfig.patch 2013-11-04 10:09:31 UTC (rev 2565)
115 @@ -9,7 +9,7 @@
116 source "arch/$SRCARCH/Kconfig"
117 --- /dev/null
118 +++ b/distro/Kconfig
119 -@@ -0,0 +1,109 @@
120 +@@ -0,0 +1,107 @@
121 +menu "Gentoo Linux"
122 +
123 +config GENTOO_LINUX
124 @@ -35,7 +35,6 @@
125 + select TMPFS
126 +
127 + select MMU
128 -+ select HOTPLUG
129 + select SHMEM
130 +
131 + help
132 @@ -91,7 +90,6 @@
133 + select EPOLL
134 + select FANOTIFY
135 + select FHANDLE
136 -+ select HOTPLUG
137 + select INOTIFY_USER
138 + select NET
139 + select PROC_FS
140
141 Deleted: genpatches-2.6/trunk/3.12/5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
142 ===================================================================
143 --- genpatches-2.6/trunk/3.12/5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2013-11-04 00:52:35 UTC (rev 2564)
144 +++ genpatches-2.6/trunk/3.12/5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2013-11-04 10:09:31 UTC (rev 2565)
145 @@ -1,97 +0,0 @@
146 -From 3728677b4d3cd39d83be87f9939328201b871c48 Mon Sep 17 00:00:00 2001
147 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
148 -Date: Tue, 3 Sep 2013 16:50:42 +0200
149 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.11
150 -
151 -Update Kconfig.iosched and do the related Makefile changes to include
152 -kernel configuration options for BFQ. Also add the bfqio controller
153 -to the cgroups subsystem.
154 -
155 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
156 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
157 ----
158 - block/Kconfig.iosched | 25 +++++++++++++++++++++++++
159 - block/Makefile | 1 +
160 - include/linux/cgroup_subsys.h | 4 ++++
161 - 3 files changed, 30 insertions(+)
162 -
163 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
164 -index 421bef9..695e064 100644
165 ---- a/block/Kconfig.iosched
166 -+++ b/block/Kconfig.iosched
167 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
168 - ---help---
169 - Enable group IO scheduling in CFQ.
170 -
171 -+config IOSCHED_BFQ
172 -+ tristate "BFQ I/O scheduler"
173 -+ default n
174 -+ ---help---
175 -+ The BFQ I/O scheduler tries to distribute bandwidth among
176 -+ all processes according to their weights.
177 -+ It aims at distributing the bandwidth as desired, independently of
178 -+ the disk parameters and with any workload. It also tries to
179 -+ guarantee low latency to interactive and soft real-time
180 -+ applications. If compiled built-in (saying Y here), BFQ can
181 -+ be configured to support hierarchical scheduling.
182 -+
183 -+config CGROUP_BFQIO
184 -+ bool "BFQ hierarchical scheduling support"
185 -+ depends on CGROUPS && IOSCHED_BFQ=y
186 -+ default n
187 -+ ---help---
188 -+ Enable hierarchical scheduling in BFQ, using the cgroups
189 -+ filesystem interface. The name of the subsystem will be
190 -+ bfqio.
191 -+
192 - choice
193 - prompt "Default I/O scheduler"
194 - default DEFAULT_CFQ
195 -@@ -52,6 +73,9 @@ choice
196 - config DEFAULT_CFQ
197 - bool "CFQ" if IOSCHED_CFQ=y
198 -
199 -+ config DEFAULT_BFQ
200 -+ bool "BFQ" if IOSCHED_BFQ=y
201 -+
202 - config DEFAULT_NOOP
203 - bool "No-op"
204 -
205 -@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED
206 - string
207 - default "deadline" if DEFAULT_DEADLINE
208 - default "cfq" if DEFAULT_CFQ
209 -+ default "bfq" if DEFAULT_BFQ
210 - default "noop" if DEFAULT_NOOP
211 -
212 - endmenu
213 -diff --git a/block/Makefile b/block/Makefile
214 -index 39b76ba..c0d20fa 100644
215 ---- a/block/Makefile
216 -+++ b/block/Makefile
217 -@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
218 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
219 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
220 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
221 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
222 -
223 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
224 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
225 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
226 -index b613ffd..43c5dc9 100644
227 ---- a/include/linux/cgroup_subsys.h
228 -+++ b/include/linux/cgroup_subsys.h
229 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
230 - SUBSYS(blkio)
231 - #endif
232 -
233 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
234 -+SUBSYS(bfqio)
235 -+#endif
236 -+
237 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
238 - SUBSYS(perf)
239 - #endif
240 ---
241 -1.8.1.4
242 -
243
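For illustration only (not part of this commit): a minimal user-space sketch of how the bfqio controller added by the patch above is driven once CONFIG_CGROUP_BFQIO is enabled. Each group in the bfqio cgroup hierarchy exposes weight, ioprio and ioprio_class attributes, defined by the cftype table in the scheduler patch that follows; with the usual cgroup naming they appear as bfqio.weight and so on. The mount point, group name and weight value below are assumptions, not something this commit establishes.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Write a weight into <cgroup_dir>/bfqio.weight; returns 0 or a -errno value. */
static int bfqio_set_weight(const char *cgroup_dir, unsigned int weight)
{
        char path[512];
        FILE *f;

        snprintf(path, sizeof(path), "%s/bfqio.weight", cgroup_dir);

        f = fopen(path, "w");
        if (f == NULL)
                return -errno;

        /* The kernel side rejects values outside BFQ_MIN_WEIGHT..BFQ_MAX_WEIGHT. */
        if (fprintf(f, "%u\n", weight) < 0) {
                fclose(f);
                return -EIO;
        }

        return fclose(f) == 0 ? 0 : -errno;
}

int main(void)
{
        /* Hypothetical mount point, group and weight; adjust to the real bfqio hierarchy. */
        int ret = bfqio_set_weight("/sys/fs/cgroup/bfqio/example", 500);

        if (ret)
                fprintf(stderr, "could not set bfqio.weight: %s\n", strerror(-ret));
        return ret ? 1 : 0;
}
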
244 Deleted: genpatches-2.6/trunk/3.12/5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
245 ===================================================================
246 --- genpatches-2.6/trunk/3.12/5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2013-11-04 00:52:35 UTC (rev 2564)
247 +++ genpatches-2.6/trunk/3.12/5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2013-11-04 10:09:31 UTC (rev 2565)
248 @@ -1,5773 +0,0 @@
249 -From 009b78bafe1763f71e6bdbb4f536b564a73b7db5 Mon Sep 17 00:00:00 2001
250 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
251 -Date: Thu, 9 May 2013 19:10:02 +0200
252 -Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.11
253 -
254 -Add the BFQ-v6r2 I/O scheduler to 3.11.
255 -The general structure is borrowed from CFQ, as much code. A (bfq_)queue
256 -is associated to each task doing I/O on a device, and each time a
257 -scheduling decision has to be made a queue is selected and served until
258 -it expires.
259 -
260 - - Slices are given in the service domain: tasks are assigned
261 - budgets, measured in number of sectors. Once got the disk, a task
262 - must however consume its assigned budget within a configurable
263 - maximum time (by default, the maximum possible value of the
264 - budgets is automatically computed to comply with this timeout).
265 - This allows the desired latency vs "throughput boosting" tradeoff
266 - to be set.
267 -
268 - - Budgets are scheduled according to a variant of WF2Q+, implemented
269 - using an augmented rb-tree to take eligibility into account while
270 - preserving an O(log N) overall complexity.
271 -
272 - - A low-latency tunable is provided; if enabled, both interactive
273 - and soft real-time applications are guaranteed very low latency.
274 -
275 - - Latency guarantees are preserved also in presence of NCQ.
276 -
277 - - Also with flash-based devices, a high throughput is achieved while
278 - still preserving latency guarantees.
279 -
280 - - Useful features borrowed from CFQ: cooperating-queues merging (with
281 - some additional optimizations with respect to the original CFQ version),
282 - static fallback queue for OOM.
283 -
284 - - BFQ supports full hierarchical scheduling, exporting a cgroups
285 - interface. Each node has a full scheduler, so each group can
286 - be assigned its own ioprio (mapped to a weight, see next point)
287 - and an ioprio_class.
288 -
289 - - If the cgroups interface is used, weights can be explicitly
290 - assigned, otherwise ioprio values are mapped to weights using the
291 - relation weight = IOPRIO_BE_NR - ioprio.
292 -
293 - - ioprio classes are served in strict priority order, i.e., lower
294 - priority queues are not served as long as there are higher
295 - priority queues. Among queues in the same class the bandwidth is
296 - distributed in proportion to the weight of each queue. A very
297 - thin extra bandwidth is however guaranteed to the Idle class, to
298 - prevent it from starving.
299 -
300 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
301 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
302 ----
303 - block/bfq-cgroup.c | 881 +++++++++++++++
304 - block/bfq-ioc.c | 36 +
305 - block/bfq-iosched.c | 3082 +++++++++++++++++++++++++++++++++++++++++++++++++++
306 - block/bfq-sched.c | 1072 ++++++++++++++++++
307 - block/bfq.h | 603 ++++++++++
308 - 5 files changed, 5674 insertions(+)
309 - create mode 100644 block/bfq-cgroup.c
310 - create mode 100644 block/bfq-ioc.c
311 - create mode 100644 block/bfq-iosched.c
312 - create mode 100644 block/bfq-sched.c
313 - create mode 100644 block/bfq.h
314 -
315 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
316 -new file mode 100644
317 -index 0000000..bb9b851
318 ---- /dev/null
319 -+++ b/block/bfq-cgroup.c
320 -@@ -0,0 +1,881 @@
321 -+/*
322 -+ * BFQ: CGROUPS support.
323 -+ *
324 -+ * Based on ideas and code from CFQ:
325 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
326 -+ *
327 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
328 -+ * Paolo Valente <paolo.valente@×××××××.it>
329 -+ *
330 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
331 -+ *
332 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
333 -+ */
334 -+
335 -+#ifdef CONFIG_CGROUP_BFQIO
336 -+
337 -+static DEFINE_MUTEX(bfqio_mutex);
338 -+
339 -+static bool bfqio_is_removed(struct cgroup *cgroup)
340 -+{
341 -+ return test_bit(CGRP_DEAD, &cgroup->flags);
342 -+}
343 -+
344 -+static struct bfqio_cgroup bfqio_root_cgroup = {
345 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
346 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
347 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
348 -+};
349 -+
350 -+static inline void bfq_init_entity(struct bfq_entity *entity,
351 -+ struct bfq_group *bfqg)
352 -+{
353 -+ entity->weight = entity->new_weight;
354 -+ entity->orig_weight = entity->new_weight;
355 -+ entity->ioprio = entity->new_ioprio;
356 -+ entity->ioprio_class = entity->new_ioprio_class;
357 -+ entity->parent = bfqg->my_entity;
358 -+ entity->sched_data = &bfqg->sched_data;
359 -+}
360 -+
361 -+static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
362 -+{
363 -+ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
364 -+ struct bfqio_cgroup, css);
365 -+}
366 -+
367 -+/*
368 -+ * Search the bfq_group for bfqd into the hash table (by now only a list)
369 -+ * of bgrp. Must be called under rcu_read_lock().
370 -+ */
371 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
372 -+ struct bfq_data *bfqd)
373 -+{
374 -+ struct bfq_group *bfqg;
375 -+ void *key;
376 -+
377 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
378 -+ key = rcu_dereference(bfqg->bfqd);
379 -+ if (key == bfqd)
380 -+ return bfqg;
381 -+ }
382 -+
383 -+ return NULL;
384 -+}
385 -+
386 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
387 -+ struct bfq_group *bfqg)
388 -+{
389 -+ struct bfq_entity *entity = &bfqg->entity;
390 -+
391 -+ /*
392 -+ * If the weight of the entity has never been set via the sysfs
393 -+ * interface, then bgrp->weight == 0. In this case we initialize
394 -+ * the weight from the current ioprio value. Otherwise, the group
395 -+ * weight, if set, has priority over the ioprio value.
396 -+ */
397 -+ if (bgrp->weight == 0) {
398 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
399 -+ entity->new_ioprio = bgrp->ioprio;
400 -+ } else {
401 -+ entity->new_weight = bgrp->weight;
402 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
403 -+ }
404 -+ entity->orig_weight = entity->weight = entity->new_weight;
405 -+ entity->ioprio = entity->new_ioprio;
406 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
407 -+ entity->my_sched_data = &bfqg->sched_data;
408 -+}
409 -+
410 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
411 -+ struct bfq_group *parent)
412 -+{
413 -+ struct bfq_entity *entity;
414 -+
415 -+ BUG_ON(parent == NULL);
416 -+ BUG_ON(bfqg == NULL);
417 -+
418 -+ entity = &bfqg->entity;
419 -+ entity->parent = parent->my_entity;
420 -+ entity->sched_data = &parent->sched_data;
421 -+}
422 -+
423 -+/**
424 -+ * bfq_group_chain_alloc - allocate a chain of groups.
425 -+ * @bfqd: queue descriptor.
426 -+ * @cgroup: the leaf cgroup this chain starts from.
427 -+ *
428 -+ * Allocate a chain of groups starting from the one belonging to
429 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
430 -+ * to the root has already an allocated group on @bfqd.
431 -+ */
432 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
433 -+ struct cgroup *cgroup)
434 -+{
435 -+ struct bfqio_cgroup *bgrp;
436 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
437 -+
438 -+ for (; cgroup != NULL; cgroup = cgroup->parent) {
439 -+ bgrp = cgroup_to_bfqio(cgroup);
440 -+
441 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
442 -+ if (bfqg != NULL) {
443 -+ /*
444 -+ * All the cgroups in the path from there to the
445 -+ * root must have a bfq_group for bfqd, so we don't
446 -+ * need any more allocations.
447 -+ */
448 -+ break;
449 -+ }
450 -+
451 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
452 -+ if (bfqg == NULL)
453 -+ goto cleanup;
454 -+
455 -+ bfq_group_init_entity(bgrp, bfqg);
456 -+ bfqg->my_entity = &bfqg->entity;
457 -+
458 -+ if (leaf == NULL) {
459 -+ leaf = bfqg;
460 -+ prev = leaf;
461 -+ } else {
462 -+ bfq_group_set_parent(prev, bfqg);
463 -+ /*
464 -+ * Build a list of allocated nodes using the bfqd
465 -+ * field, that is still unused and will be initialized
466 -+ * only after the node will be connected.
467 -+ */
468 -+ prev->bfqd = bfqg;
469 -+ prev = bfqg;
470 -+ }
471 -+ }
472 -+
473 -+ return leaf;
474 -+
475 -+cleanup:
476 -+ while (leaf != NULL) {
477 -+ prev = leaf;
478 -+ leaf = leaf->bfqd;
479 -+ kfree(prev);
480 -+ }
481 -+
482 -+ return NULL;
483 -+}
484 -+
485 -+/**
486 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
487 -+ * @bfqd: the queue descriptor.
488 -+ * @cgroup: the leaf cgroup to start from.
489 -+ * @leaf: the leaf group (to be associated to @cgroup).
490 -+ *
491 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
492 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
493 -+ * hierarchy that already has a group associated to @bfqd all the nodes
494 -+ * in the path to the root cgroup have one too.
495 -+ *
496 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
497 -+ * per device) while the bfqio_cgroup lock protects the list of groups
498 -+ * belonging to the same cgroup.
499 -+ */
500 -+static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
501 -+ struct bfq_group *leaf)
502 -+{
503 -+ struct bfqio_cgroup *bgrp;
504 -+ struct bfq_group *bfqg, *next, *prev = NULL;
505 -+ unsigned long flags;
506 -+
507 -+ assert_spin_locked(bfqd->queue->queue_lock);
508 -+
509 -+ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
510 -+ bgrp = cgroup_to_bfqio(cgroup);
511 -+ next = leaf->bfqd;
512 -+
513 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
514 -+ BUG_ON(bfqg != NULL);
515 -+
516 -+ spin_lock_irqsave(&bgrp->lock, flags);
517 -+
518 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
519 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
520 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
521 -+
522 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
523 -+
524 -+ prev = leaf;
525 -+ leaf = next;
526 -+ }
527 -+
528 -+ BUG_ON(cgroup == NULL && leaf != NULL);
529 -+ if (cgroup != NULL && prev != NULL) {
530 -+ bgrp = cgroup_to_bfqio(cgroup);
531 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
532 -+ bfq_group_set_parent(prev, bfqg);
533 -+ }
534 -+}
535 -+
536 -+/**
537 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
538 -+ * @bfqd: queue descriptor.
539 -+ * @cgroup: cgroup being searched for.
540 -+ *
541 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
542 -+ * necessary. When a group is returned all the cgroups in the path
543 -+ * to the root have a group associated to @bfqd.
544 -+ *
545 -+ * If the allocation fails, return the root group: this breaks guarantees
546 -+ * but is a safe fallback. If this loss becomes a problem it can be
547 -+ * mitigated using the equivalent weight (given by the product of the
548 -+ * weights of the groups in the path from @group to the root) in the
549 -+ * root scheduler.
550 -+ *
551 -+ * We allocate all the missing nodes in the path from the leaf cgroup
552 -+ * to the root and we connect the nodes only after all the allocations
553 -+ * have been successful.
554 -+ */
555 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
556 -+ struct cgroup *cgroup)
557 -+{
558 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
559 -+ struct bfq_group *bfqg;
560 -+
561 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
562 -+ if (bfqg != NULL)
563 -+ return bfqg;
564 -+
565 -+ bfqg = bfq_group_chain_alloc(bfqd, cgroup);
566 -+ if (bfqg != NULL)
567 -+ bfq_group_chain_link(bfqd, cgroup, bfqg);
568 -+ else
569 -+ bfqg = bfqd->root_group;
570 -+
571 -+ return bfqg;
572 -+}
573 -+
574 -+/**
575 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
576 -+ * @bfqd: queue descriptor.
577 -+ * @bfqq: the queue to move.
578 -+ * @entity: @bfqq's entity.
579 -+ * @bfqg: the group to move to.
580 -+ *
581 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
582 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
583 -+ *
584 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
585 -+ * not disappear (by now this just means that we are called under
586 -+ * rcu_read_lock()).
587 -+ */
588 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
589 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
590 -+{
591 -+ int busy, resume;
592 -+
593 -+ busy = bfq_bfqq_busy(bfqq);
594 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
595 -+
596 -+ BUG_ON(resume && !entity->on_st);
597 -+ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
598 -+
599 -+ if (busy) {
600 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
601 -+
602 -+ if (!resume)
603 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
604 -+ else
605 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
606 -+ } else if (entity->on_st)
607 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
608 -+
609 -+ /*
610 -+ * Here we use a reference to bfqg. We don't need a refcounter
611 -+ * as the cgroup reference will not be dropped, so that its
612 -+ * destroy() callback will not be invoked.
613 -+ */
614 -+ entity->parent = bfqg->my_entity;
615 -+ entity->sched_data = &bfqg->sched_data;
616 -+
617 -+ if (busy && resume)
618 -+ bfq_activate_bfqq(bfqd, bfqq);
619 -+
620 -+ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver)
621 -+ bfq_schedule_dispatch(bfqd);
622 -+}
623 -+
624 -+/**
625 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
626 -+ * @bfqd: the queue descriptor.
627 -+ * @bic: the bic to move.
628 -+ * @cgroup: the cgroup to move to.
629 -+ *
630 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
631 -+ * has to make sure that the reference to cgroup is valid across the call.
632 -+ *
633 -+ * NOTE: an alternative approach might have been to store the current
634 -+ * cgroup in bfqq and getting a reference to it, reducing the lookup
635 -+ * time here, at the price of slightly more complex code.
636 -+ */
637 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
638 -+ struct bfq_io_cq *bic,
639 -+ struct cgroup *cgroup)
640 -+{
641 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
642 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
643 -+ struct bfq_entity *entity;
644 -+ struct bfq_group *bfqg;
645 -+ struct bfqio_cgroup *bgrp;
646 -+
647 -+ bgrp = cgroup_to_bfqio(cgroup);
648 -+
649 -+ bfqg = bfq_find_alloc_group(bfqd, cgroup);
650 -+ if (async_bfqq != NULL) {
651 -+ entity = &async_bfqq->entity;
652 -+
653 -+ if (entity->sched_data != &bfqg->sched_data) {
654 -+ bic_set_bfqq(bic, NULL, 0);
655 -+ bfq_log_bfqq(bfqd, async_bfqq,
656 -+ "bic_change_group: %p %d",
657 -+ async_bfqq, atomic_read(&async_bfqq->ref));
658 -+ bfq_put_queue(async_bfqq);
659 -+ }
660 -+ }
661 -+
662 -+ if (sync_bfqq != NULL) {
663 -+ entity = &sync_bfqq->entity;
664 -+ if (entity->sched_data != &bfqg->sched_data)
665 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
666 -+ }
667 -+
668 -+ return bfqg;
669 -+}
670 -+
671 -+/**
672 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
673 -+ * @bic: the bic being migrated.
674 -+ * @cgroup: the destination cgroup.
675 -+ *
676 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
677 -+ * moved into its new parent group.
678 -+ */
679 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
680 -+ struct cgroup *cgroup)
681 -+{
682 -+ struct bfq_data *bfqd;
683 -+ unsigned long uninitialized_var(flags);
684 -+
685 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
686 -+ if (bfqd != NULL) {
687 -+ __bfq_bic_change_cgroup(bfqd, bic, cgroup);
688 -+ bfq_put_bfqd_unlock(bfqd, &flags);
689 -+ }
690 -+}
691 -+
692 -+/**
693 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
694 -+ * @bic: the @bic to update.
695 -+ *
696 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
697 -+ * We need this in addition to moving bics during the cgroup attach
698 -+ * phase because the task owning @bic could be at its first disk
699 -+ * access or we may end up in the root cgroup as the result of a
700 -+ * memory allocation failure and here we try to move to the right
701 -+ * group.
702 -+ *
703 -+ * Must be called under the queue lock. It is safe to use the returned
704 -+ * value even after the rcu_read_unlock() as the migration/destruction
705 -+ * paths act under the queue lock too. IOW it is impossible to race with
706 -+ * group migration/destruction and end up with an invalid group as:
707 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
708 -+ * has started execution, as current holds a reference to it,
709 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
710 -+ * migrated to a different cgroup] its attach() callback will have
711 -+ * taken care of removing all the references to the old cgroup data.
712 -+ */
713 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
714 -+{
715 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
716 -+ struct bfq_group *bfqg;
717 -+ struct cgroup *cgroup;
718 -+
719 -+ BUG_ON(bfqd == NULL);
720 -+
721 -+ rcu_read_lock();
722 -+ cgroup = task_cgroup(current, bfqio_subsys_id);
723 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
724 -+ rcu_read_unlock();
725 -+
726 -+ return bfqg;
727 -+}
728 -+
729 -+/**
730 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
731 -+ * @st: the service tree being flushed.
732 -+ */
733 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
734 -+{
735 -+ struct bfq_entity *entity = st->first_idle;
736 -+
737 -+ for (; entity != NULL; entity = st->first_idle)
738 -+ __bfq_deactivate_entity(entity, 0);
739 -+}
740 -+
741 -+/**
742 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
743 -+ * @bfqd: the device data structure with the root group.
744 -+ * @entity: the entity to move.
745 -+ */
746 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
747 -+ struct bfq_entity *entity)
748 -+{
749 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
750 -+
751 -+ BUG_ON(bfqq == NULL);
752 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
753 -+ return;
754 -+}
755 -+
756 -+/**
757 -+ * bfq_reparent_active_entities - move to the root group all active entities.
758 -+ * @bfqd: the device data structure with the root group.
759 -+ * @bfqg: the group to move from.
760 -+ * @st: the service tree with the entities.
761 -+ *
762 -+ * Needs queue_lock to be taken and reference to be valid over the call.
763 -+ */
764 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
765 -+ struct bfq_group *bfqg,
766 -+ struct bfq_service_tree *st)
767 -+{
768 -+ struct rb_root *active = &st->active;
769 -+ struct bfq_entity *entity = NULL;
770 -+
771 -+ if (!RB_EMPTY_ROOT(&st->active))
772 -+ entity = bfq_entity_of(rb_first(active));
773 -+
774 -+ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active)))
775 -+ bfq_reparent_leaf_entity(bfqd, entity);
776 -+
777 -+ if (bfqg->sched_data.active_entity != NULL)
778 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
779 -+
780 -+ return;
781 -+}
782 -+
783 -+/**
784 -+ * bfq_destroy_group - destroy @bfqg.
785 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
786 -+ * @bfqg: the group being destroyed.
787 -+ *
788 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
789 -+ */
790 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
791 -+{
792 -+ struct bfq_data *bfqd;
793 -+ struct bfq_service_tree *st;
794 -+ struct bfq_entity *entity = bfqg->my_entity;
795 -+ unsigned long uninitialized_var(flags);
796 -+ int i;
797 -+
798 -+ hlist_del(&bfqg->group_node);
799 -+
800 -+ /*
801 -+ * Empty all service_trees belonging to this group before deactivating
802 -+ * the group itself.
803 -+ */
804 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
805 -+ st = bfqg->sched_data.service_tree + i;
806 -+
807 -+ /*
808 -+ * The idle tree may still contain bfq_queues belonging
809 -+ * to exited task because they never migrated to a different
810 -+ * cgroup from the one being destroyed now. Noone else
811 -+ * can access them so it's safe to act without any lock.
812 -+ */
813 -+ bfq_flush_idle_tree(st);
814 -+
815 -+ /*
816 -+ * It may happen that some queues are still active
817 -+ * (busy) upon group destruction (if the corresponding
818 -+ * processes have been forced to terminate). We move
819 -+ * all the leaf entities corresponding to these queues
820 -+ * to the root_group.
821 -+ * Also, it may happen that the group has an entity
822 -+ * under service, which is disconnected from the active
823 -+ * tree: it must be moved, too.
824 -+ * There is no need to put the sync queues, as the
825 -+ * scheduler has taken no reference.
826 -+ */
827 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
828 -+ if (bfqd != NULL) {
829 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
830 -+ bfq_put_bfqd_unlock(bfqd, &flags);
831 -+ }
832 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
833 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
834 -+ }
835 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
836 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
837 -+
838 -+ /*
839 -+ * We may race with device destruction, take extra care when
840 -+ * dereferencing bfqg->bfqd.
841 -+ */
842 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
843 -+ if (bfqd != NULL) {
844 -+ hlist_del(&bfqg->bfqd_node);
845 -+ __bfq_deactivate_entity(entity, 0);
846 -+ bfq_put_async_queues(bfqd, bfqg);
847 -+ bfq_put_bfqd_unlock(bfqd, &flags);
848 -+ }
849 -+ BUG_ON(entity->tree != NULL);
850 -+
851 -+ /*
852 -+ * No need to defer the kfree() to the end of the RCU grace
853 -+ * period: we are called from the destroy() callback of our
854 -+ * cgroup, so we can be sure that no one is a) still using
855 -+ * this cgroup or b) doing lookups in it.
856 -+ */
857 -+ kfree(bfqg);
858 -+}
859 -+
860 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
861 -+{
862 -+ struct hlist_node *tmp;
863 -+ struct bfq_group *bfqg;
864 -+
865 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
866 -+ bfq_end_raising_async_queues(bfqd, bfqg);
867 -+}
868 -+
869 -+/**
870 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
871 -+ * @bfqd: the device descriptor being exited.
872 -+ *
873 -+ * When the device exits we just make sure that no lookup can return
874 -+ * the now unused group structures. They will be deallocated on cgroup
875 -+ * destruction.
876 -+ */
877 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
878 -+{
879 -+ struct hlist_node *tmp;
880 -+ struct bfq_group *bfqg;
881 -+
882 -+ bfq_log(bfqd, "disconnect_groups beginning") ;
883 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
884 -+ hlist_del(&bfqg->bfqd_node);
885 -+
886 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
887 -+
888 -+ /*
889 -+ * Don't remove from the group hash, just set an
890 -+ * invalid key. No lookups can race with the
891 -+ * assignment as bfqd is being destroyed; this
892 -+ * implies also that new elements cannot be added
893 -+ * to the list.
894 -+ */
895 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
896 -+
897 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
898 -+ bfqg) ;
899 -+ bfq_put_async_queues(bfqd, bfqg);
900 -+ }
901 -+}
902 -+
903 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
904 -+{
905 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
906 -+ struct bfq_group *bfqg = bfqd->root_group;
907 -+
908 -+ bfq_put_async_queues(bfqd, bfqg);
909 -+
910 -+ spin_lock_irq(&bgrp->lock);
911 -+ hlist_del_rcu(&bfqg->group_node);
912 -+ spin_unlock_irq(&bgrp->lock);
913 -+
914 -+ /*
915 -+ * No need to synchronize_rcu() here: since the device is gone
916 -+ * there cannot be any read-side access to its root_group.
917 -+ */
918 -+ kfree(bfqg);
919 -+}
920 -+
921 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
922 -+{
923 -+ struct bfq_group *bfqg;
924 -+ struct bfqio_cgroup *bgrp;
925 -+ int i;
926 -+
927 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
928 -+ if (bfqg == NULL)
929 -+ return NULL;
930 -+
931 -+ bfqg->entity.parent = NULL;
932 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
933 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
934 -+
935 -+ bgrp = &bfqio_root_cgroup;
936 -+ spin_lock_irq(&bgrp->lock);
937 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
938 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
939 -+ spin_unlock_irq(&bgrp->lock);
940 -+
941 -+ return bfqg;
942 -+}
943 -+
944 -+#define SHOW_FUNCTION(__VAR) \
945 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
946 -+ struct cftype *cftype) \
947 -+{ \
948 -+ struct bfqio_cgroup *bgrp; \
949 -+ u64 ret = -ENODEV; \
950 -+ \
951 -+ mutex_lock(&bfqio_mutex); \
952 -+ if (bfqio_is_removed(cgroup)) \
953 -+ goto out_unlock; \
954 -+ \
955 -+ bgrp = cgroup_to_bfqio(cgroup); \
956 -+ spin_lock_irq(&bgrp->lock); \
957 -+ ret = bgrp->__VAR; \
958 -+ spin_unlock_irq(&bgrp->lock); \
959 -+ \
960 -+out_unlock: \
961 -+ mutex_unlock(&bfqio_mutex); \
962 -+ return ret; \
963 -+}
964 -+
965 -+SHOW_FUNCTION(weight);
966 -+SHOW_FUNCTION(ioprio);
967 -+SHOW_FUNCTION(ioprio_class);
968 -+#undef SHOW_FUNCTION
969 -+
970 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
971 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
972 -+ struct cftype *cftype, \
973 -+ u64 val) \
974 -+{ \
975 -+ struct bfqio_cgroup *bgrp; \
976 -+ struct bfq_group *bfqg; \
977 -+ int ret = -EINVAL; \
978 -+ \
979 -+ if (val < (__MIN) || val > (__MAX)) \
980 -+ return ret; \
981 -+ \
982 -+ ret = -ENODEV; \
983 -+ mutex_lock(&bfqio_mutex); \
984 -+ if (bfqio_is_removed(cgroup)) \
985 -+ goto out_unlock; \
986 -+ ret = 0; \
987 -+ \
988 -+ bgrp = cgroup_to_bfqio(cgroup); \
989 -+ \
990 -+ spin_lock_irq(&bgrp->lock); \
991 -+ bgrp->__VAR = (unsigned short)val; \
992 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
993 -+ /* \
994 -+ * Setting the ioprio_changed flag of the entity \
995 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
996 -+ * the value of the weight to its ioprio mapping. \
997 -+ * Set the flag only if necessary. \
998 -+ */ \
999 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
1000 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
1001 -+ smp_wmb(); \
1002 -+ bfqg->entity.ioprio_changed = 1; \
1003 -+ } \
1004 -+ } \
1005 -+ spin_unlock_irq(&bgrp->lock); \
1006 -+ \
1007 -+out_unlock: \
1008 -+ mutex_unlock(&bfqio_mutex); \
1009 -+ return ret; \
1010 -+}
1011 -+
1012 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
1013 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
1014 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
1015 -+#undef STORE_FUNCTION
1016 -+
1017 -+static struct cftype bfqio_files[] = {
1018 -+ {
1019 -+ .name = "weight",
1020 -+ .read_u64 = bfqio_cgroup_weight_read,
1021 -+ .write_u64 = bfqio_cgroup_weight_write,
1022 -+ },
1023 -+ {
1024 -+ .name = "ioprio",
1025 -+ .read_u64 = bfqio_cgroup_ioprio_read,
1026 -+ .write_u64 = bfqio_cgroup_ioprio_write,
1027 -+ },
1028 -+ {
1029 -+ .name = "ioprio_class",
1030 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
1031 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
1032 -+ },
1033 -+ { }, /* terminate */
1034 -+};
1035 -+
1036 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
1037 -+{
1038 -+ struct bfqio_cgroup *bgrp;
1039 -+
1040 -+ if (cgroup->parent != NULL) {
1041 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
1042 -+ if (bgrp == NULL)
1043 -+ return ERR_PTR(-ENOMEM);
1044 -+ } else
1045 -+ bgrp = &bfqio_root_cgroup;
1046 -+
1047 -+ spin_lock_init(&bgrp->lock);
1048 -+ INIT_HLIST_HEAD(&bgrp->group_data);
1049 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
1050 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
1051 -+
1052 -+ return &bgrp->css;
1053 -+}
1054 -+
1055 -+/*
1056 -+ * We cannot support shared io contexts, as we have no means to support
1057 -+ * two tasks with the same ioc in two different groups without major rework
1058 -+ * of the main bic/bfqq data structures. By now we allow a task to change
1059 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
1060 -+ * behavior is that a group containing a task that forked using CLONE_IO
1061 -+ * will not be destroyed until the tasks sharing the ioc die.
1062 -+ */
1063 -+static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
1064 -+{
1065 -+ struct task_struct *task;
1066 -+ struct io_context *ioc;
1067 -+ int ret = 0;
1068 -+
1069 -+ cgroup_taskset_for_each(task, cgroup, tset) {
1070 -+ /* task_lock() is needed to avoid races with exit_io_context() */
1071 -+ task_lock(task);
1072 -+ ioc = task->io_context;
1073 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1074 -+ /*
1075 -+ * ioc == NULL means that the task is either too young or
1076 -+ * exiting: if it has still no ioc the ioc can't be shared,
1077 -+ * if the task is exiting the attach will fail anyway, no
1078 -+ * matter what we return here.
1079 -+ */
1080 -+ ret = -EINVAL;
1081 -+ task_unlock(task);
1082 -+ if (ret)
1083 -+ break;
1084 -+ }
1085 -+
1086 -+ return ret;
1087 -+}
1088 -+
1089 -+static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
1090 -+{
1091 -+ struct task_struct *task;
1092 -+ struct io_context *ioc;
1093 -+ struct io_cq *icq;
1094 -+
1095 -+ /*
1096 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
1097 -+ * new group has not yet been tested.
1098 -+ */
1099 -+ cgroup_taskset_for_each(task, cgroup, tset) {
1100 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1101 -+ if (ioc) {
1102 -+ /*
1103 -+ * Handle cgroup change here.
1104 -+ */
1105 -+ rcu_read_lock();
1106 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1107 -+ if (!strncmp(icq->q->elevator->type->elevator_name,
1108 -+ "bfq", ELV_NAME_MAX))
1109 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
1110 -+ cgroup);
1111 -+ rcu_read_unlock();
1112 -+ put_io_context(ioc);
1113 -+ }
1114 -+ }
1115 -+}
1116 -+
1117 -+static void bfqio_destroy(struct cgroup *cgroup)
1118 -+{
1119 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
1120 -+ struct hlist_node *tmp;
1121 -+ struct bfq_group *bfqg;
1122 -+
1123 -+ /*
1124 -+ * Since we are destroying the cgroup, there are no more tasks
1125 -+ * referencing it, and all the RCU grace periods that may have
1126 -+ * referenced it are ended (as the destruction of the parent
1127 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1128 -+ * anything else and we don't need any synchronization.
1129 -+ */
1130 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1131 -+ bfq_destroy_group(bgrp, bfqg);
1132 -+
1133 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
1134 -+
1135 -+ kfree(bgrp);
1136 -+}
1137 -+
1138 -+struct cgroup_subsys bfqio_subsys = {
1139 -+ .name = "bfqio",
1140 -+ .css_alloc = bfqio_create,
1141 -+ .can_attach = bfqio_can_attach,
1142 -+ .attach = bfqio_attach,
1143 -+ .css_free = bfqio_destroy,
1144 -+ .subsys_id = bfqio_subsys_id,
1145 -+ .base_cftypes = bfqio_files,
1146 -+};
1147 -+#else
1148 -+static inline void bfq_init_entity(struct bfq_entity *entity,
1149 -+ struct bfq_group *bfqg)
1150 -+{
1151 -+ entity->weight = entity->new_weight;
1152 -+ entity->orig_weight = entity->new_weight;
1153 -+ entity->ioprio = entity->new_ioprio;
1154 -+ entity->ioprio_class = entity->new_ioprio_class;
1155 -+ entity->sched_data = &bfqg->sched_data;
1156 -+}
1157 -+
1158 -+static inline struct bfq_group *
1159 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1160 -+{
1161 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
1162 -+ return bfqd->root_group;
1163 -+}
1164 -+
1165 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1166 -+ struct bfq_queue *bfqq,
1167 -+ struct bfq_entity *entity,
1168 -+ struct bfq_group *bfqg)
1169 -+{
1170 -+}
1171 -+
1172 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
1173 -+{
1174 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1175 -+}
1176 -+
1177 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1178 -+{
1179 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
1180 -+}
1181 -+
1182 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
1183 -+{
1184 -+ kfree(bfqd->root_group);
1185 -+}
1186 -+
1187 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1188 -+{
1189 -+ struct bfq_group *bfqg;
1190 -+ int i;
1191 -+
1192 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1193 -+ if (bfqg == NULL)
1194 -+ return NULL;
1195 -+
1196 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1197 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1198 -+
1199 -+ return bfqg;
1200 -+}
1201 -+#endif
1202 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1203 -new file mode 100644
1204 -index 0000000..326e3ec
1205 ---- /dev/null
1206 -+++ b/block/bfq-ioc.c
1207 -@@ -0,0 +1,36 @@
1208 -+/*
1209 -+ * BFQ: I/O context handling.
1210 -+ *
1211 -+ * Based on ideas and code from CFQ:
1212 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1213 -+ *
1214 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1215 -+ * Paolo Valente <paolo.valente@×××××××.it>
1216 -+ *
1217 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1218 -+ */
1219 -+
1220 -+/**
1221 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1222 -+ * @icq: the iocontext queue.
1223 -+ */
1224 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1225 -+{
1226 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
1227 -+ return container_of(icq, struct bfq_io_cq, icq);
1228 -+}
1229 -+
1230 -+/**
1231 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1232 -+ * @bfqd: the lookup key.
1233 -+ * @ioc: the io_context of the process doing I/O.
1234 -+ *
1235 -+ * Queue lock must be held.
1236 -+ */
1237 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1238 -+ struct io_context *ioc)
1239 -+{
1240 -+ if(ioc)
1241 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1242 -+ return NULL;
1243 -+}
1244 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1245 -new file mode 100644
1246 -index 0000000..0ed2746
1247 ---- /dev/null
1248 -+++ b/block/bfq-iosched.c
1249 -@@ -0,0 +1,3082 @@
1250 -+/*
1251 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
1252 -+ *
1253 -+ * Based on ideas and code from CFQ:
1254 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1255 -+ *
1256 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1257 -+ * Paolo Valente <paolo.valente@×××××××.it>
1258 -+ *
1259 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1260 -+ *
1261 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1262 -+ *
1263 -+ * BFQ is a proportional share disk scheduling algorithm based on the
1264 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
1265 -+ * measured in number of sectors, to tasks instead of time slices.
1266 -+ * The disk is not granted to the active task for a given time slice,
1267 -+ * but until it has exhausted its assigned budget. This change from
1268 -+ * the time to the service domain allows BFQ to distribute the disk
1269 -+ * bandwidth among tasks as desired, without any distortion due to
1270 -+ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
1271 -+ * internal scheduler, called B-WF2Q+, to schedule tasks according to
1272 -+ * their budgets. Thanks to this accurate scheduler, BFQ can afford
1273 -+ * to assign high budgets to disk-bound non-seeky tasks (to boost the
1274 -+ * throughput), and yet guarantee low latencies to interactive and
1275 -+ * soft real-time applications.
1276 -+ *
1277 -+ * BFQ has been introduced in [1], where the interested reader can
1278 -+ * find an accurate description of the algorithm, the bandwidth
1279 -+ * distribution and latency guarantees it provides, plus formal proofs
1280 -+ * of all the properties. With respect to the algorithm presented in
1281 -+ * the paper, this implementation adds several little heuristics, and
1282 -+ * a hierarchical extension, based on H-WF2Q+.
1283 -+ *
1284 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1285 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1286 -+ * complexity derives from the one introduced with EEVDF in [3].
1287 -+ *
1288 -+ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
1289 -+ * with Deterministic Guarantees on Bandwidth Distribution,'',
1290 -+ * IEEE Transactions on Computer, May 2010.
1291 -+ *
1292 -+ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
1293 -+ *
1294 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1295 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1296 -+ * Oct 1997.
1297 -+ *
1298 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1299 -+ *
1300 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1301 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
1302 -+ * Resource Allocation,'' technical report.
1303 -+ *
1304 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1305 -+ */
1306 -+#include <linux/module.h>
1307 -+#include <linux/slab.h>
1308 -+#include <linux/blkdev.h>
1309 -+#include <linux/cgroup.h>
1310 -+#include <linux/elevator.h>
1311 -+#include <linux/jiffies.h>
1312 -+#include <linux/rbtree.h>
1313 -+#include <linux/ioprio.h>
1314 -+#include "bfq.h"
1315 -+#include "blk.h"
1316 -+
1317 -+/* Max number of dispatches in one round of service. */
1318 -+static const int bfq_quantum = 4;
1319 -+
1320 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1321 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1322 -+
1323 -+/* Maximum backwards seek, in KiB. */
1324 -+static const int bfq_back_max = 16 * 1024;
1325 -+
1326 -+/* Penalty of a backwards seek, in number of sectors. */
1327 -+static const int bfq_back_penalty = 2;
1328 -+
1329 -+/* Idling period duration, in jiffies. */
1330 -+static int bfq_slice_idle = HZ / 125;
1331 -+
1332 -+/* Default maximum budget values, in sectors and number of requests. */
1333 -+static const int bfq_default_max_budget = 16 * 1024;
1334 -+static const int bfq_max_budget_async_rq = 4;
1335 -+
1336 -+/*
1337 -+ * Async to sync throughput distribution is controlled as follows:
1338 -+ * when an async request is served, the entity is charged the number
1339 -+ * of sectors of the request, multiplied by the factor below
1340 -+ */
1341 -+static const int bfq_async_charge_factor = 10;
1342 -+
1343 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
1344 -+static const int bfq_timeout_sync = HZ / 8;
1345 -+static int bfq_timeout_async = HZ / 25;
1346 -+
1347 -+struct kmem_cache *bfq_pool;
1348 -+
1349 -+/* Below this threshold (in ms), we consider thinktime immediate. */
1350 -+#define BFQ_MIN_TT 2
1351 -+
1352 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
1353 -+#define BFQ_HW_QUEUE_THRESHOLD 4
1354 -+#define BFQ_HW_QUEUE_SAMPLES 32
1355 -+
1356 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1357 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1358 -+
1359 -+/* Min samples used for peak rate estimation (for autotuning). */
1360 -+#define BFQ_PEAK_RATE_SAMPLES 32
1361 -+
1362 -+/* Shift used for peak rate fixed precision calculations. */
1363 -+#define BFQ_RATE_SHIFT 16
1364 -+
1365 -+/*
1366 -+ * The duration of the weight raising for interactive applications is
1367 -+ * computed automatically (as default behaviour), using the following
1368 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
1369 -+ * disk, and R and T are two reference parameters. In particular, R is
1370 -+ * the peak rate of a reference disk, and T is about the maximum time
1371 -+ * for starting popular large applications on that disk, under BFQ and
1372 -+ * while reading two files in parallel. Finally, BFQ uses two
1373 -+ * different pairs (R, T) depending on whether the disk is rotational
1374 -+ * or non-rotational.
1375 -+ */
1376 -+#define T_rot (msecs_to_jiffies(5500))
1377 -+#define T_nonrot (msecs_to_jiffies(2000))
1378 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1379 -+#define R_rot 17415
1380 -+#define R_nonrot 34791
1381 -+
1382 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1383 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1384 -+
1385 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1386 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1387 -+
1388 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1389 -+
1390 -+#include "bfq-ioc.c"
1391 -+#include "bfq-sched.c"
1392 -+#include "bfq-cgroup.c"
1393 -+
1394 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1395 -+ IOPRIO_CLASS_IDLE)
1396 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1397 -+ IOPRIO_CLASS_RT)
1398 -+
1399 -+#define bfq_sample_valid(samples) ((samples) > 80)
1400 -+
1401 -+/*
1402 -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1403 -+ * set (in which case it could also be a direct WRITE).
1404 -+ */
1405 -+static inline int bfq_bio_sync(struct bio *bio)
1406 -+{
1407 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1408 -+ return 1;
1409 -+
1410 -+ return 0;
1411 -+}
1412 -+
1413 -+/*
1414 -+ * Scheduler run of queue, if there are requests pending and no one in the
1415 -+ * driver that will restart queueing.
1416 -+ */
1417 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1418 -+{
1419 -+ if (bfqd->queued != 0) {
1420 -+ bfq_log(bfqd, "schedule dispatch");
1421 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1422 -+ }
1423 -+}
1424 -+
1425 -+/*
1426 -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1427 -+ * We choose the request that is closest to the head right now. Distance
1428 -+ * behind the head is penalized and only allowed to a certain extent.
1429 -+ */
1430 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
1431 -+ struct request *rq1,
1432 -+ struct request *rq2,
1433 -+ sector_t last)
1434 -+{
1435 -+ sector_t s1, s2, d1 = 0, d2 = 0;
1436 -+ unsigned long back_max;
1437 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1438 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1439 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1440 -+
1441 -+ if (rq1 == NULL || rq1 == rq2)
1442 -+ return rq2;
1443 -+ if (rq2 == NULL)
1444 -+ return rq1;
1445 -+
1446 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1447 -+ return rq1;
1448 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1449 -+ return rq2;
1450 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1451 -+ return rq1;
1452 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1453 -+ return rq2;
1454 -+
1455 -+ s1 = blk_rq_pos(rq1);
1456 -+ s2 = blk_rq_pos(rq2);
1457 -+
1458 -+ /*
1459 -+ * By definition, 1KiB is 2 sectors.
1460 -+ */
1461 -+ back_max = bfqd->bfq_back_max * 2;
1462 -+
1463 -+ /*
1464 -+ * Strict one way elevator _except_ in the case where we allow
1465 -+ * short backward seeks which are biased as twice the cost of a
1466 -+ * similar forward seek.
1467 -+ */
1468 -+ if (s1 >= last)
1469 -+ d1 = s1 - last;
1470 -+ else if (s1 + back_max >= last)
1471 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
1472 -+ else
1473 -+ wrap |= BFQ_RQ1_WRAP;
1474 -+
1475 -+ if (s2 >= last)
1476 -+ d2 = s2 - last;
1477 -+ else if (s2 + back_max >= last)
1478 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
1479 -+ else
1480 -+ wrap |= BFQ_RQ2_WRAP;
1481 -+
1482 -+ /* Found required data */
1483 -+
1484 -+ /*
1485 -+ * By doing switch() on the bit mask "wrap" we avoid having to
1486 -+ * check two variables for all permutations: --> faster!
1487 -+ */
1488 -+ switch (wrap) {
1489 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1490 -+ if (d1 < d2)
1491 -+ return rq1;
1492 -+ else if (d2 < d1)
1493 -+ return rq2;
1494 -+ else {
1495 -+ if (s1 >= s2)
1496 -+ return rq1;
1497 -+ else
1498 -+ return rq2;
1499 -+ }
1500 -+
1501 -+ case BFQ_RQ2_WRAP:
1502 -+ return rq1;
1503 -+ case BFQ_RQ1_WRAP:
1504 -+ return rq2;
1505 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1506 -+ default:
1507 -+ /*
1508 -+ * Since both rqs are wrapped,
1509 -+ * start with the one that's further behind head
1510 -+ * (--> only *one* back seek required),
1511 -+ * since back seek takes more time than forward.
1512 -+ */
1513 -+ if (s1 <= s2)
1514 -+ return rq1;
1515 -+ else
1516 -+ return rq2;
1517 -+ }
1518 -+}
1519 -+
1520 -+static struct bfq_queue *
1521 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1522 -+ sector_t sector, struct rb_node **ret_parent,
1523 -+ struct rb_node ***rb_link)
1524 -+{
1525 -+ struct rb_node **p, *parent;
1526 -+ struct bfq_queue *bfqq = NULL;
1527 -+
1528 -+ parent = NULL;
1529 -+ p = &root->rb_node;
1530 -+ while (*p) {
1531 -+ struct rb_node **n;
1532 -+
1533 -+ parent = *p;
1534 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1535 -+
1536 -+ /*
1537 -+ * Sort strictly based on sector. Smallest to the left,
1538 -+ * largest to the right.
1539 -+ */
1540 -+ if (sector > blk_rq_pos(bfqq->next_rq))
1541 -+ n = &(*p)->rb_right;
1542 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
1543 -+ n = &(*p)->rb_left;
1544 -+ else
1545 -+ break;
1546 -+ p = n;
1547 -+ bfqq = NULL;
1548 -+ }
1549 -+
1550 -+ *ret_parent = parent;
1551 -+ if (rb_link)
1552 -+ *rb_link = p;
1553 -+
1554 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1555 -+ (long long unsigned)sector,
1556 -+ bfqq != NULL ? bfqq->pid : 0);
1557 -+
1558 -+ return bfqq;
1559 -+}
1560 -+
1561 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1562 -+{
1563 -+ struct rb_node **p, *parent;
1564 -+ struct bfq_queue *__bfqq;
1565 -+
1566 -+ if (bfqq->pos_root != NULL) {
1567 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1568 -+ bfqq->pos_root = NULL;
1569 -+ }
1570 -+
1571 -+ if (bfq_class_idle(bfqq))
1572 -+ return;
1573 -+ if (!bfqq->next_rq)
1574 -+ return;
1575 -+
1576 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
1577 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1578 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
1579 -+ if (__bfqq == NULL) {
1580 -+ rb_link_node(&bfqq->pos_node, parent, p);
1581 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1582 -+ } else
1583 -+ bfqq->pos_root = NULL;
1584 -+}
1585 -+
1586 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1587 -+ struct bfq_queue *bfqq,
1588 -+ struct request *last)
1589 -+{
1590 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
1591 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
1592 -+ struct request *next = NULL, *prev = NULL;
1593 -+
1594 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1595 -+
1596 -+ if (rbprev != NULL)
1597 -+ prev = rb_entry_rq(rbprev);
1598 -+
1599 -+ if (rbnext != NULL)
1600 -+ next = rb_entry_rq(rbnext);
1601 -+ else {
1602 -+ rbnext = rb_first(&bfqq->sort_list);
1603 -+ if (rbnext && rbnext != &last->rb_node)
1604 -+ next = rb_entry_rq(rbnext);
1605 -+ }
1606 -+
1607 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1608 -+}
1609 -+
1610 -+static void bfq_del_rq_rb(struct request *rq)
1611 -+{
1612 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1613 -+ struct bfq_data *bfqd = bfqq->bfqd;
1614 -+ const int sync = rq_is_sync(rq);
1615 -+
1616 -+ BUG_ON(bfqq->queued[sync] == 0);
1617 -+ bfqq->queued[sync]--;
1618 -+ bfqd->queued--;
1619 -+
1620 -+ elv_rb_del(&bfqq->sort_list, rq);
1621 -+
1622 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1623 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
1624 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1625 -+ /*
1626 -+ * Remove queue from request-position tree as it is empty.
1627 -+ */
1628 -+ if (bfqq->pos_root != NULL) {
1629 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1630 -+ bfqq->pos_root = NULL;
1631 -+ }
1632 -+ }
1633 -+}
1634 -+
1635 -+/* see the definition of bfq_async_charge_factor for details */
1636 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
1637 -+ struct bfq_queue *bfqq)
1638 -+{
1639 -+ return blk_rq_sectors(rq) *
1640 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1641 -+ bfq_async_charge_factor));
1642 -+}
1643 -+
1644 -+/**
1645 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
1646 -+ * @bfqd: the device data the queue belongs to.
1647 -+ * @bfqq: the queue to update.
1648 -+ *
1649 -+ * If the first request of a queue changes we make sure that the queue
1650 -+ * has enough budget to serve at least its first request (if the
1651 -+ * request has grown). We do this because if the queue does not have enough
1652 -+ * budget for its first request, it has to go through two dispatch
1653 -+ * rounds to actually get it dispatched.
1654 -+ */
1655 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
1656 -+ struct bfq_queue *bfqq)
1657 -+{
1658 -+ struct bfq_entity *entity = &bfqq->entity;
1659 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1660 -+ struct request *next_rq = bfqq->next_rq;
1661 -+ unsigned long new_budget;
1662 -+
1663 -+ if (next_rq == NULL)
1664 -+ return;
1665 -+
1666 -+ if (bfqq == bfqd->active_queue)
1667 -+ /*
1668 -+ * In order not to break guarantees, budgets cannot be
1669 -+ * changed after an entity has been selected.
1670 -+ */
1671 -+ return;
1672 -+
1673 -+ BUG_ON(entity->tree != &st->active);
1674 -+ BUG_ON(entity == entity->sched_data->active_entity);
1675 -+
1676 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
1677 -+ bfq_serv_to_charge(next_rq, bfqq));
1678 -+ entity->budget = new_budget;
1679 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1680 -+ bfq_activate_bfqq(bfqd, bfqq);
1681 -+}
1682 -+
1683 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1684 -+{
1685 -+ u64 dur;
1686 -+
1687 -+ if (bfqd->bfq_raising_max_time > 0)
1688 -+ return bfqd->bfq_raising_max_time;
1689 -+
1690 -+ dur = bfqd->RT_prod;
1691 -+ do_div(dur, bfqd->peak_rate);
1692 -+
1693 -+ return dur;
1694 -+}
1695 -+
1696 -+static void bfq_add_rq_rb(struct request *rq)
1697 -+{
1698 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1699 -+ struct bfq_entity *entity = &bfqq->entity;
1700 -+ struct bfq_data *bfqd = bfqq->bfqd;
1701 -+ struct request *next_rq, *prev;
1702 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
1703 -+ int idle_for_long_time = bfqq->budget_timeout +
1704 -+ bfqd->bfq_raising_min_idle_time < jiffies;
1705 -+
1706 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1707 -+ bfqq->queued[rq_is_sync(rq)]++;
1708 -+ bfqd->queued++;
1709 -+
1710 -+ elv_rb_add(&bfqq->sort_list, rq);
1711 -+
1712 -+ /*
1713 -+ * Check if this request is a better next-serve candidate.
1714 -+ */
1715 -+ prev = bfqq->next_rq;
1716 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1717 -+ BUG_ON(next_rq == NULL);
1718 -+ bfqq->next_rq = next_rq;
1719 -+
1720 -+ /*
1721 -+ * Adjust priority tree position, if next_rq changes.
1722 -+ */
1723 -+ if (prev != bfqq->next_rq)
1724 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
1725 -+
1726 -+ if (!bfq_bfqq_busy(bfqq)) {
1727 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1728 -+ bfqq->soft_rt_next_start < jiffies;
1729 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
1730 -+ bfq_serv_to_charge(next_rq, bfqq));
1731 -+
1732 -+		if (!bfqd->low_latency)
1733 -+ goto add_bfqq_busy;
1734 -+
1735 -+ /*
1736 -+ * If the queue is not being boosted and has been idle
1737 -+ * for enough time, start a weight-raising period
1738 -+ */
1739 -+		if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
1740 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1741 -+ if (idle_for_long_time)
1742 -+ bfqq->raising_cur_max_time =
1743 -+ bfq_wrais_duration(bfqd);
1744 -+ else
1745 -+ bfqq->raising_cur_max_time =
1746 -+ bfqd->bfq_raising_rt_max_time;
1747 -+ bfq_log_bfqq(bfqd, bfqq,
1748 -+				     "wrais starting at %llu msec, "
1749 -+ "rais_max_time %u",
1750 -+ bfqq->last_rais_start_finish,
1751 -+ jiffies_to_msecs(bfqq->
1752 -+ raising_cur_max_time));
1753 -+ } else if (old_raising_coeff > 1) {
1754 -+ if (idle_for_long_time)
1755 -+ bfqq->raising_cur_max_time =
1756 -+ bfq_wrais_duration(bfqd);
1757 -+ else if (bfqq->raising_cur_max_time ==
1758 -+ bfqd->bfq_raising_rt_max_time &&
1759 -+ !soft_rt) {
1760 -+ bfqq->raising_coeff = 1;
1761 -+ bfq_log_bfqq(bfqd, bfqq,
1762 -+					     "wrais ending at %llu msec, "
1763 -+ "rais_max_time %u",
1764 -+ bfqq->last_rais_start_finish,
1765 -+ jiffies_to_msecs(bfqq->
1766 -+ raising_cur_max_time));
1767 -+ }
1768 -+ }
1769 -+ if (old_raising_coeff != bfqq->raising_coeff)
1770 -+ entity->ioprio_changed = 1;
1771 -+add_bfqq_busy:
1772 -+ bfq_add_bfqq_busy(bfqd, bfqq);
1773 -+ } else {
1774 -+		if (bfqd->low_latency && old_raising_coeff == 1 &&
1775 -+ !rq_is_sync(rq) &&
1776 -+ bfqq->last_rais_start_finish +
1777 -+ bfqd->bfq_raising_min_inter_arr_async < jiffies) {
1778 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1779 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1780 -+
1781 -+ entity->ioprio_changed = 1;
1782 -+ bfq_log_bfqq(bfqd, bfqq,
1783 -+				     "non-idle wrais starting at %llu msec, "
1784 -+ "rais_max_time %u",
1785 -+ bfqq->last_rais_start_finish,
1786 -+ jiffies_to_msecs(bfqq->
1787 -+ raising_cur_max_time));
1788 -+ }
1789 -+ bfq_updated_next_req(bfqd, bfqq);
1790 -+ }
1791 -+
1792 -+	if (bfqd->low_latency &&
1793 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1794 -+ idle_for_long_time))
1795 -+ bfqq->last_rais_start_finish = jiffies;
1796 -+}
1797 -+
1798 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1799 -+{
1800 -+ elv_rb_del(&bfqq->sort_list, rq);
1801 -+ bfqq->queued[rq_is_sync(rq)]--;
1802 -+ bfqq->bfqd->queued--;
1803 -+ bfq_add_rq_rb(rq);
1804 -+}
1805 -+
1806 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1807 -+ struct bio *bio)
1808 -+{
1809 -+ struct task_struct *tsk = current;
1810 -+ struct bfq_io_cq *bic;
1811 -+ struct bfq_queue *bfqq;
1812 -+
1813 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1814 -+ if (bic == NULL)
1815 -+ return NULL;
1816 -+
1817 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1818 -+ if (bfqq != NULL) {
1819 -+ sector_t sector = bio->bi_sector + bio_sectors(bio);
1820 -+
1821 -+ return elv_rb_find(&bfqq->sort_list, sector);
1822 -+ }
1823 -+
1824 -+ return NULL;
1825 -+}
1826 -+
1827 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
1828 -+{
1829 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1830 -+
1831 -+ bfqd->rq_in_driver++;
1832 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1833 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1834 -+ (long long unsigned)bfqd->last_position);
1835 -+}
1836 -+
1837 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1838 -+{
1839 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1840 -+
1841 -+ WARN_ON(bfqd->rq_in_driver == 0);
1842 -+ bfqd->rq_in_driver--;
1843 -+}
1844 -+
1845 -+static void bfq_remove_request(struct request *rq)
1846 -+{
1847 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1848 -+ struct bfq_data *bfqd = bfqq->bfqd;
1849 -+
1850 -+ if (bfqq->next_rq == rq) {
1851 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1852 -+ bfq_updated_next_req(bfqd, bfqq);
1853 -+ }
1854 -+
1855 -+ list_del_init(&rq->queuelist);
1856 -+ bfq_del_rq_rb(rq);
1857 -+
1858 -+ if (rq->cmd_flags & REQ_META) {
1859 -+ WARN_ON(bfqq->meta_pending == 0);
1860 -+ bfqq->meta_pending--;
1861 -+ }
1862 -+}
1863 -+
1864 -+static int bfq_merge(struct request_queue *q, struct request **req,
1865 -+ struct bio *bio)
1866 -+{
1867 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1868 -+ struct request *__rq;
1869 -+
1870 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
1871 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1872 -+ *req = __rq;
1873 -+ return ELEVATOR_FRONT_MERGE;
1874 -+ }
1875 -+
1876 -+ return ELEVATOR_NO_MERGE;
1877 -+}
1878 -+
1879 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
1880 -+ int type)
1881 -+{
1882 -+ if (type == ELEVATOR_FRONT_MERGE) {
1883 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
1884 -+
1885 -+ bfq_reposition_rq_rb(bfqq, req);
1886 -+ }
1887 -+}
1888 -+
1889 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1890 -+ struct request *next)
1891 -+{
1892 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1893 -+
1894 -+ /*
1895 -+ * Reposition in fifo if next is older than rq.
1896 -+ */
1897 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1898 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1899 -+ list_move(&rq->queuelist, &next->queuelist);
1900 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
1901 -+ }
1902 -+
1903 -+ if (bfqq->next_rq == next)
1904 -+ bfqq->next_rq = rq;
1905 -+
1906 -+ bfq_remove_request(next);
1907 -+}
1908 -+
1909 -+/* Must be called with bfqq != NULL */
1910 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
1911 -+{
1912 -+ BUG_ON(bfqq == NULL);
1913 -+ bfqq->raising_coeff = 1;
1914 -+ bfqq->raising_cur_max_time = 0;
1915 -+ /* Trigger a weight change on the next activation of the queue */
1916 -+ bfqq->entity.ioprio_changed = 1;
1917 -+}
1918 -+
1919 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
1920 -+ struct bfq_group *bfqg)
1921 -+{
1922 -+ int i, j;
1923 -+
1924 -+ for (i = 0; i < 2; i++)
1925 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
1926 -+ if (bfqg->async_bfqq[i][j] != NULL)
1927 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
1928 -+ if (bfqg->async_idle_bfqq != NULL)
1929 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
1930 -+}
1931 -+
1932 -+static void bfq_end_raising(struct bfq_data *bfqd)
1933 -+{
1934 -+ struct bfq_queue *bfqq;
1935 -+
1936 -+ spin_lock_irq(bfqd->queue->queue_lock);
1937 -+
1938 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
1939 -+ bfq_bfqq_end_raising(bfqq);
1940 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
1941 -+ bfq_bfqq_end_raising(bfqq);
1942 -+ bfq_end_raising_async(bfqd);
1943 -+
1944 -+ spin_unlock_irq(bfqd->queue->queue_lock);
1945 -+}
1946 -+
1947 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1948 -+ struct bio *bio)
1949 -+{
1950 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1951 -+ struct bfq_io_cq *bic;
1952 -+ struct bfq_queue *bfqq;
1953 -+
1954 -+ /*
1955 -+ * Disallow merge of a sync bio into an async request.
1956 -+ */
1957 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1958 -+ return 0;
1959 -+
1960 -+ /*
1961 -+ * Lookup the bfqq that this bio will be queued with. Allow
1962 -+ * merge only if rq is queued there.
1963 -+ * Queue lock is held here.
1964 -+ */
1965 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
1966 -+ if (bic == NULL)
1967 -+ return 0;
1968 -+
1969 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1970 -+ return bfqq == RQ_BFQQ(rq);
1971 -+}
1972 -+
1973 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
1974 -+ struct bfq_queue *bfqq)
1975 -+{
1976 -+ if (bfqq != NULL) {
1977 -+ bfq_mark_bfqq_must_alloc(bfqq);
1978 -+ bfq_mark_bfqq_budget_new(bfqq);
1979 -+ bfq_clear_bfqq_fifo_expire(bfqq);
1980 -+
1981 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
1982 -+
1983 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
1984 -+ bfqq->entity.budget);
1985 -+ }
1986 -+
1987 -+ bfqd->active_queue = bfqq;
1988 -+}
1989 -+
1990 -+/*
1991 -+ * Get and set a new active queue for service.
1992 -+ */
1993 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
1994 -+ struct bfq_queue *bfqq)
1995 -+{
1996 -+ if (!bfqq)
1997 -+ bfqq = bfq_get_next_queue(bfqd);
1998 -+ else
1999 -+ bfq_get_next_queue_forced(bfqd, bfqq);
2000 -+
2001 -+ __bfq_set_active_queue(bfqd, bfqq);
2002 -+ return bfqq;
2003 -+}
2004 -+
2005 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2006 -+ struct request *rq)
2007 -+{
2008 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
2009 -+ return blk_rq_pos(rq) - bfqd->last_position;
2010 -+ else
2011 -+ return bfqd->last_position - blk_rq_pos(rq);
2012 -+}
2013 -+
2014 -+/*
2015 -+ * Return true if bfqq has no request pending and rq is close enough to
2016 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
2017 -+ * bfqq->next_rq
2018 -+ */
2019 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2020 -+{
2021 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2022 -+}
2023 -+
2024 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2025 -+{
2026 -+ struct rb_root *root = &bfqd->rq_pos_tree;
2027 -+ struct rb_node *parent, *node;
2028 -+ struct bfq_queue *__bfqq;
2029 -+ sector_t sector = bfqd->last_position;
2030 -+
2031 -+ if (RB_EMPTY_ROOT(root))
2032 -+ return NULL;
2033 -+
2034 -+ /*
2035 -+ * First, if we find a request starting at the end of the last
2036 -+ * request, choose it.
2037 -+ */
2038 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2039 -+ if (__bfqq != NULL)
2040 -+ return __bfqq;
2041 -+
2042 -+ /*
2043 -+ * If the exact sector wasn't found, the parent of the NULL leaf
2044 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
2045 -+ * position).
2046 -+ */
2047 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2048 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2049 -+ return __bfqq;
2050 -+
2051 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
2052 -+ node = rb_next(&__bfqq->pos_node);
2053 -+ else
2054 -+ node = rb_prev(&__bfqq->pos_node);
2055 -+ if (node == NULL)
2056 -+ return NULL;
2057 -+
2058 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2059 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2060 -+ return __bfqq;
2061 -+
2062 -+ return NULL;
2063 -+}
2064 -+
2065 -+/*
2066 -+ * bfqd - the device data.
2067 -+ * cur_bfqq - passed in so that we don't decide that the current queue
2068 -+ * is closely cooperating with itself.
2069 -+ *
2070 -+ * We are assuming that cur_bfqq has dispatched at least one request,
2071 -+ * and that bfqd->last_position reflects a position on the disk associated
2072 -+ * with the I/O issued by cur_bfqq.
2073 -+ */
2074 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2075 -+ struct bfq_queue *cur_bfqq)
2076 -+{
2077 -+ struct bfq_queue *bfqq;
2078 -+
2079 -+ if (bfq_class_idle(cur_bfqq))
2080 -+ return NULL;
2081 -+ if (!bfq_bfqq_sync(cur_bfqq))
2082 -+ return NULL;
2083 -+ if (BFQQ_SEEKY(cur_bfqq))
2084 -+ return NULL;
2085 -+
2086 -+ /* If device has only one backlogged bfq_queue, don't search. */
2087 -+ if (bfqd->busy_queues == 1)
2088 -+ return NULL;
2089 -+
2090 -+ /*
2091 -+ * We should notice if some of the queues are cooperating, e.g.
2092 -+ * working closely on the same area of the disk. In that case,
2093 -+	 * we can group them together and not waste time idling.
2094 -+ */
2095 -+ bfqq = bfqq_close(bfqd);
2096 -+ if (bfqq == NULL || bfqq == cur_bfqq)
2097 -+ return NULL;
2098 -+
2099 -+ /*
2100 -+ * Do not merge queues from different bfq_groups.
2101 -+ */
2102 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2103 -+ return NULL;
2104 -+
2105 -+ /*
2106 -+ * It only makes sense to merge sync queues.
2107 -+ */
2108 -+ if (!bfq_bfqq_sync(bfqq))
2109 -+ return NULL;
2110 -+ if (BFQQ_SEEKY(bfqq))
2111 -+ return NULL;
2112 -+
2113 -+ /*
2114 -+ * Do not merge queues of different priority classes.
2115 -+ */
2116 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2117 -+ return NULL;
2118 -+
2119 -+ return bfqq;
2120 -+}
2121 -+
2122 -+/*
2123 -+ * If enough samples have been computed, return the current max budget
2124 -+ * stored in bfqd, which is dynamically updated according to the
2125 -+ * estimated disk peak rate; otherwise return the default max budget
2126 -+ */
2127 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2128 -+{
2129 -+ if (bfqd->budgets_assigned < 194)
2130 -+ return bfq_default_max_budget;
2131 -+ else
2132 -+ return bfqd->bfq_max_budget;
2133 -+}
2134 -+
2135 -+/*
2136 -+ * Return min budget, which is a fraction of the current or default
2137 -+ * max budget (trying with 1/32)
2138 -+ */
2139 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2140 -+{
2141 -+ if (bfqd->budgets_assigned < 194)
2142 -+ return bfq_default_max_budget / 32;
2143 -+ else
2144 -+ return bfqd->bfq_max_budget / 32;
2145 -+}
2146 -+
2147 -+/*
2148 -+ * Decides whether idling should be done for the given device and
2149 -+ * the given active queue.
2150 -+ */
2151 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2152 -+ struct bfq_queue *active_bfqq)
2153 -+{
2154 -+ if (active_bfqq == NULL)
2155 -+ return false;
2156 -+ /*
2157 -+ * If device is SSD it has no seek penalty, disable idling; but
2158 -+ * do so only if:
2159 -+ * - device does not support queuing, otherwise we still have
2160 -+ * a problem with sync vs async workloads;
2161 -+ * - the queue is not weight-raised, to preserve guarantees.
2162 -+ */
2163 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2164 -+ active_bfqq->raising_coeff == 1);
2165 -+}
2166 -+
2167 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2168 -+{
2169 -+ struct bfq_queue *bfqq = bfqd->active_queue;
2170 -+ struct bfq_io_cq *bic;
2171 -+ unsigned long sl;
2172 -+
2173 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2174 -+
2175 -+ /* Tasks have exited, don't wait. */
2176 -+ bic = bfqd->active_bic;
2177 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2178 -+ return;
2179 -+
2180 -+ bfq_mark_bfqq_wait_request(bfqq);
2181 -+
2182 -+ /*
2183 -+ * We don't want to idle for seeks, but we do want to allow
2184 -+ * fair distribution of slice time for a process doing back-to-back
2185 -+	 * seeks. So allow a little bit of time for it to submit a new rq.
2186 -+ *
2187 -+ * To prevent processes with (partly) seeky workloads from
2188 -+ * being too ill-treated, grant them a small fraction of the
2189 -+ * assigned budget before reducing the waiting time to
2190 -+	 * BFQ_MIN_TT. In practice, this helps reduce latency.
2191 -+ */
2192 -+ sl = bfqd->bfq_slice_idle;
2193 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2194 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2195 -+ bfqq->raising_coeff == 1)
2196 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2197 -+ else if (bfqq->raising_coeff > 1)
2198 -+ sl = sl * 3;
2199 -+ bfqd->last_idling_start = ktime_get();
2200 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2201 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
2202 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2203 -+}
2204 -+
2205 -+/*
2206 -+ * Set the maximum time for the active queue to consume its
2207 -+ * budget. This prevents seeky processes from lowering the disk
2208 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
2209 -+ */
2210 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2211 -+{
2212 -+ struct bfq_queue *bfqq = bfqd->active_queue;
2213 -+ unsigned int timeout_coeff;
2214 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2215 -+ timeout_coeff = 1;
2216 -+ else
2217 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2218 -+
2219 -+ bfqd->last_budget_start = ktime_get();
2220 -+
2221 -+ bfq_clear_bfqq_budget_new(bfqq);
2222 -+ bfqq->budget_timeout = jiffies +
2223 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2224 -+
2225 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2226 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2227 -+ timeout_coeff));
2228 -+}
2229 -+
2230 -+/*
2231 -+ * Move request from internal lists to the request queue dispatch list.
2232 -+ */
2233 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2234 -+{
2235 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2236 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2237 -+
2238 -+ bfq_remove_request(rq);
2239 -+ bfqq->dispatched++;
2240 -+ elv_dispatch_sort(q, rq);
2241 -+
2242 -+ if (bfq_bfqq_sync(bfqq))
2243 -+ bfqd->sync_flight++;
2244 -+}
2245 -+
2246 -+/*
2247 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
2248 -+ */
2249 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2250 -+{
2251 -+ struct request *rq = NULL;
2252 -+
2253 -+ if (bfq_bfqq_fifo_expire(bfqq))
2254 -+ return NULL;
2255 -+
2256 -+ bfq_mark_bfqq_fifo_expire(bfqq);
2257 -+
2258 -+ if (list_empty(&bfqq->fifo))
2259 -+ return NULL;
2260 -+
2261 -+ rq = rq_entry_fifo(bfqq->fifo.next);
2262 -+
2263 -+ if (time_before(jiffies, rq_fifo_time(rq)))
2264 -+ return NULL;
2265 -+
2266 -+ return rq;
2267 -+}
2268 -+
2269 -+/*
2270 -+ * Must be called with the queue_lock held.
2271 -+ */
2272 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
2273 -+{
2274 -+ int process_refs, io_refs;
2275 -+
2276 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2277 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2278 -+ BUG_ON(process_refs < 0);
2279 -+ return process_refs;
2280 -+}
2281 -+
2282 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2283 -+{
2284 -+ int process_refs, new_process_refs;
2285 -+ struct bfq_queue *__bfqq;
2286 -+
2287 -+ /*
2288 -+ * If there are no process references on the new_bfqq, then it is
2289 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2290 -+ * may have dropped their last reference (not just their last process
2291 -+ * reference).
2292 -+ */
2293 -+ if (!bfqq_process_refs(new_bfqq))
2294 -+ return;
2295 -+
2296 -+ /* Avoid a circular list and skip interim queue merges. */
2297 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
2298 -+ if (__bfqq == bfqq)
2299 -+ return;
2300 -+ new_bfqq = __bfqq;
2301 -+ }
2302 -+
2303 -+ process_refs = bfqq_process_refs(bfqq);
2304 -+ new_process_refs = bfqq_process_refs(new_bfqq);
2305 -+ /*
2306 -+ * If the process for the bfqq has gone away, there is no
2307 -+ * sense in merging the queues.
2308 -+ */
2309 -+ if (process_refs == 0 || new_process_refs == 0)
2310 -+ return;
2311 -+
2312 -+ /*
2313 -+ * Merge in the direction of the lesser amount of work.
2314 -+ */
2315 -+ if (new_process_refs >= process_refs) {
2316 -+ bfqq->new_bfqq = new_bfqq;
2317 -+ atomic_add(process_refs, &new_bfqq->ref);
2318 -+ } else {
2319 -+ new_bfqq->new_bfqq = bfqq;
2320 -+ atomic_add(new_process_refs, &bfqq->ref);
2321 -+ }
2322 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2323 -+ new_bfqq->pid);
2324 -+}
2325 -+
2326 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2327 -+{
2328 -+ struct bfq_entity *entity = &bfqq->entity;
2329 -+ return entity->budget - entity->service;
2330 -+}
2331 -+
2332 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2333 -+{
2334 -+ BUG_ON(bfqq != bfqd->active_queue);
2335 -+
2336 -+ __bfq_bfqd_reset_active(bfqd);
2337 -+
2338 -+ /*
2339 -+ * If this bfqq is shared between multiple processes, check
2340 -+ * to make sure that those processes are still issuing I/Os
2341 -+ * within the mean seek distance. If not, it may be time to
2342 -+ * break the queues apart again.
2343 -+ */
2344 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2345 -+ bfq_mark_bfqq_split_coop(bfqq);
2346 -+
2347 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2348 -+ /*
2349 -+		 * Overloading the budget_timeout field to store the time
2350 -+		 * at which the queue became empty (no backlog); used by
2351 -+		 * the weight-raising mechanism.
2352 -+		 */
2353 -+		bfqq->budget_timeout = jiffies;
2354 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2355 -+ } else {
2356 -+ bfq_activate_bfqq(bfqd, bfqq);
2357 -+ /*
2358 -+ * Resort priority tree of potential close cooperators.
2359 -+ */
2360 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
2361 -+ }
2362 -+}
2363 -+
2364 -+/**
2365 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2366 -+ * @bfqd: device data.
2367 -+ * @bfqq: queue to update.
2368 -+ * @reason: reason for expiration.
2369 -+ *
2370 -+ * Handle the feedback on @bfqq budget. See the body for detailed
2371 -+ * comments.
2372 -+ */
2373 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2374 -+ struct bfq_queue *bfqq,
2375 -+ enum bfqq_expiration reason)
2376 -+{
2377 -+ struct request *next_rq;
2378 -+ unsigned long budget, min_budget;
2379 -+
2380 -+ budget = bfqq->max_budget;
2381 -+ min_budget = bfq_min_budget(bfqd);
2382 -+
2383 -+ BUG_ON(bfqq != bfqd->active_queue);
2384 -+
2385 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2386 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2387 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2388 -+ budget, bfq_min_budget(bfqd));
2389 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2390 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
2391 -+
2392 -+ if (bfq_bfqq_sync(bfqq)) {
2393 -+ switch (reason) {
2394 -+ /*
2395 -+ * Caveat: in all the following cases we trade latency
2396 -+ * for throughput.
2397 -+ */
2398 -+ case BFQ_BFQQ_TOO_IDLE:
2399 -+ /*
2400 -+ * This is the only case where we may reduce
2401 -+			 * the budget: if there are no requests of the
2402 -+ * process still waiting for completion, then
2403 -+ * we assume (tentatively) that the timer has
2404 -+ * expired because the batch of requests of
2405 -+ * the process could have been served with a
2406 -+ * smaller budget. Hence, betting that
2407 -+			 * the process will behave in the same way when it
2408 -+ * becomes backlogged again, we reduce its
2409 -+ * next budget. As long as we guess right,
2410 -+ * this budget cut reduces the latency
2411 -+ * experienced by the process.
2412 -+ *
2413 -+ * However, if there are still outstanding
2414 -+ * requests, then the process may have not yet
2415 -+ * issued its next request just because it is
2416 -+ * still waiting for the completion of some of
2417 -+			 * the still outstanding ones. So in this
2418 -+ * subcase we do not reduce its budget, on the
2419 -+			 * contrary, we increase it to possibly boost
2420 -+ * the throughput, as discussed in the
2421 -+ * comments to the BUDGET_TIMEOUT case.
2422 -+ */
2423 -+			if (bfqq->dispatched > 0) /* still outstanding reqs */
2424 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2425 -+ else {
2426 -+ if (budget > 5 * min_budget)
2427 -+ budget -= 4 * min_budget;
2428 -+ else
2429 -+ budget = min_budget;
2430 -+ }
2431 -+ break;
2432 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
2433 -+ /*
2434 -+ * We double the budget here because: 1) it
2435 -+ * gives the chance to boost the throughput if
2436 -+ * this is not a seeky process (which may have
2437 -+ * bumped into this timeout because of, e.g.,
2438 -+ * ZBR), 2) together with charge_full_budget
2439 -+ * it helps give seeky processes higher
2440 -+ * timestamps, and hence be served less
2441 -+ * frequently.
2442 -+ */
2443 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2444 -+ break;
2445 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2446 -+ /*
2447 -+ * The process still has backlog, and did not
2448 -+ * let either the budget timeout or the disk
2449 -+ * idling timeout expire. Hence it is not
2450 -+ * seeky, has a short thinktime and may be
2451 -+ * happy with a higher budget too. So
2452 -+ * definitely increase the budget of this good
2453 -+ * candidate to boost the disk throughput.
2454 -+ */
2455 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
2456 -+ break;
2457 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
2458 -+ /*
2459 -+ * Leave the budget unchanged.
2460 -+ */
2461 -+ default:
2462 -+ return;
2463 -+ }
2464 -+ } else /* async queue */
2465 -+		/* async queues always get the maximum possible budget
2466 -+ * (their ability to dispatch is limited by
2467 -+ * @bfqd->bfq_max_budget_async_rq).
2468 -+ */
2469 -+ budget = bfqd->bfq_max_budget;
2470 -+
2471 -+ bfqq->max_budget = budget;
2472 -+
2473 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2474 -+ bfqq->max_budget > bfqd->bfq_max_budget)
2475 -+ bfqq->max_budget = bfqd->bfq_max_budget;
2476 -+
2477 -+ /*
2478 -+ * Make sure that we have enough budget for the next request.
2479 -+ * Since the finish time of the bfqq must be kept in sync with
2480 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
2481 -+ * update.
2482 -+ */
2483 -+ next_rq = bfqq->next_rq;
2484 -+ if (next_rq != NULL)
2485 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2486 -+ bfq_serv_to_charge(next_rq, bfqq));
2487 -+ else
2488 -+ bfqq->entity.budget = bfqq->max_budget;
2489 -+
2490 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2491 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2492 -+ bfqq->entity.budget);
2493 -+}
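The feedback above reduces to a small set of per-reason updates on a sync queue's budget: shrink by four min-budgets when the queue was truly idle (or grow if requests are still outstanding), double on a budget timeout, quadruple on budget exhaustion, leave it unchanged otherwise, always clamping to the device-wide maximum. A self-contained sketch of those rules with made-up numbers (the enum and helper below are illustrative, not from the patch):

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

/* Illustrative sketch of the budget feedback described above. */
static unsigned long next_budget(unsigned long budget, enum reason r,
				 int outstanding, unsigned long max_b,
				 unsigned long min_b)
{
	switch (r) {
	case TOO_IDLE:
		if (outstanding)		/* completions still pending: grow */
			return budget * 2 > max_b ? max_b : budget * 2;
		return budget > 5 * min_b ? budget - 4 * min_b : min_b;
	case BUDGET_TIMEOUT:
		return budget * 2 > max_b ? max_b : budget * 2;
	case BUDGET_EXHAUSTED:
		return budget * 4 > max_b ? max_b : budget * 4;
	default:
		return budget;			/* NO_MORE_REQUESTS: unchanged */
	}
}

int main(void)
{
	unsigned long b = 4096, max_b = 16384, min_b = 512;

	printf("too idle, nothing outstanding: %lu -> %lu\n",
	       b, next_budget(b, TOO_IDLE, 0, max_b, min_b));
	printf("budget timeout:                %lu -> %lu\n",
	       b, next_budget(b, BUDGET_TIMEOUT, 0, max_b, min_b));
	printf("budget exhausted:              %lu -> %lu\n",
	       b, next_budget(b, BUDGET_EXHAUSTED, 0, max_b, min_b));
	return 0;
}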
2494 -+
2495 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2496 -+{
2497 -+ unsigned long max_budget;
2498 -+
2499 -+ /*
2500 -+ * The max_budget calculated when autotuning is equal to the
2501 -+	 * number of sectors transferred in timeout_sync at the
2502 -+ * estimated peak rate.
2503 -+ */
2504 -+ max_budget = (unsigned long)(peak_rate * 1000 *
2505 -+ timeout >> BFQ_RATE_SHIFT);
2506 -+
2507 -+ return max_budget;
2508 -+}
2509 -+
2510 -+/*
2511 -+ * In addition to updating the peak rate, checks whether the process
2512 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
2513 -+ * to the budget timeout, to reduce the amount of service provided to
2514 -+ * seeky processes, and hence reduce their chances to lower the
2515 -+ * throughput. See the code for more details.
2516 -+ */
2517 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2518 -+ int compensate, enum bfqq_expiration reason)
2519 -+{
2520 -+ u64 bw, usecs, expected, timeout;
2521 -+ ktime_t delta;
2522 -+ int update = 0;
2523 -+
2524 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2525 -+ return 0;
2526 -+
2527 -+ if (compensate)
2528 -+ delta = bfqd->last_idling_start;
2529 -+ else
2530 -+ delta = ktime_get();
2531 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
2532 -+ usecs = ktime_to_us(delta);
2533 -+
2534 -+ /* Don't trust short/unrealistic values. */
2535 -+ if (usecs < 100 || usecs >= LONG_MAX)
2536 -+ return 0;
2537 -+
2538 -+ /*
2539 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
2540 -+ * value to store the peak rate, in sectors per usec in fixed
2541 -+ * point math. We do so to have enough precision in the estimate
2542 -+ * and to avoid overflows.
2543 -+ */
2544 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2545 -+ do_div(bw, (unsigned long)usecs);
2546 -+
2547 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2548 -+
2549 -+ /*
2550 -+ * Use only long (> 20ms) intervals to filter out spikes for
2551 -+ * the peak rate estimation.
2552 -+ */
2553 -+ if (usecs > 20000) {
2554 -+ if (bw > bfqd->peak_rate ||
2555 -+ (!BFQQ_SEEKY(bfqq) &&
2556 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2557 -+ bfq_log(bfqd, "measured bw =%llu", bw);
2558 -+ /*
2559 -+ * To smooth oscillations use a low-pass filter with
2560 -+ * alpha=7/8, i.e.,
2561 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
2562 -+ */
2563 -+ do_div(bw, 8);
2564 -+ if (bw == 0)
2565 -+ return 0;
2566 -+ bfqd->peak_rate *= 7;
2567 -+ do_div(bfqd->peak_rate, 8);
2568 -+ bfqd->peak_rate += bw;
2569 -+ update = 1;
2570 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2571 -+ }
2572 -+
2573 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2574 -+
2575 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2576 -+ bfqd->peak_rate_samples++;
2577 -+
2578 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2579 -+ update && bfqd->bfq_user_max_budget == 0) {
2580 -+ bfqd->bfq_max_budget =
2581 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2582 -+ bfq_log(bfqd, "new max_budget=%lu",
2583 -+ bfqd->bfq_max_budget);
2584 -+ }
2585 -+ }
2586 -+
2587 -+ /*
2588 -+	 * If the process has been served for too short a time
2589 -+	 * interval to let its possible sequential accesses prevail over
2590 -+	 * the initial seek time needed to move the disk head onto the
2591 -+ * first sector it requested, then give the process a chance
2592 -+ * and for the moment return false.
2593 -+ */
2594 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2595 -+ return 0;
2596 -+
2597 -+ /*
2598 -+ * A process is considered ``slow'' (i.e., seeky, so that we
2599 -+ * cannot treat it fairly in the service domain, as it would
2600 -+	 * slow down the other processes too much) if, when a slice
2601 -+ * ends for whatever reason, it has received service at a
2602 -+ * rate that would not be high enough to complete the budget
2603 -+ * before the budget timeout expiration.
2604 -+ */
2605 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2606 -+
2607 -+ /*
2608 -+ * Caveat: processes doing IO in the slower disk zones will
2609 -+ * tend to be slow(er) even if not seeky. And the estimated
2610 -+ * peak rate will actually be an average over the disk
2611 -+ * surface. Hence, to not be too harsh with unlucky processes,
2612 -+ * we keep a budget/3 margin of safety before declaring a
2613 -+ * process slow.
2614 -+ */
2615 -+ return expected > (4 * bfqq->entity.budget) / 3;
2616 -+}
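The peak-rate update above is a fixed-point low-pass filter, new_rate = (7/8) * old_rate + (1/8) * measured_bw, applied only to sufficiently long observation intervals. A minimal sketch of just the smoothing step, with plain integers and made-up sample values (illustrative only, not from the patch):

#include <stdio.h>

/* Illustrative sketch: new_rate = (7/8) * old_rate + (1/8) * sample,
 * using integer arithmetic as the scheduler does. */
static unsigned long long lowpass(unsigned long long old_rate,
				  unsigned long long sample)
{
	return old_rate * 7 / 8 + sample / 8;
}

int main(void)
{
	unsigned long long rate = 1000;		/* e.g. fixed-point sectors/usec */
	unsigned long long samples[] = { 1200, 800, 1600 };

	for (int i = 0; i < 3; i++) {
		rate = lowpass(rate, samples[i]);
		printf("sample %llu -> smoothed rate %llu\n", samples[i], rate);
	}
	return 0;
}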
2617 -+
2618 -+/**
2619 -+ * bfq_bfqq_expire - expire a queue.
2620 -+ * @bfqd: device owning the queue.
2621 -+ * @bfqq: the queue to expire.
2622 -+ * @compensate: if true, compensate for the time spent idling.
2623 -+ * @reason: the reason causing the expiration.
2624 -+ *
2625 -+ *
2626 -+ * If the process associated with the queue is slow (i.e., seeky), or in
2627 -+ * case of budget timeout, or, finally, if it is async, we
2628 -+ * artificially charge it an entire budget (independently of the
2629 -+ * actual service it received). As a consequence, the queue will get
2630 -+ * higher timestamps than the correct ones upon reactivation, and
2631 -+ * hence it will be rescheduled as if it had received more service
2632 -+ * than what it actually received. In the end, this class of processes
2633 -+ * will receive less service in proportion to how slowly they consume
2634 -+ * their budgets (and hence how seriously they tend to lower the
2635 -+ * throughput).
2636 -+ *
2637 -+ * In contrast, when a queue expires because it has been idling for
2638 -+ * too long or because it exhausted its budget, we do not touch the
2639 -+ * amount of service it has received. Hence when the queue will be
2640 -+ * reactivated and its timestamps updated, the latter will be in sync
2641 -+ * with the actual service received by the queue until expiration.
2642 -+ *
2643 -+ * Charging a full budget to the first type of queues and the exact
2644 -+ * service to the others has the effect of using the WF2Q+ policy to
2645 -+ * schedule the former on a timeslice basis, without violating the
2646 -+ * service domain guarantees of the latter.
2647 -+ */
2648 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
2649 -+ struct bfq_queue *bfqq,
2650 -+ int compensate,
2651 -+ enum bfqq_expiration reason)
2652 -+{
2653 -+ int slow;
2654 -+ BUG_ON(bfqq != bfqd->active_queue);
2655 -+
2656 -+ /* Update disk peak rate for autotuning and check whether the
2657 -+ * process is slow (see bfq_update_peak_rate).
2658 -+ */
2659 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2660 -+
2661 -+ /*
2662 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
2663 -+ * and async queues, to favor sequential sync workloads.
2664 -+ *
2665 -+ * Processes doing IO in the slower disk zones will tend to be
2666 -+ * slow(er) even if not seeky. Hence, since the estimated peak
2667 -+ * rate is actually an average over the disk surface, these
2668 -+ * processes may timeout just for bad luck. To avoid punishing
2669 -+ * them we do not charge a full budget to a process that
2670 -+ * succeeded in consuming at least 2/3 of its budget.
2671 -+ */
2672 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2673 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2674 -+ bfq_bfqq_charge_full_budget(bfqq);
2675 -+
2676 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2677 -+ bfqq->last_rais_start_finish = jiffies;
2678 -+
2679 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2680 -+		if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
2681 -+ bfqq->soft_rt_next_start =
2682 -+ jiffies +
2683 -+ HZ * bfqq->entity.service /
2684 -+ bfqd->bfq_raising_max_softrt_rate;
2685 -+ else
2686 -+ bfqq->soft_rt_next_start = -1; /* infinity */
2687 -+ }
2688 -+ bfq_log_bfqq(bfqd, bfqq,
2689 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2690 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2691 -+
2692 -+ /* Increase, decrease or leave budget unchanged according to reason */
2693 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2694 -+ __bfq_bfqq_expire(bfqd, bfqq);
2695 -+}
2696 -+
2697 -+/*
2698 -+ * Budget timeout is not implemented through a dedicated timer, but
2699 -+ * just checked on request arrivals and completions, as well as on
2700 -+ * idle timer expirations.
2701 -+ */
2702 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2703 -+{
2704 -+ if (bfq_bfqq_budget_new(bfqq))
2705 -+ return 0;
2706 -+
2707 -+ if (time_before(jiffies, bfqq->budget_timeout))
2708 -+ return 0;
2709 -+
2710 -+ return 1;
2711 -+}
2712 -+
2713 -+/*
2714 -+ * If we expire a queue that is waiting for the arrival of a new
2715 -+ * request, we may prevent the fictitious timestamp backshifting that
2716 -+ * allows the guarantees of the queue to be preserved (see [1] for
2717 -+ * this tricky aspect). Hence we return true only if this condition
2718 -+ * does not hold, or if the queue is slow enough to deserve only to be
2719 -+ * kicked off for preserving a high throughput.
2720 -+*/
2721 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2722 -+{
2723 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
2724 -+ "may_budget_timeout: wr %d left %d timeout %d",
2725 -+ bfq_bfqq_wait_request(bfqq),
2726 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2727 -+ bfq_bfqq_budget_timeout(bfqq));
2728 -+
2729 -+ return (!bfq_bfqq_wait_request(bfqq) ||
2730 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2731 -+ &&
2732 -+ bfq_bfqq_budget_timeout(bfqq);
2733 -+}
2734 -+
2735 -+/*
2736 -+ * If the active queue is empty, but it is sync and either of the following
2737 -+ * conditions holds, then: 1) the queue must remain active and cannot be
2738 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
2739 -+ * of a new request for the queue. The conditions are:
2740 -+ * - the device is rotational and not performing NCQ, and the queue has its
2741 -+ * idle window set (in this case, waiting for a new request for the queue
2742 -+ * is likely to boost the disk throughput);
2743 -+ * - the queue is weight-raised (waiting for the request is necessary for
2744 -+ * providing the queue with fairness and latency guarantees).
2745 -+ */
2746 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
2747 -+ int budg_timeout)
2748 -+{
2749 -+ struct bfq_data *bfqd = bfqq->bfqd;
2750 -+
2751 -+ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
2752 -+ bfqd->bfq_slice_idle != 0 &&
2753 -+ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
2754 -+ !blk_queue_nonrot(bfqd->queue))
2755 -+ || bfqq->raising_coeff > 1) &&
2756 -+ (bfqd->rq_in_driver == 0 ||
2757 -+ budg_timeout ||
2758 -+ bfqq->raising_coeff > 1) &&
2759 -+ !bfq_close_cooperator(bfqd, bfqq) &&
2760 -+ (!bfq_bfqq_coop(bfqq) ||
2761 -+ !bfq_bfqq_some_coop_idle(bfqq)) &&
2762 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
2763 -+}
2764 -+
2765 -+/*
2766 -+ * Select a queue for service. If we have a current active queue,
2767 -+ * check whether to continue servicing it, or retrieve and set a new one.
2768 -+ */
2769 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2770 -+{
2771 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
2772 -+ struct request *next_rq;
2773 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2774 -+ int budg_timeout;
2775 -+
2776 -+ bfqq = bfqd->active_queue;
2777 -+ if (bfqq == NULL)
2778 -+ goto new_queue;
2779 -+
2780 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
2781 -+
2782 -+ /*
2783 -+ * If another queue has a request waiting within our mean seek
2784 -+ * distance, let it run. The expire code will check for close
2785 -+ * cooperators and put the close queue at the front of the
2786 -+ * service tree. If possible, merge the expiring queue with the
2787 -+ * new bfqq.
2788 -+ */
2789 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
2790 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
2791 -+ bfq_setup_merge(bfqq, new_bfqq);
2792 -+
2793 -+ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
2794 -+ if (budg_timeout &&
2795 -+ !bfq_bfqq_must_idle(bfqq, budg_timeout))
2796 -+ goto expire;
2797 -+
2798 -+ next_rq = bfqq->next_rq;
2799 -+ /*
2800 -+ * If bfqq has requests queued and it has enough budget left to
2801 -+ * serve them, keep the queue, otherwise expire it.
2802 -+ */
2803 -+ if (next_rq != NULL) {
2804 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
2805 -+ bfq_bfqq_budget_left(bfqq)) {
2806 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2807 -+ goto expire;
2808 -+ } else {
2809 -+ /*
2810 -+ * The idle timer may be pending because we may not
2811 -+ * disable disk idling even when a new request arrives
2812 -+ */
2813 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
2814 -+ /*
2815 -+ * If we get here: 1) at least a new request
2816 -+ * has arrived but we have not disabled the
2817 -+ * timer because the request was too small,
2818 -+ * 2) then the block layer has unplugged the
2819 -+ * device, causing the dispatch to be invoked.
2820 -+ *
2821 -+ * Since the device is unplugged, now the
2822 -+ * requests are probably large enough to
2823 -+ * provide a reasonable throughput.
2824 -+ * So we disable idling.
2825 -+ */
2826 -+ bfq_clear_bfqq_wait_request(bfqq);
2827 -+ del_timer(&bfqd->idle_slice_timer);
2828 -+ }
2829 -+ if (new_bfqq == NULL)
2830 -+ goto keep_queue;
2831 -+ else
2832 -+ goto expire;
2833 -+ }
2834 -+ }
2835 -+
2836 -+ /*
2837 -+ * No requests pending. If there is no cooperator, and the active
2838 -+ * queue still has requests in flight or is idling for a new request,
2839 -+ * then keep it.
2840 -+ */
2841 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
2842 -+ (bfqq->dispatched != 0 &&
2843 -+ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
2844 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
2845 -+ bfqq = NULL;
2846 -+ goto keep_queue;
2847 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
2848 -+ /*
2849 -+ * Expiring the queue because there is a close cooperator,
2850 -+ * cancel timer.
2851 -+ */
2852 -+ bfq_clear_bfqq_wait_request(bfqq);
2853 -+ del_timer(&bfqd->idle_slice_timer);
2854 -+ }
2855 -+
2856 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2857 -+expire:
2858 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2859 -+new_queue:
2860 -+ bfqq = bfq_set_active_queue(bfqd, new_bfqq);
2861 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
2862 -+ bfqq != NULL ? bfqq->pid : 0);
2863 -+keep_queue:
2864 -+ return bfqq;
2865 -+}
2866 -+
2867 -+static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2868 -+{
2869 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
2870 -+ struct bfq_entity *entity = &bfqq->entity;
2871 -+
2872 -+ bfq_log_bfqq(bfqd, bfqq,
2873 -+ "raising period dur %u/%u msec, "
2874 -+ "old raising coeff %u, w %d(%d)",
2875 -+ jiffies_to_msecs(jiffies -
2876 -+ bfqq->last_rais_start_finish),
2877 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
2878 -+ bfqq->raising_coeff,
2879 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
2880 -+
2881 -+ BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
2882 -+ entity->orig_weight * bfqq->raising_coeff);
2883 -+		if (entity->ioprio_changed)
2884 -+ bfq_log_bfqq(bfqd, bfqq,
2885 -+ "WARN: pending prio change");
2886 -+ /*
2887 -+ * If too much time has elapsed from the beginning
2888 -+ * of this weight-raising period and process is not soft
2889 -+		 * of this weight-raising period and the process is not soft
2890 -+		 * real-time, stop it.
2891 -+ if (jiffies - bfqq->last_rais_start_finish >
2892 -+ bfqq->raising_cur_max_time) {
2893 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
2894 -+ bfqq->soft_rt_next_start < jiffies;
2895 -+
2896 -+ bfqq->last_rais_start_finish = jiffies;
2897 -+ if (soft_rt)
2898 -+ bfqq->raising_cur_max_time =
2899 -+ bfqd->bfq_raising_rt_max_time;
2900 -+ else {
2901 -+ bfq_log_bfqq(bfqd, bfqq,
2902 -+					"wrais ending at %llu msec, "
2903 -+ "rais_max_time %u",
2904 -+ bfqq->last_rais_start_finish,
2905 -+ jiffies_to_msecs(bfqq->
2906 -+ raising_cur_max_time));
2907 -+ bfq_bfqq_end_raising(bfqq);
2908 -+ __bfq_entity_update_weight_prio(
2909 -+ bfq_entity_service_tree(entity),
2910 -+ entity);
2911 -+ }
2912 -+ }
2913 -+ }
2914 -+}
2915 -+
2916 -+/*
2917 -+ * Dispatch one request from bfqq, moving it to the request queue
2918 -+ * dispatch list.
2919 -+ */
2920 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
2921 -+ struct bfq_queue *bfqq)
2922 -+{
2923 -+ int dispatched = 0;
2924 -+ struct request *rq;
2925 -+ unsigned long service_to_charge;
2926 -+
2927 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2928 -+
2929 -+ /* Follow expired path, else get first next available. */
2930 -+ rq = bfq_check_fifo(bfqq);
2931 -+ if (rq == NULL)
2932 -+ rq = bfqq->next_rq;
2933 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
2934 -+
2935 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
2936 -+ /*
2937 -+ * This may happen if the next rq is chosen
2938 -+ * in fifo order instead of sector order.
2939 -+ * The budget is properly dimensioned
2940 -+ * to be always sufficient to serve the next request
2941 -+ * only if it is chosen in sector order. The reason is
2942 -+		 * that it would be quite inefficient and of little use
2943 -+ * to always make sure that the budget is large enough
2944 -+ * to serve even the possible next rq in fifo order.
2945 -+ * In fact, requests are seldom served in fifo order.
2946 -+ *
2947 -+ * Expire the queue for budget exhaustion, and
2948 -+ * make sure that the next act_budget is enough
2949 -+ * to serve the next request, even if it comes
2950 -+ * from the fifo expired path.
2951 -+ */
2952 -+ bfqq->next_rq = rq;
2953 -+ /*
2954 -+		 * Since this dispatch failed, make sure that
2955 -+		 * a new one will be performed.
2956 -+ */
2957 -+ if (!bfqd->rq_in_driver)
2958 -+ bfq_schedule_dispatch(bfqd);
2959 -+ goto expire;
2960 -+ }
2961 -+
2962 -+ /* Finally, insert request into driver dispatch list. */
2963 -+ bfq_bfqq_served(bfqq, service_to_charge);
2964 -+ bfq_dispatch_insert(bfqd->queue, rq);
2965 -+
2966 -+ update_raising_data(bfqd, bfqq);
2967 -+
2968 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
2969 -+ "budg left %lu",
2970 -+ blk_rq_sectors(rq),
2971 -+ (long long unsigned)blk_rq_pos(rq),
2972 -+ bfq_bfqq_budget_left(bfqq));
2973 -+
2974 -+ dispatched++;
2975 -+
2976 -+ if (bfqd->active_bic == NULL) {
2977 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
2978 -+ bfqd->active_bic = RQ_BIC(rq);
2979 -+ }
2980 -+
2981 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
2982 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
2983 -+ bfq_class_idle(bfqq)))
2984 -+ goto expire;
2985 -+
2986 -+ return dispatched;
2987 -+
2988 -+expire:
2989 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
2990 -+ return dispatched;
2991 -+}
2992 -+
2993 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
2994 -+{
2995 -+ int dispatched = 0;
2996 -+
2997 -+ while (bfqq->next_rq != NULL) {
2998 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
2999 -+ dispatched++;
3000 -+ }
3001 -+
3002 -+ BUG_ON(!list_empty(&bfqq->fifo));
3003 -+ return dispatched;
3004 -+}
3005 -+
3006 -+/*
3007 -+ * Drain our current requests. Used for barriers and when switching
3008 -+ * io schedulers on-the-fly.
3009 -+ */
3010 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
3011 -+{
3012 -+ struct bfq_queue *bfqq, *n;
3013 -+ struct bfq_service_tree *st;
3014 -+ int dispatched = 0;
3015 -+
3016 -+ bfqq = bfqd->active_queue;
3017 -+ if (bfqq != NULL)
3018 -+ __bfq_bfqq_expire(bfqd, bfqq);
3019 -+
3020 -+ /*
3021 -+ * Loop through classes, and be careful to leave the scheduler
3022 -+ * in a consistent state, as feedback mechanisms and vtime
3023 -+ * updates cannot be disabled during the process.
3024 -+ */
3025 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3026 -+ st = bfq_entity_service_tree(&bfqq->entity);
3027 -+
3028 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3029 -+ bfqq->max_budget = bfq_max_budget(bfqd);
3030 -+
3031 -+ bfq_forget_idle(st);
3032 -+ }
3033 -+
3034 -+ BUG_ON(bfqd->busy_queues != 0);
3035 -+
3036 -+ return dispatched;
3037 -+}
3038 -+
3039 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
3040 -+{
3041 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3042 -+ struct bfq_queue *bfqq;
3043 -+ int max_dispatch;
3044 -+
3045 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3046 -+ if (bfqd->busy_queues == 0)
3047 -+ return 0;
3048 -+
3049 -+ if (unlikely(force))
3050 -+ return bfq_forced_dispatch(bfqd);
3051 -+
3052 -+	if ((bfqq = bfq_select_queue(bfqd)) == NULL)
3053 -+ return 0;
3054 -+
3055 -+ max_dispatch = bfqd->bfq_quantum;
3056 -+ if (bfq_class_idle(bfqq))
3057 -+ max_dispatch = 1;
3058 -+
3059 -+ if (!bfq_bfqq_sync(bfqq))
3060 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
3061 -+
3062 -+ if (bfqq->dispatched >= max_dispatch) {
3063 -+ if (bfqd->busy_queues > 1)
3064 -+ return 0;
3065 -+ if (bfqq->dispatched >= 4 * max_dispatch)
3066 -+ return 0;
3067 -+ }
3068 -+
3069 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3070 -+ return 0;
3071 -+
3072 -+ bfq_clear_bfqq_wait_request(bfqq);
3073 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3074 -+
3075 -+	if (!bfq_dispatch_request(bfqd, bfqq))
3076 -+ return 0;
3077 -+
3078 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
3079 -+			" (max_disp %d)", bfqq->pid, max_dispatch);
3080 -+
3081 -+ return 1;
3082 -+}
3083 -+
3084 -+/*
3085 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
3086 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
3087 -+ *
3088 -+ * Queue lock must be held here.
3089 -+ */
3090 -+static void bfq_put_queue(struct bfq_queue *bfqq)
3091 -+{
3092 -+ struct bfq_data *bfqd = bfqq->bfqd;
3093 -+
3094 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3095 -+
3096 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3097 -+ atomic_read(&bfqq->ref));
3098 -+ if (!atomic_dec_and_test(&bfqq->ref))
3099 -+ return;
3100 -+
3101 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3102 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3103 -+ BUG_ON(bfqq->entity.tree != NULL);
3104 -+ BUG_ON(bfq_bfqq_busy(bfqq));
3105 -+ BUG_ON(bfqd->active_queue == bfqq);
3106 -+
3107 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3108 -+
3109 -+ kmem_cache_free(bfq_pool, bfqq);
3110 -+}
3111 -+
3112 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
3113 -+{
3114 -+ struct bfq_queue *__bfqq, *next;
3115 -+
3116 -+ /*
3117 -+ * If this queue was scheduled to merge with another queue, be
3118 -+ * sure to drop the reference taken on that queue (and others in
3119 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3120 -+ */
3121 -+ __bfqq = bfqq->new_bfqq;
3122 -+ while (__bfqq) {
3123 -+ if (__bfqq == bfqq) {
3124 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
3125 -+ break;
3126 -+ }
3127 -+ next = __bfqq->new_bfqq;
3128 -+ bfq_put_queue(__bfqq);
3129 -+ __bfqq = next;
3130 -+ }
3131 -+}
3132 -+
3133 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3134 -+{
3135 -+ if (bfqq == bfqd->active_queue) {
3136 -+ __bfq_bfqq_expire(bfqd, bfqq);
3137 -+ bfq_schedule_dispatch(bfqd);
3138 -+ }
3139 -+
3140 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3141 -+ atomic_read(&bfqq->ref));
3142 -+
3143 -+ bfq_put_cooperator(bfqq);
3144 -+
3145 -+ bfq_put_queue(bfqq);
3146 -+}
3147 -+
3148 -+static void bfq_init_icq(struct io_cq *icq)
3149 -+{
3150 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3151 -+
3152 -+ bic->ttime.last_end_request = jiffies;
3153 -+}
3154 -+
3155 -+static void bfq_exit_icq(struct io_cq *icq)
3156 -+{
3157 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3158 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
3159 -+
3160 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
3161 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3162 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
3163 -+ }
3164 -+
3165 -+ if (bic->bfqq[BLK_RW_SYNC]) {
3166 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3167 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
3168 -+ }
3169 -+}
3170 -+
3171 -+/*
3172 -+ * Update the entity prio values; note that the new values will not
3173 -+ * be used until the next (re)activation.
3174 -+ */
3175 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3176 -+{
3177 -+ struct task_struct *tsk = current;
3178 -+ int ioprio_class;
3179 -+
3180 -+ if (!bfq_bfqq_prio_changed(bfqq))
3181 -+ return;
3182 -+
3183 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3184 -+ switch (ioprio_class) {
3185 -+ default:
3186 -+ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
3187 -+ case IOPRIO_CLASS_NONE:
3188 -+ /*
3189 -+ * No prio set, inherit CPU scheduling settings.
3190 -+ */
3191 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3192 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3193 -+ break;
3194 -+ case IOPRIO_CLASS_RT:
3195 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3196 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3197 -+ break;
3198 -+ case IOPRIO_CLASS_BE:
3199 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3200 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3201 -+ break;
3202 -+ case IOPRIO_CLASS_IDLE:
3203 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3204 -+ bfqq->entity.new_ioprio = 7;
3205 -+ bfq_clear_bfqq_idle_window(bfqq);
3206 -+ break;
3207 -+ }
3208 -+
3209 -+ bfqq->entity.ioprio_changed = 1;
3210 -+
3211 -+ /*
3212 -+ * Keep track of original prio settings in case we have to temporarily
3213 -+ * elevate the priority of this queue.
3214 -+ */
3215 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3216 -+ bfq_clear_bfqq_prio_changed(bfqq);
3217 -+}
3218 -+
3219 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3220 -+{
3221 -+ struct bfq_data *bfqd;
3222 -+ struct bfq_queue *bfqq, *new_bfqq;
3223 -+ struct bfq_group *bfqg;
3224 -+ unsigned long uninitialized_var(flags);
3225 -+ int ioprio = bic->icq.ioc->ioprio;
3226 -+
3227 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
3228 -+ /*
3229 -+ * This condition may trigger on a newly created bic; be sure to drop the
3230 -+ * lock before returning.
3231 -+ */
3232 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3233 -+ goto out;
3234 -+
3235 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
3236 -+ if (bfqq != NULL) {
3237 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3238 -+ sched_data);
3239 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3240 -+ GFP_ATOMIC);
3241 -+ if (new_bfqq != NULL) {
3242 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3243 -+ bfq_log_bfqq(bfqd, bfqq,
3244 -+ "changed_ioprio: bfqq %p %d",
3245 -+ bfqq, atomic_read(&bfqq->ref));
3246 -+ bfq_put_queue(bfqq);
3247 -+ }
3248 -+ }
3249 -+
3250 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
3251 -+ if (bfqq != NULL)
3252 -+ bfq_mark_bfqq_prio_changed(bfqq);
3253 -+
3254 -+ bic->ioprio = ioprio;
3255 -+
3256 -+out:
3257 -+ bfq_put_bfqd_unlock(bfqd, &flags);
3258 -+}
3259 -+
3260 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3261 -+ pid_t pid, int is_sync)
3262 -+{
3263 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3264 -+ INIT_LIST_HEAD(&bfqq->fifo);
3265 -+
3266 -+ atomic_set(&bfqq->ref, 0);
3267 -+ bfqq->bfqd = bfqd;
3268 -+
3269 -+ bfq_mark_bfqq_prio_changed(bfqq);
3270 -+
3271 -+ if (is_sync) {
3272 -+ if (!bfq_class_idle(bfqq))
3273 -+ bfq_mark_bfqq_idle_window(bfqq);
3274 -+ bfq_mark_bfqq_sync(bfqq);
3275 -+ }
3276 -+
3277 -+ /* Tentative initial value to trade off between throughput and latency */
3278 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3279 -+ bfqq->pid = pid;
3280 -+
3281 -+ bfqq->raising_coeff = 1;
3282 -+ bfqq->last_rais_start_finish = 0;
3283 -+ bfqq->soft_rt_next_start = -1;
3284 -+}
3285 -+
3286 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3287 -+ struct bfq_group *bfqg,
3288 -+ int is_sync,
3289 -+ struct bfq_io_cq *bic,
3290 -+ gfp_t gfp_mask)
3291 -+{
3292 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3293 -+
3294 -+retry:
3295 -+ /* bic always exists here */
3296 -+ bfqq = bic_to_bfqq(bic, is_sync);
3297 -+
3298 -+ /*
3299 -+ * Always try a new alloc if we fall back to the OOM bfqq
3300 -+ * originally, since it should just be a temporary situation.
3301 -+ */
3302 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3303 -+ bfqq = NULL;
3304 -+ if (new_bfqq != NULL) {
3305 -+ bfqq = new_bfqq;
3306 -+ new_bfqq = NULL;
3307 -+ } else if (gfp_mask & __GFP_WAIT) {
3308 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3309 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3310 -+ gfp_mask | __GFP_ZERO,
3311 -+ bfqd->queue->node);
3312 -+ spin_lock_irq(bfqd->queue->queue_lock);
3313 -+ if (new_bfqq != NULL)
3314 -+ goto retry;
3315 -+ } else {
3316 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
3317 -+ gfp_mask | __GFP_ZERO,
3318 -+ bfqd->queue->node);
3319 -+ }
3320 -+
3321 -+ if (bfqq != NULL) {
3322 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3323 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
3324 -+ } else {
3325 -+ bfqq = &bfqd->oom_bfqq;
3326 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3327 -+ }
3328 -+
3329 -+ bfq_init_prio_data(bfqq, bic);
3330 -+ bfq_init_entity(&bfqq->entity, bfqg);
3331 -+ }
3332 -+
3333 -+ if (new_bfqq != NULL)
3334 -+ kmem_cache_free(bfq_pool, new_bfqq);
3335 -+
3336 -+ return bfqq;
3337 -+}
3338 -+
3339 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3340 -+ struct bfq_group *bfqg,
3341 -+ int ioprio_class, int ioprio)
3342 -+{
3343 -+ switch (ioprio_class) {
3344 -+ case IOPRIO_CLASS_RT:
3345 -+ return &bfqg->async_bfqq[0][ioprio];
3346 -+ case IOPRIO_CLASS_NONE:
3347 -+ ioprio = IOPRIO_NORM;
3348 -+ /* fall through */
3349 -+ case IOPRIO_CLASS_BE:
3350 -+ return &bfqg->async_bfqq[1][ioprio];
3351 -+ case IOPRIO_CLASS_IDLE:
3352 -+ return &bfqg->async_idle_bfqq;
3353 -+ default:
3354 -+ BUG();
3355 -+ }
3356 -+}
3357 -+
3358 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3359 -+ struct bfq_group *bfqg, int is_sync,
3360 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
3361 -+{
3362 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3363 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3364 -+ struct bfq_queue **async_bfqq = NULL;
3365 -+ struct bfq_queue *bfqq = NULL;
3366 -+
3367 -+ if (!is_sync) {
3368 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3369 -+ ioprio);
3370 -+ bfqq = *async_bfqq;
3371 -+ }
3372 -+
3373 -+ if (bfqq == NULL)
3374 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3375 -+
3376 -+ /*
3377 -+ * Pin the queue now that it's allocated, scheduler exit will prune it.
3378 -+ */
3379 -+ if (!is_sync && *async_bfqq == NULL) {
3380 -+ atomic_inc(&bfqq->ref);
3381 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3382 -+ bfqq, atomic_read(&bfqq->ref));
3383 -+ *async_bfqq = bfqq;
3384 -+ }
3385 -+
3386 -+ atomic_inc(&bfqq->ref);
3387 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3388 -+ atomic_read(&bfqq->ref));
3389 -+ return bfqq;
3390 -+}
3391 -+
3392 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3393 -+ struct bfq_io_cq *bic)
3394 -+{
3395 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3396 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3397 -+
3398 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3399 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3400 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
3401 -+}
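
The three assignments above keep a fixed-point running average of the think time: each update retains 7/8 of the previous value and folds in 1/8 of the new sample, with everything scaled by 256 so the final integer division in ttime_mean stays precise. A minimal userspace sketch of the same decay rule (the ewma names and the sample values are made up for illustration, not part of the patch):

    #include <stdio.h>

    /* Fixed-point 7/8-decay average, mirroring the ttime bookkeeping above. */
    struct ewma {
            unsigned long samples;  /* converges towards 256 */
            unsigned long total;    /* running sum, scaled by 256 */
    };

    static void ewma_add(struct ewma *e, unsigned long sample)
    {
            e->samples = (7 * e->samples + 256) / 8;
            e->total = (7 * e->total + 256 * sample) / 8;
    }

    static unsigned long ewma_mean(const struct ewma *e)
    {
            return (e->total + 128) / e->samples;   /* rounded division */
    }

    int main(void)
    {
            struct ewma e = { 0, 0 };
            unsigned long v[] = { 4, 4, 4, 40 };    /* think times, e.g. in jiffies */
            int i;

            for (i = 0; i < 4; i++) {
                    ewma_add(&e, v[i]);
                    printf("sample %lu -> mean %lu\n", v[i], ewma_mean(&e));
            }
            return 0;
    }

One large outlier moves the mean only gradually, which helps explain why a single slow request does not immediately flip the idle-window decision further below.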
3402 -+
3403 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
3404 -+ struct bfq_queue *bfqq,
3405 -+ struct request *rq)
3406 -+{
3407 -+ sector_t sdist;
3408 -+ u64 total;
3409 -+
3410 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
3411 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3412 -+ else
3413 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3414 -+
3415 -+ /*
3416 -+ * Don't allow the seek distance to get too large from the
3417 -+ * odd fragment, pagein, etc.
3418 -+ */
3419 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3420 -+ sdist = 0;
3421 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
3422 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3423 -+ else
3424 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3425 -+
3426 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3427 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3428 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
3429 -+ do_div(total, bfqq->seek_samples);
3430 -+ if (bfq_bfqq_coop(bfqq)) {
3431 -+ /*
3432 -+ * If the mean seektime increases for a (non-seeky) shared
3433 -+ * queue, some cooperator is likely to be idling too much.
3434 -+ * On the contrary, if it decreases, some cooperator has
3435 -+ * probably woken up.
3436 -+ *
3437 -+ */
3438 -+ if ((sector_t)total < bfqq->seek_mean)
3439 -+ bfq_mark_bfqq_some_coop_idle(bfqq);
3440 -+ else if ((sector_t)total > bfqq->seek_mean)
3441 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
3442 -+ }
3443 -+ bfqq->seek_mean = (sector_t)total;
3444 -+
3445 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3446 -+ (u64)bfqq->seek_mean);
3447 -+}
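
Before the seek distance enters the same kind of 7/8 average, the code above caps each raw sample so one odd fragment or pagein cannot inflate the mean. A standalone sketch of just that cap (the helper names are invented; the constants are the ones visible above):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t sector_t;

    static sector_t min_sectors(sector_t a, sector_t b)
    {
            return a < b ? a : b;
    }

    /* Cap a raw seek-distance sample depending on how many samples exist. */
    static sector_t clamp_seek_sample(sector_t sdist, unsigned int samples,
                                      sector_t mean)
    {
            if (samples == 0)               /* first request, not really a seek */
                    return 0;
            if (samples <= 60)              /* early samples get a generous cap */
                    return min_sectors(sdist, mean * 4 + 2 * 1024 * 1024);
            return min_sectors(sdist, mean * 4 + 2 * 1024 * 64);
    }

    int main(void)
    {
            /* A 10M-sector jump right after the first sample is capped near 2M. */
            printf("%llu\n", (unsigned long long)
                   clamp_seek_sample(10ULL * 1024 * 1024, 1, 128));
            return 0;
    }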
3448 -+
3449 -+/*
3450 -+ * Disable idle window if the process thinks too long or seeks so much that
3451 -+ * it doesn't matter.
3452 -+ */
3453 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
3454 -+ struct bfq_queue *bfqq,
3455 -+ struct bfq_io_cq *bic)
3456 -+{
3457 -+ int enable_idle;
3458 -+
3459 -+ /* Don't idle for async or idle io prio class. */
3460 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3461 -+ return;
3462 -+
3463 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
3464 -+
3465 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3466 -+ bfqd->bfq_slice_idle == 0 ||
3467 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3468 -+ bfqq->raising_coeff == 1))
3469 -+ enable_idle = 0;
3470 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3471 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3472 -+ bfqq->raising_coeff == 1)
3473 -+ enable_idle = 0;
3474 -+ else
3475 -+ enable_idle = 1;
3476 -+ }
3477 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3478 -+ enable_idle);
3479 -+
3480 -+ if (enable_idle)
3481 -+ bfq_mark_bfqq_idle_window(bfqq);
3482 -+ else
3483 -+ bfq_clear_bfqq_idle_window(bfqq);
3484 -+}
3485 -+
3486 -+/*
3487 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
3488 -+ * something we should do about it.
3489 -+ */
3490 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3491 -+ struct request *rq)
3492 -+{
3493 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
3494 -+
3495 -+ if (rq->cmd_flags & REQ_META)
3496 -+ bfqq->meta_pending++;
3497 -+
3498 -+ bfq_update_io_thinktime(bfqd, bic);
3499 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
3500 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3501 -+ !BFQQ_SEEKY(bfqq))
3502 -+ bfq_update_idle_window(bfqd, bfqq, bic);
3503 -+
3504 -+ bfq_log_bfqq(bfqd, bfqq,
3505 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3506 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3507 -+ (long long unsigned)bfqq->seek_mean);
3508 -+
3509 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3510 -+
3511 -+ if (bfqq == bfqd->active_queue) {
3512 -+ /*
3513 -+ * If there is just this request queued and the request
3514 -+ * is small, just exit.
3515 -+ * In this way, if the disk is being idled to wait for a new
3516 -+ * request from the active queue, we avoid unplugging the
3517 -+ * device now.
3518 -+ *
3519 -+ * By doing so, we spare the disk from being committed
3520 -+ * to serve just a small request. On the contrary, we wait for
3521 -+ * the block layer to decide when to unplug the device:
3522 -+ * hopefully, new requests will be merged to this
3523 -+ * one quickly, then the device will be unplugged
3524 -+ * and larger requests will be dispatched.
3525 -+ */
3526 -+ if (bfqq->queued[rq_is_sync(rq)] == 1 &&
3527 -+ blk_rq_sectors(rq) < 32) {
3528 -+ return;
3529 -+ }
3530 -+ if (bfq_bfqq_wait_request(bfqq)) {
3531 -+ /*
3532 -+ * If we are waiting for a request for this queue, let
3533 -+ * it rip immediately and flag that we must not expire
3534 -+ * this queue just now.
3535 -+ */
3536 -+ bfq_clear_bfqq_wait_request(bfqq);
3537 -+ del_timer(&bfqd->idle_slice_timer);
3538 -+ /*
3539 -+ * Here we can safely expire the queue, in
3540 -+ * case of budget timeout, without wasting
3541 -+ * guarantees
3542 -+ */
3543 -+ if (bfq_bfqq_budget_timeout(bfqq))
3544 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
3545 -+ BFQ_BFQQ_BUDGET_TIMEOUT);
3546 -+ __blk_run_queue(bfqd->queue);
3547 -+ }
3548 -+ }
3549 -+}
3550 -+
3551 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
3552 -+{
3553 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3554 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3555 -+
3556 -+ assert_spin_locked(bfqd->queue->queue_lock);
3557 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3558 -+
3559 -+ bfq_add_rq_rb(rq);
3560 -+
3561 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3562 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
3563 -+
3564 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
3565 -+}
3566 -+
3567 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
3568 -+{
3569 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3570 -+ bfqd->rq_in_driver);
3571 -+
3572 -+ if (bfqd->hw_tag == 1)
3573 -+ return;
3574 -+
3575 -+ /*
3576 -+ * This sample is valid if the number of outstanding requests
3577 -+ * is large enough to allow a queueing behavior. Note that the
3578 -+ * sum is not exact, as it's not taking into account deactivated
3579 -+ * requests.
3580 -+ */
3581 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3582 -+ return;
3583 -+
3584 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3585 -+ return;
3586 -+
3587 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3588 -+ bfqd->max_rq_in_driver = 0;
3589 -+ bfqd->hw_tag_samples = 0;
3590 -+}
3591 -+
3592 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
3593 -+{
3594 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3595 -+ struct bfq_data *bfqd = bfqq->bfqd;
3596 -+ const int sync = rq_is_sync(rq);
3597 -+
3598 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3599 -+ blk_rq_sectors(rq), sync);
3600 -+
3601 -+ bfq_update_hw_tag(bfqd);
3602 -+
3603 -+ WARN_ON(!bfqd->rq_in_driver);
3604 -+ WARN_ON(!bfqq->dispatched);
3605 -+ bfqd->rq_in_driver--;
3606 -+ bfqq->dispatched--;
3607 -+
3608 -+ if (bfq_bfqq_sync(bfqq))
3609 -+ bfqd->sync_flight--;
3610 -+
3611 -+ if (sync)
3612 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3613 -+
3614 -+ /*
3615 -+ * If this is the active queue, check if it needs to be expired,
3616 -+ * or if we want to idle in case it has no pending requests.
3617 -+ */
3618 -+ if (bfqd->active_queue == bfqq) {
3619 -+ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
3620 -+ if (bfq_bfqq_budget_new(bfqq))
3621 -+ bfq_set_budget_timeout(bfqd);
3622 -+
3623 -+ /* Idling is disabled also for cooperation issues:
3624 -+ * 1) there is a close cooperator for the queue, or
3625 -+ * 2) the queue is shared and some cooperator is likely
3626 -+ * to be idle (in this case, by not arming the idle timer,
3627 -+ * we try to slow down the queue, to prevent the zones
3628 -+ * of the disk accessed by the active cooperators from becoming
3629 -+ * too distant from the zone that will be accessed by the
3630 -+ * currently idle cooperators)
3631 -+ */
3632 -+ if (bfq_bfqq_must_idle(bfqq, budg_timeout))
3633 -+ bfq_arm_slice_timer(bfqd);
3634 -+ else if (budg_timeout)
3635 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3636 -+ }
3637 -+
3638 -+ if (!bfqd->rq_in_driver)
3639 -+ bfq_schedule_dispatch(bfqd);
3640 -+}
3641 -+
3642 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3643 -+{
3644 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3645 -+ bfq_clear_bfqq_must_alloc(bfqq);
3646 -+ return ELV_MQUEUE_MUST;
3647 -+ }
3648 -+
3649 -+ return ELV_MQUEUE_MAY;
3650 -+}
3651 -+
3652 -+static int bfq_may_queue(struct request_queue *q, int rw)
3653 -+{
3654 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3655 -+ struct task_struct *tsk = current;
3656 -+ struct bfq_io_cq *bic;
3657 -+ struct bfq_queue *bfqq;
3658 -+
3659 -+ /*
3660 -+ * Don't force setup of a queue from here, as a call to may_queue
3661 -+ * does not necessarily imply that a request actually will be queued.
3662 -+ * So just lookup a possibly existing queue, or return 'may queue'
3663 -+ * if that fails.
3664 -+ */
3665 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3666 -+ if (bic == NULL)
3667 -+ return ELV_MQUEUE_MAY;
3668 -+
3669 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3670 -+ if (bfqq != NULL) {
3671 -+ bfq_init_prio_data(bfqq, bic);
3672 -+
3673 -+ return __bfq_may_queue(bfqq);
3674 -+ }
3675 -+
3676 -+ return ELV_MQUEUE_MAY;
3677 -+}
3678 -+
3679 -+/*
3680 -+ * Queue lock held here.
3681 -+ */
3682 -+static void bfq_put_request(struct request *rq)
3683 -+{
3684 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3685 -+
3686 -+ if (bfqq != NULL) {
3687 -+ const int rw = rq_data_dir(rq);
3688 -+
3689 -+ BUG_ON(!bfqq->allocated[rw]);
3690 -+ bfqq->allocated[rw]--;
3691 -+
3692 -+ rq->elv.priv[0] = NULL;
3693 -+ rq->elv.priv[1] = NULL;
3694 -+
3695 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3696 -+ bfqq, atomic_read(&bfqq->ref));
3697 -+ bfq_put_queue(bfqq);
3698 -+ }
3699 -+}
3700 -+
3701 -+static struct bfq_queue *
3702 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3703 -+ struct bfq_queue *bfqq)
3704 -+{
3705 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3706 -+ (long unsigned)bfqq->new_bfqq->pid);
3707 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3708 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3709 -+ bfq_put_queue(bfqq);
3710 -+ return bic_to_bfqq(bic, 1);
3711 -+}
3712 -+
3713 -+/*
3714 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3715 -+ * was the last process referring to said bfqq.
3716 -+ */
3717 -+static struct bfq_queue *
3718 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3719 -+{
3720 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3721 -+ if (bfqq_process_refs(bfqq) == 1) {
3722 -+ bfqq->pid = current->pid;
3723 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
3724 -+ bfq_clear_bfqq_coop(bfqq);
3725 -+ bfq_clear_bfqq_split_coop(bfqq);
3726 -+ return bfqq;
3727 -+ }
3728 -+
3729 -+ bic_set_bfqq(bic, NULL, 1);
3730 -+
3731 -+ bfq_put_cooperator(bfqq);
3732 -+
3733 -+ bfq_put_queue(bfqq);
3734 -+ return NULL;
3735 -+}
3736 -+
3737 -+/*
3738 -+ * Allocate bfq data structures associated with this request.
3739 -+ */
3740 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
3741 -+ struct bio *bio, gfp_t gfp_mask)
3742 -+{
3743 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3744 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3745 -+ const int rw = rq_data_dir(rq);
3746 -+ const int is_sync = rq_is_sync(rq);
3747 -+ struct bfq_queue *bfqq;
3748 -+ struct bfq_group *bfqg;
3749 -+ unsigned long flags;
3750 -+
3751 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
3752 -+
3753 -+ bfq_changed_ioprio(bic);
3754 -+
3755 -+ spin_lock_irqsave(q->queue_lock, flags);
3756 -+
3757 -+ if (bic == NULL)
3758 -+ goto queue_fail;
3759 -+
3760 -+ bfqg = bfq_bic_update_cgroup(bic);
3761 -+
3762 -+new_queue:
3763 -+ bfqq = bic_to_bfqq(bic, is_sync);
3764 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3765 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3766 -+ bic_set_bfqq(bic, bfqq, is_sync);
3767 -+ } else {
3768 -+ /*
3769 -+ * If the queue was seeky for too long, break it apart.
3770 -+ */
3771 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
3772 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
3773 -+ bfqq = bfq_split_bfqq(bic, bfqq);
3774 -+ if (!bfqq)
3775 -+ goto new_queue;
3776 -+ }
3777 -+
3778 -+ /*
3779 -+ * Check to see if this queue is scheduled to merge with
3780 -+ * another closely cooperating queue. The merging of queues
3781 -+ * happens here as it must be done in process context.
3782 -+ * The reference on new_bfqq was taken in merge_bfqqs.
3783 -+ */
3784 -+ if (bfqq->new_bfqq != NULL)
3785 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
3786 -+ }
3787 -+
3788 -+ bfqq->allocated[rw]++;
3789 -+ atomic_inc(&bfqq->ref);
3790 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
3791 -+ atomic_read(&bfqq->ref));
3792 -+
3793 -+ rq->elv.priv[0] = bic;
3794 -+ rq->elv.priv[1] = bfqq;
3795 -+
3796 -+ spin_unlock_irqrestore(q->queue_lock, flags);
3797 -+
3798 -+ return 0;
3799 -+
3800 -+queue_fail:
3801 -+ bfq_schedule_dispatch(bfqd);
3802 -+ spin_unlock_irqrestore(q->queue_lock, flags);
3803 -+
3804 -+ return 1;
3805 -+}
3806 -+
3807 -+static void bfq_kick_queue(struct work_struct *work)
3808 -+{
3809 -+ struct bfq_data *bfqd =
3810 -+ container_of(work, struct bfq_data, unplug_work);
3811 -+ struct request_queue *q = bfqd->queue;
3812 -+
3813 -+ spin_lock_irq(q->queue_lock);
3814 -+ __blk_run_queue(q);
3815 -+ spin_unlock_irq(q->queue_lock);
3816 -+}
3817 -+
3818 -+/*
3819 -+ * Handler of the expiration of the timer running if the active_queue
3820 -+ * is idling inside its time slice.
3821 -+ */
3822 -+static void bfq_idle_slice_timer(unsigned long data)
3823 -+{
3824 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
3825 -+ struct bfq_queue *bfqq;
3826 -+ unsigned long flags;
3827 -+ enum bfqq_expiration reason;
3828 -+
3829 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
3830 -+
3831 -+ bfqq = bfqd->active_queue;
3832 -+ /*
3833 -+ * Theoretical race here: active_queue can be NULL or different
3834 -+ * from the queue that was idling if the timer handler spins on
3835 -+ * the queue_lock and a new request arrives for the current
3836 -+ * queue and there is a full dispatch cycle that changes the
3837 -+ * active_queue. This can hardly happen, but in the worst case
3838 -+ * we just expire a queue too early.
3839 -+ */
3840 -+ if (bfqq != NULL) {
3841 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
3842 -+ if (bfq_bfqq_budget_timeout(bfqq))
3843 -+ /*
3844 -+ * Also here the queue can be safely expired
3845 -+ * for budget timeout without wasting
3846 -+ * guarantees
3847 -+ */
3848 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3849 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3850 -+ /*
3851 -+ * The queue may not be empty upon timer expiration,
3852 -+ * because we may not disable the timer when the first
3853 -+ * request of the active queue arrives during
3854 -+ * disk idling
3855 -+ */
3856 -+ reason = BFQ_BFQQ_TOO_IDLE;
3857 -+ else
3858 -+ goto schedule_dispatch;
3859 -+
3860 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
3861 -+ }
3862 -+
3863 -+schedule_dispatch:
3864 -+ bfq_schedule_dispatch(bfqd);
3865 -+
3866 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
3867 -+}
3868 -+
3869 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
3870 -+{
3871 -+ del_timer_sync(&bfqd->idle_slice_timer);
3872 -+ cancel_work_sync(&bfqd->unplug_work);
3873 -+}
3874 -+
3875 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3876 -+ struct bfq_queue **bfqq_ptr)
3877 -+{
3878 -+ struct bfq_group *root_group = bfqd->root_group;
3879 -+ struct bfq_queue *bfqq = *bfqq_ptr;
3880 -+
3881 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3882 -+ if (bfqq != NULL) {
3883 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
3884 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3885 -+ bfqq, atomic_read(&bfqq->ref));
3886 -+ bfq_put_queue(bfqq);
3887 -+ *bfqq_ptr = NULL;
3888 -+ }
3889 -+}
3890 -+
3891 -+/*
3892 -+ * Release all the bfqg references to its async queues. If we are
3893 -+ * deallocating the group these queues may still contain requests, so
3894 -+ * we reparent them to the root cgroup (i.e., the only one that will
3895 -+ * exist for sure until all the requests on a device are gone).
3896 -+ */
3897 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3898 -+{
3899 -+ int i, j;
3900 -+
3901 -+ for (i = 0; i < 2; i++)
3902 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
3903 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3904 -+
3905 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3906 -+}
3907 -+
3908 -+static void bfq_exit_queue(struct elevator_queue *e)
3909 -+{
3910 -+ struct bfq_data *bfqd = e->elevator_data;
3911 -+ struct request_queue *q = bfqd->queue;
3912 -+ struct bfq_queue *bfqq, *n;
3913 -+
3914 -+ bfq_shutdown_timer_wq(bfqd);
3915 -+
3916 -+ spin_lock_irq(q->queue_lock);
3917 -+
3918 -+ BUG_ON(bfqd->active_queue != NULL);
3919 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3920 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
3921 -+
3922 -+ bfq_disconnect_groups(bfqd);
3923 -+ spin_unlock_irq(q->queue_lock);
3924 -+
3925 -+ bfq_shutdown_timer_wq(bfqd);
3926 -+
3927 -+ synchronize_rcu();
3928 -+
3929 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3930 -+
3931 -+ bfq_free_root_group(bfqd);
3932 -+ kfree(bfqd);
3933 -+}
3934 -+
3935 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
3936 -+{
3937 -+ struct bfq_group *bfqg;
3938 -+ struct bfq_data *bfqd;
3939 -+ struct elevator_queue *eq;
3940 -+
3941 -+ eq = elevator_alloc(q, e);
3942 -+ if (eq == NULL)
3943 -+ return -ENOMEM;
3944 -+
3945 -+ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3946 -+ if (bfqd == NULL) {
3947 -+ kobject_put(&eq->kobj);
3948 -+ return -ENOMEM;
3949 -+ }
3950 -+ eq->elevator_data = bfqd;
3951 -+
3952 -+ /*
3953 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
3954 -+ * Grab a permanent reference to it, so that the normal code flow
3955 -+ * will not attempt to free it.
3956 -+ */
3957 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
3958 -+ atomic_inc(&bfqd->oom_bfqq.ref);
3959 -+
3960 -+ bfqd->queue = q;
3961 -+
3962 -+ spin_lock_irq(q->queue_lock);
3963 -+ q->elevator = eq;
3964 -+ spin_unlock_irq(q->queue_lock);
3965 -+
3966 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
3967 -+ if (bfqg == NULL) {
3968 -+ kfree(bfqd);
3969 -+ kobject_put(&eq->kobj);
3970 -+ return -ENOMEM;
3971 -+ }
3972 -+
3973 -+ bfqd->root_group = bfqg;
3974 -+
3975 -+ init_timer(&bfqd->idle_slice_timer);
3976 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
3977 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
3978 -+
3979 -+ bfqd->rq_pos_tree = RB_ROOT;
3980 -+
3981 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
3982 -+
3983 -+ INIT_LIST_HEAD(&bfqd->active_list);
3984 -+ INIT_LIST_HEAD(&bfqd->idle_list);
3985 -+
3986 -+ bfqd->hw_tag = -1;
3987 -+
3988 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
3989 -+
3990 -+ bfqd->bfq_quantum = bfq_quantum;
3991 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
3992 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
3993 -+ bfqd->bfq_back_max = bfq_back_max;
3994 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
3995 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
3996 -+ bfqd->bfq_class_idle_last_service = 0;
3997 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
3998 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
3999 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4000 -+
4001 -+ bfqd->low_latency = true;
4002 -+
4003 -+ bfqd->bfq_raising_coeff = 20;
4004 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4005 -+ bfqd->bfq_raising_max_time = 0;
4006 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4007 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4008 -+ bfqd->bfq_raising_max_softrt_rate = 7000;
4009 -+
4010 -+ /* Initially estimate the device's peak rate as the reference rate */
4011 -+ if (blk_queue_nonrot(bfqd->queue)) {
4012 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
4013 -+ bfqd->peak_rate = R_nonrot;
4014 -+ } else {
4015 -+ bfqd->RT_prod = R_rot * T_rot;
4016 -+ bfqd->peak_rate = R_rot;
4017 -+ }
4018 -+
4019 -+ return 0;
4020 -+}
4021 -+
4022 -+static void bfq_slab_kill(void)
4023 -+{
4024 -+ if (bfq_pool != NULL)
4025 -+ kmem_cache_destroy(bfq_pool);
4026 -+}
4027 -+
4028 -+static int __init bfq_slab_setup(void)
4029 -+{
4030 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4031 -+ if (bfq_pool == NULL)
4032 -+ return -ENOMEM;
4033 -+ return 0;
4034 -+}
4035 -+
4036 -+static ssize_t bfq_var_show(unsigned int var, char *page)
4037 -+{
4038 -+ return sprintf(page, "%d\n", var);
4039 -+}
4040 -+
4041 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4042 -+{
4043 -+ unsigned long new_val;
4044 -+ int ret = strict_strtoul(page, 10, &new_val);
4045 -+
4046 -+ if (ret == 0)
4047 -+ *var = new_val;
4048 -+
4049 -+ return count;
4050 -+}
4051 -+
4052 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4053 -+{
4054 -+ struct bfq_data *bfqd = e->elevator_data;
4055 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4056 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4057 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4058 -+}
4059 -+
4060 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4061 -+{
4062 -+ struct bfq_queue *bfqq;
4063 -+ struct bfq_data *bfqd = e->elevator_data;
4064 -+ ssize_t num_char = 0;
4065 -+
4066 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4067 -+ bfqd->queued);
4068 -+
4069 -+ spin_lock_irq(bfqd->queue->queue_lock);
4070 -+
4071 -+ num_char += sprintf(page + num_char, "Active:\n");
4072 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4073 -+ num_char += sprintf(page + num_char,
4074 -+ "pid%d: weight %hu, nr_queued %d %d,"
4075 -+ " dur %d/%u\n",
4076 -+ bfqq->pid,
4077 -+ bfqq->entity.weight,
4078 -+ bfqq->queued[0],
4079 -+ bfqq->queued[1],
4080 -+ jiffies_to_msecs(jiffies -
4081 -+ bfqq->last_rais_start_finish),
4082 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4083 -+ }
4084 -+
4085 -+ num_char += sprintf(page + num_char, "Idle:\n");
4086 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4087 -+ num_char += sprintf(page + num_char,
4088 -+ "pid%d: weight %hu, dur %d/%u\n",
4089 -+ bfqq->pid,
4090 -+ bfqq->entity.weight,
4091 -+ jiffies_to_msecs(jiffies -
4092 -+ bfqq->last_rais_start_finish),
4093 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4094 -+ }
4095 -+
4096 -+ spin_unlock_irq(bfqd->queue->queue_lock);
4097 -+
4098 -+ return num_char;
4099 -+}
4100 -+
4101 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4102 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4103 -+{ \
4104 -+ struct bfq_data *bfqd = e->elevator_data; \
4105 -+ unsigned int __data = __VAR; \
4106 -+ if (__CONV) \
4107 -+ __data = jiffies_to_msecs(__data); \
4108 -+ return bfq_var_show(__data, (page)); \
4109 -+}
4110 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4111 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4112 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4113 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4114 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4115 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4116 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4117 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4118 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4119 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4120 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4121 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4122 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4123 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4124 -+ 1);
4125 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4126 -+ bfqd->bfq_raising_min_inter_arr_async,
4127 -+ 1);
4128 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4129 -+ bfqd->bfq_raising_max_softrt_rate, 0);
4130 -+#undef SHOW_FUNCTION
4131 -+
4132 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4133 -+static ssize_t \
4134 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4135 -+{ \
4136 -+ struct bfq_data *bfqd = e->elevator_data; \
4137 -+ unsigned long uninitialized_var(__data); \
4138 -+ int ret = bfq_var_store(&__data, (page), count); \
4139 -+ if (__data < (MIN)) \
4140 -+ __data = (MIN); \
4141 -+ else if (__data > (MAX)) \
4142 -+ __data = (MAX); \
4143 -+ if (__CONV) \
4144 -+ *(__PTR) = msecs_to_jiffies(__data); \
4145 -+ else \
4146 -+ *(__PTR) = __data; \
4147 -+ return ret; \
4148 -+}
4149 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4150 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4151 -+ INT_MAX, 1);
4152 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4153 -+ INT_MAX, 1);
4154 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4155 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4156 -+ INT_MAX, 0);
4157 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4158 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4159 -+ 1, INT_MAX, 0);
4160 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4161 -+ INT_MAX, 1);
4162 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4163 -+ INT_MAX, 0);
4164 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4165 -+ INT_MAX, 1);
4166 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4167 -+ INT_MAX, 1);
4168 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
4169 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4170 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4171 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4172 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4173 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4174 -+#undef STORE_FUNCTION
4175 -+
4176 -+/* do nothing for the moment */
4177 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
4178 -+ const char *page, size_t count)
4179 -+{
4180 -+ return count;
4181 -+}
4182 -+
4183 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4184 -+{
4185 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4186 -+
4187 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4188 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4189 -+ else
4190 -+ return bfq_default_max_budget;
4191 -+}
4192 -+
4193 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4194 -+ const char *page, size_t count)
4195 -+{
4196 -+ struct bfq_data *bfqd = e->elevator_data;
4197 -+ unsigned long uninitialized_var(__data);
4198 -+ int ret = bfq_var_store(&__data, (page), count);
4199 -+
4200 -+ if (__data == 0)
4201 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4202 -+ else {
4203 -+ if (__data > INT_MAX)
4204 -+ __data = INT_MAX;
4205 -+ bfqd->bfq_max_budget = __data;
4206 -+ }
4207 -+
4208 -+ bfqd->bfq_user_max_budget = __data;
4209 -+
4210 -+ return ret;
4211 -+}
4212 -+
4213 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4214 -+ const char *page, size_t count)
4215 -+{
4216 -+ struct bfq_data *bfqd = e->elevator_data;
4217 -+ unsigned long uninitialized_var(__data);
4218 -+ int ret = bfq_var_store(&__data, (page), count);
4219 -+
4220 -+ if (__data < 1)
4221 -+ __data = 1;
4222 -+ else if (__data > INT_MAX)
4223 -+ __data = INT_MAX;
4224 -+
4225 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4226 -+ if (bfqd->bfq_user_max_budget == 0)
4227 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4228 -+
4229 -+ return ret;
4230 -+}
4231 -+
4232 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4233 -+ const char *page, size_t count)
4234 -+{
4235 -+ struct bfq_data *bfqd = e->elevator_data;
4236 -+ unsigned long uninitialized_var(__data);
4237 -+ int ret = bfq_var_store(&__data, (page), count);
4238 -+
4239 -+ if (__data > 1)
4240 -+ __data = 1;
4241 -+ if (__data == 0 && bfqd->low_latency != 0)
4242 -+ bfq_end_raising(bfqd);
4243 -+ bfqd->low_latency = __data;
4244 -+
4245 -+ return ret;
4246 -+}
4247 -+
4248 -+#define BFQ_ATTR(name) \
4249 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4250 -+
4251 -+static struct elv_fs_entry bfq_attrs[] = {
4252 -+ BFQ_ATTR(quantum),
4253 -+ BFQ_ATTR(fifo_expire_sync),
4254 -+ BFQ_ATTR(fifo_expire_async),
4255 -+ BFQ_ATTR(back_seek_max),
4256 -+ BFQ_ATTR(back_seek_penalty),
4257 -+ BFQ_ATTR(slice_idle),
4258 -+ BFQ_ATTR(max_budget),
4259 -+ BFQ_ATTR(max_budget_async_rq),
4260 -+ BFQ_ATTR(timeout_sync),
4261 -+ BFQ_ATTR(timeout_async),
4262 -+ BFQ_ATTR(low_latency),
4263 -+ BFQ_ATTR(raising_coeff),
4264 -+ BFQ_ATTR(raising_max_time),
4265 -+ BFQ_ATTR(raising_rt_max_time),
4266 -+ BFQ_ATTR(raising_min_idle_time),
4267 -+ BFQ_ATTR(raising_min_inter_arr_async),
4268 -+ BFQ_ATTR(raising_max_softrt_rate),
4269 -+ BFQ_ATTR(weights),
4270 -+ __ATTR_NULL
4271 -+};
4272 -+
4273 -+static struct elevator_type iosched_bfq = {
4274 -+ .ops = {
4275 -+ .elevator_merge_fn = bfq_merge,
4276 -+ .elevator_merged_fn = bfq_merged_request,
4277 -+ .elevator_merge_req_fn = bfq_merged_requests,
4278 -+ .elevator_allow_merge_fn = bfq_allow_merge,
4279 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
4280 -+ .elevator_add_req_fn = bfq_insert_request,
4281 -+ .elevator_activate_req_fn = bfq_activate_request,
4282 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
4283 -+ .elevator_completed_req_fn = bfq_completed_request,
4284 -+ .elevator_former_req_fn = elv_rb_former_request,
4285 -+ .elevator_latter_req_fn = elv_rb_latter_request,
4286 -+ .elevator_init_icq_fn = bfq_init_icq,
4287 -+ .elevator_exit_icq_fn = bfq_exit_icq,
4288 -+ .elevator_set_req_fn = bfq_set_request,
4289 -+ .elevator_put_req_fn = bfq_put_request,
4290 -+ .elevator_may_queue_fn = bfq_may_queue,
4291 -+ .elevator_init_fn = bfq_init_queue,
4292 -+ .elevator_exit_fn = bfq_exit_queue,
4293 -+ },
4294 -+ .icq_size = sizeof(struct bfq_io_cq),
4295 -+ .icq_align = __alignof__(struct bfq_io_cq),
4296 -+ .elevator_attrs = bfq_attrs,
4297 -+ .elevator_name = "bfq",
4298 -+ .elevator_owner = THIS_MODULE,
4299 -+};
4300 -+
4301 -+static int __init bfq_init(void)
4302 -+{
4303 -+ /*
4304 -+ * Can be 0 on HZ < 1000 setups.
4305 -+ */
4306 -+ if (bfq_slice_idle == 0)
4307 -+ bfq_slice_idle = 1;
4308 -+
4309 -+ if (bfq_timeout_async == 0)
4310 -+ bfq_timeout_async = 1;
4311 -+
4312 -+ if (bfq_slab_setup())
4313 -+ return -ENOMEM;
4314 -+
4315 -+ elv_register(&iosched_bfq);
4316 -+
4317 -+ return 0;
4318 -+}
4319 -+
4320 -+static void __exit bfq_exit(void)
4321 -+{
4322 -+ elv_unregister(&iosched_bfq);
4323 -+ bfq_slab_kill();
4324 -+}
4325 -+
4326 -+module_init(bfq_init);
4327 -+module_exit(bfq_exit);
4328 -+
4329 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4330 -+MODULE_LICENSE("GPL");
4331 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4332 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4333 -new file mode 100644
4334 -index 0000000..03f8061
4335 ---- /dev/null
4336 -+++ b/block/bfq-sched.c
4337 -@@ -0,0 +1,1072 @@
4338 -+/*
4339 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
4340 -+ *
4341 -+ * Based on ideas and code from CFQ:
4342 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4343 -+ *
4344 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4345 -+ * Paolo Valente <paolo.valente@×××××××.it>
4346 -+ *
4347 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4348 -+ */
4349 -+
4350 -+#ifdef CONFIG_CGROUP_BFQIO
4351 -+#define for_each_entity(entity) \
4352 -+ for (; entity != NULL; entity = entity->parent)
4353 -+
4354 -+#define for_each_entity_safe(entity, parent) \
4355 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4356 -+
4357 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4358 -+ int extract,
4359 -+ struct bfq_data *bfqd);
4360 -+
4361 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4362 -+{
4363 -+ struct bfq_entity *bfqg_entity;
4364 -+ struct bfq_group *bfqg;
4365 -+ struct bfq_sched_data *group_sd;
4366 -+
4367 -+ BUG_ON(next_active == NULL);
4368 -+
4369 -+ group_sd = next_active->sched_data;
4370 -+
4371 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4372 -+ /*
4373 -+ * bfq_group's my_entity field is not NULL only if the group
4374 -+ * is not the root group. We must not touch the root entity
4375 -+ * as it must never become an active entity.
4376 -+ */
4377 -+ bfqg_entity = bfqg->my_entity;
4378 -+ if (bfqg_entity != NULL)
4379 -+ bfqg_entity->budget = next_active->budget;
4380 -+}
4381 -+
4382 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
4383 -+{
4384 -+ struct bfq_entity *next_active;
4385 -+
4386 -+ if (sd->active_entity != NULL)
4387 -+ /* will update/requeue at the end of service */
4388 -+ return 0;
4389 -+
4390 -+ /*
4391 -+ * NOTE: this can be improved in many ways, such as returning
4392 -+ * 1 (and thus propagating upwards the update) only when the
4393 -+ * budget changes, or caching the bfqq that will be scheduled
4394 -+ * next from this subtree. For now we worry more about
4395 -+ * correctness than about performance...
4396 -+ */
4397 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4398 -+ sd->next_active = next_active;
4399 -+
4400 -+ if (next_active != NULL)
4401 -+ bfq_update_budget(next_active);
4402 -+
4403 -+ return 1;
4404 -+}
4405 -+
4406 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4407 -+ struct bfq_entity *entity)
4408 -+{
4409 -+ BUG_ON(sd->next_active != entity);
4410 -+}
4411 -+#else
4412 -+#define for_each_entity(entity) \
4413 -+ for (; entity != NULL; entity = NULL)
4414 -+
4415 -+#define for_each_entity_safe(entity, parent) \
4416 -+ for (parent = NULL; entity != NULL; entity = parent)
4417 -+
4418 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4419 -+{
4420 -+ return 0;
4421 -+}
4422 -+
4423 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4424 -+ struct bfq_entity *entity)
4425 -+{
4426 -+}
4427 -+
4428 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4429 -+{
4430 -+}
4431 -+#endif
4432 -+
4433 -+/*
4434 -+ * Shift for timestamp calculations. This actually limits the maximum
4435 -+ * service allowed in one timestamp delta (small shift values increase it),
4436 -+ * the maximum total weight that can be used for the queues in the system
4437 -+ * (big shift values increase it), and the period of virtual time wraparounds.
4438 -+ */
4439 -+#define WFQ_SERVICE_SHIFT 22
4440 -+
4441 -+/**
4442 -+ * bfq_gt - compare two timestamps.
4443 -+ * @a: first ts.
4444 -+ * @b: second ts.
4445 -+ *
4446 -+ * Return @a > @b, dealing with wrapping correctly.
4447 -+ */
4448 -+static inline int bfq_gt(u64 a, u64 b)
4449 -+{
4450 -+ return (s64)(a - b) > 0;
4451 -+}
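
The cast to s64 is what makes this comparison safe across wraparound of the unsigned timestamps: as long as two values are less than 2^63 apart, the sign of their difference still says which one is logically later, even after the counter has wrapped. A tiny userspace check of that property (ts_gt is just an illustrative stand-in for bfq_gt):

    #include <stdio.h>
    #include <stdint.h>

    static int ts_gt(uint64_t a, uint64_t b)
    {
            return (int64_t)(a - b) > 0;
    }

    int main(void)
    {
            uint64_t before_wrap = UINT64_MAX - 5;  /* about to wrap */
            uint64_t after_wrap = 10;               /* already wrapped */

            /* A plain '>' gets this wrong; the signed difference does not. */
            printf("plain  : %d\n", after_wrap > before_wrap);       /* 0 */
            printf("ts_gt  : %d\n", ts_gt(after_wrap, before_wrap)); /* 1 */
            return 0;
    }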
4452 -+
4453 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4454 -+{
4455 -+ struct bfq_queue *bfqq = NULL;
4456 -+
4457 -+ BUG_ON(entity == NULL);
4458 -+
4459 -+ if (entity->my_sched_data == NULL)
4460 -+ bfqq = container_of(entity, struct bfq_queue, entity);
4461 -+
4462 -+ return bfqq;
4463 -+}
4464 -+
4465 -+
4466 -+/**
4467 -+ * bfq_delta - map service into the virtual time domain.
4468 -+ * @service: amount of service.
4469 -+ * @weight: scale factor (weight of an entity or weight sum).
4470 -+ */
4471 -+static inline u64 bfq_delta(unsigned long service,
4472 -+ unsigned long weight)
4473 -+{
4474 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4475 -+
4476 -+ do_div(d, weight);
4477 -+ return d;
4478 -+}
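
bfq_delta() is the core of the timestamping: the service is scaled by 2^22 (WFQ_SERVICE_SHIFT) and divided by the weight, so for the same number of sectors a heavier entity advances less in virtual time, which is how larger weights translate into larger shares. A short worked example with invented numbers (not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define WFQ_SERVICE_SHIFT 22

    static uint64_t delta(unsigned long service, unsigned long weight)
    {
            return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
    }

    int main(void)
    {
            /* Same 8 units of service, weights 1 and 4 (both values made up). */
            printf("weight 1: %llu\n", (unsigned long long)delta(8, 1)); /* 33554432 */
            printf("weight 4: %llu\n", (unsigned long long)delta(8, 4)); /*  8388608 */
            return 0;
    }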
4479 -+
4480 -+/**
4481 -+ * bfq_calc_finish - assign the finish time to an entity.
4482 -+ * @entity: the entity to act upon.
4483 -+ * @service: the service to be charged to the entity.
4484 -+ */
4485 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
4486 -+ unsigned long service)
4487 -+{
4488 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4489 -+
4490 -+ BUG_ON(entity->weight == 0);
4491 -+
4492 -+ entity->finish = entity->start +
4493 -+ bfq_delta(service, entity->weight);
4494 -+
4495 -+ if (bfqq != NULL) {
4496 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4497 -+ "calc_finish: serv %lu, w %d",
4498 -+ service, entity->weight);
4499 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4500 -+ "calc_finish: start %llu, finish %llu, delta %llu",
4501 -+ entity->start, entity->finish,
4502 -+ bfq_delta(service, entity->weight));
4503 -+ }
4504 -+}
4505 -+
4506 -+/**
4507 -+ * bfq_entity_of - get an entity from a node.
4508 -+ * @node: the node field of the entity.
4509 -+ *
4510 -+ * Convert a node pointer to the corresponding entity. This is used only
4511 -+ * to simplify the logic of some functions and not as the generic
4512 -+ * conversion mechanism because, e.g., in the tree walking functions,
4513 -+ * the check for a %NULL value would be redundant.
4514 -+ */
4515 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4516 -+{
4517 -+ struct bfq_entity *entity = NULL;
4518 -+
4519 -+ if (node != NULL)
4520 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
4521 -+
4522 -+ return entity;
4523 -+}
4524 -+
4525 -+/**
4526 -+ * bfq_extract - remove an entity from a tree.
4527 -+ * @root: the tree root.
4528 -+ * @entity: the entity to remove.
4529 -+ */
4530 -+static inline void bfq_extract(struct rb_root *root,
4531 -+ struct bfq_entity *entity)
4532 -+{
4533 -+ BUG_ON(entity->tree != root);
4534 -+
4535 -+ entity->tree = NULL;
4536 -+ rb_erase(&entity->rb_node, root);
4537 -+}
4538 -+
4539 -+/**
4540 -+ * bfq_idle_extract - extract an entity from the idle tree.
4541 -+ * @st: the service tree of the owning @entity.
4542 -+ * @entity: the entity being removed.
4543 -+ */
4544 -+static void bfq_idle_extract(struct bfq_service_tree *st,
4545 -+ struct bfq_entity *entity)
4546 -+{
4547 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4548 -+ struct rb_node *next;
4549 -+
4550 -+ BUG_ON(entity->tree != &st->idle);
4551 -+
4552 -+ if (entity == st->first_idle) {
4553 -+ next = rb_next(&entity->rb_node);
4554 -+ st->first_idle = bfq_entity_of(next);
4555 -+ }
4556 -+
4557 -+ if (entity == st->last_idle) {
4558 -+ next = rb_prev(&entity->rb_node);
4559 -+ st->last_idle = bfq_entity_of(next);
4560 -+ }
4561 -+
4562 -+ bfq_extract(&st->idle, entity);
4563 -+
4564 -+ if (bfqq != NULL)
4565 -+ list_del(&bfqq->bfqq_list);
4566 -+}
4567 -+
4568 -+/**
4569 -+ * bfq_insert - generic tree insertion.
4570 -+ * @root: tree root.
4571 -+ * @entity: entity to insert.
4572 -+ *
4573 -+ * This is used for the idle and the active tree, since they are both
4574 -+ * ordered by finish time.
4575 -+ */
4576 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4577 -+{
4578 -+ struct bfq_entity *entry;
4579 -+ struct rb_node **node = &root->rb_node;
4580 -+ struct rb_node *parent = NULL;
4581 -+
4582 -+ BUG_ON(entity->tree != NULL);
4583 -+
4584 -+ while (*node != NULL) {
4585 -+ parent = *node;
4586 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
4587 -+
4588 -+ if (bfq_gt(entry->finish, entity->finish))
4589 -+ node = &parent->rb_left;
4590 -+ else
4591 -+ node = &parent->rb_right;
4592 -+ }
4593 -+
4594 -+ rb_link_node(&entity->rb_node, parent, node);
4595 -+ rb_insert_color(&entity->rb_node, root);
4596 -+
4597 -+ entity->tree = root;
4598 -+}
4599 -+
4600 -+/**
4601 -+ * bfq_update_min - update the min_start field of an entity.
4602 -+ * @entity: the entity to update.
4603 -+ * @node: one of its children.
4604 -+ *
4605 -+ * This function is called when @entity may store an invalid value for
4606 -+ * min_start due to updates to the active tree. The function assumes
4607 -+ * that the subtree rooted at @node (which may be its left or its right
4608 -+ * child) has a valid min_start value.
4609 -+ */
4610 -+static inline void bfq_update_min(struct bfq_entity *entity,
4611 -+ struct rb_node *node)
4612 -+{
4613 -+ struct bfq_entity *child;
4614 -+
4615 -+ if (node != NULL) {
4616 -+ child = rb_entry(node, struct bfq_entity, rb_node);
4617 -+ if (bfq_gt(entity->min_start, child->min_start))
4618 -+ entity->min_start = child->min_start;
4619 -+ }
4620 -+}
4621 -+
4622 -+/**
4623 -+ * bfq_update_active_node - recalculate min_start.
4624 -+ * @node: the node to update.
4625 -+ *
4626 -+ * @node may have changed position or one of its children may have moved,
4627 -+ * this function updates its min_start value. The left and right subtrees
4628 -+ * are assumed to hold a correct min_start value.
4629 -+ */
4630 -+static inline void bfq_update_active_node(struct rb_node *node)
4631 -+{
4632 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4633 -+
4634 -+ entity->min_start = entity->start;
4635 -+ bfq_update_min(entity, node->rb_right);
4636 -+ bfq_update_min(entity, node->rb_left);
4637 -+}
4638 -+
4639 -+/**
4640 -+ * bfq_update_active_tree - update min_start for the whole active tree.
4641 -+ * @node: the starting node.
4642 -+ *
4643 -+ * @node must be the deepest modified node after an update. This function
4644 -+ * updates its min_start using the values held by its children, assuming
4645 -+ * that they did not change, and then updates all the nodes that may have
4646 -+ * changed in the path to the root. The only nodes that may have changed
4647 -+ * are the ones in the path or their siblings.
4648 -+ */
4649 -+static void bfq_update_active_tree(struct rb_node *node)
4650 -+{
4651 -+ struct rb_node *parent;
4652 -+
4653 -+up:
4654 -+ bfq_update_active_node(node);
4655 -+
4656 -+ parent = rb_parent(node);
4657 -+ if (parent == NULL)
4658 -+ return;
4659 -+
4660 -+ if (node == parent->rb_left && parent->rb_right != NULL)
4661 -+ bfq_update_active_node(parent->rb_right);
4662 -+ else if (parent->rb_left != NULL)
4663 -+ bfq_update_active_node(parent->rb_left);
4664 -+
4665 -+ node = parent;
4666 -+ goto up;
4667 -+}
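
Together, bfq_update_active_node() and bfq_update_active_tree() maintain the augmented key of the active tree: every node caches the minimum start time found in its subtree, which is what later allows the eligible entity with the smallest finish time to be located in logarithmic time. A toy illustration of that cached-minimum idea on a plain binary tree (struct tnode is made up; the real code walks an rb-tree and compares with the wrap-safe bfq_gt()):

    #include <stdio.h>
    #include <stddef.h>

    struct tnode {
            unsigned long long start, min_start;
            struct tnode *left, *right;
    };

    /* Recompute the cached minimum from the node itself and its children. */
    static void update_min_start(struct tnode *n)
    {
            n->min_start = n->start;
            if (n->left && n->left->min_start < n->min_start)
                    n->min_start = n->left->min_start;
            if (n->right && n->right->min_start < n->min_start)
                    n->min_start = n->right->min_start;
    }

    int main(void)
    {
            struct tnode l = { 30, 30, NULL, NULL };
            struct tnode r = { 10, 10, NULL, NULL };
            struct tnode root = { 20, 20, &l, &r };

            update_min_start(&root);
            printf("root.min_start = %llu\n", root.min_start);      /* 10 */
            return 0;
    }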
4668 -+
4669 -+/**
4670 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
4671 -+ * @st: the service tree of the entity.
4672 -+ * @entity: the entity being inserted.
4673 -+ *
4674 -+ * The active tree is ordered by finish time, but an extra key is kept
4675 -+ * for each node, containing the minimum value for the start times of
4676 -+ * its children (and the node itself), so it's possible to search for
4677 -+ * the eligible node with the lowest finish time in logarithmic time.
4678 -+ */
4679 -+static void bfq_active_insert(struct bfq_service_tree *st,
4680 -+ struct bfq_entity *entity)
4681 -+{
4682 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4683 -+ struct rb_node *node = &entity->rb_node;
4684 -+
4685 -+ bfq_insert(&st->active, entity);
4686 -+
4687 -+ if (node->rb_left != NULL)
4688 -+ node = node->rb_left;
4689 -+ else if (node->rb_right != NULL)
4690 -+ node = node->rb_right;
4691 -+
4692 -+ bfq_update_active_tree(node);
4693 -+
4694 -+ if (bfqq != NULL)
4695 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4696 -+}
4697 -+
4698 -+/**
4699 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4700 -+ * @ioprio: the ioprio value to convert.
4701 -+ */
4702 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
4703 -+{
4704 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4705 -+ return IOPRIO_BE_NR - ioprio;
4706 -+}
4707 -+
4708 -+/**
4709 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4710 -+ * @weight: the weight value to convert.
4711 -+ *
4712 -+ * To preserve as much as possible the old only-ioprio user interface,
4713 -+ * 0 is used as an escape ioprio value for weights (numerically) equal to
4714 -+ * or larger than IOPRIO_BE_NR.
4715 -+ */
4716 -+static unsigned short bfq_weight_to_ioprio(int weight)
4717 -+{
4718 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4719 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4720 -+}
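
These two helpers map the best-effort ioprio levels 0..IOPRIO_BE_NR-1 onto weights IOPRIO_BE_NR..1 and back, with ioprio 0 reserved as the escape value for any weight of IOPRIO_BE_NR or more. A quick round-trip check of that mapping (standalone copies for illustration; IOPRIO_BE_NR is 8 for the best-effort class):

    #include <stdio.h>

    #define IOPRIO_BE_NR 8

    static int ioprio_to_weight(int ioprio)
    {
            return IOPRIO_BE_NR - ioprio;
    }

    static int weight_to_ioprio(int weight)
    {
            return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
    }

    int main(void)
    {
            int ioprio;

            for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                    printf("ioprio %d -> weight %d -> ioprio %d\n",
                           ioprio, ioprio_to_weight(ioprio),
                           weight_to_ioprio(ioprio_to_weight(ioprio)));
            /* Raised weights (e.g. 20) fall back to the escape ioprio 0. */
            printf("weight 20 -> ioprio %d\n", weight_to_ioprio(20));
            return 0;
    }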
4721 -+
4722 -+static inline void bfq_get_entity(struct bfq_entity *entity)
4723 -+{
4724 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4725 -+ struct bfq_sched_data *sd;
4726 -+
4727 -+ if (bfqq != NULL) {
4728 -+ sd = entity->sched_data;
4729 -+ atomic_inc(&bfqq->ref);
4730 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4731 -+ bfqq, atomic_read(&bfqq->ref));
4732 -+ }
4733 -+}
4734 -+
4735 -+/**
4736 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
4737 -+ * @node: the node being removed.
4738 -+ *
4739 -+ * Do the first step of an extraction in an rb tree, looking for the
4740 -+ * node that will replace @node, and returning the deepest node that
4741 -+ * the following modifications to the tree can touch. If @node is the
4742 -+ * last node in the tree return %NULL.
4743 -+ */
4744 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
4745 -+{
4746 -+ struct rb_node *deepest;
4747 -+
4748 -+ if (node->rb_right == NULL && node->rb_left == NULL)
4749 -+ deepest = rb_parent(node);
4750 -+ else if (node->rb_right == NULL)
4751 -+ deepest = node->rb_left;
4752 -+ else if (node->rb_left == NULL)
4753 -+ deepest = node->rb_right;
4754 -+ else {
4755 -+ deepest = rb_next(node);
4756 -+ if (deepest->rb_right != NULL)
4757 -+ deepest = deepest->rb_right;
4758 -+ else if (rb_parent(deepest) != node)
4759 -+ deepest = rb_parent(deepest);
4760 -+ }
4761 -+
4762 -+ return deepest;
4763 -+}
4764 -+
4765 -+/**
4766 -+ * bfq_active_extract - remove an entity from the active tree.
4767 -+ * @st: the service_tree containing the tree.
4768 -+ * @entity: the entity being removed.
4769 -+ */
4770 -+static void bfq_active_extract(struct bfq_service_tree *st,
4771 -+ struct bfq_entity *entity)
4772 -+{
4773 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4774 -+ struct rb_node *node;
4775 -+
4776 -+ node = bfq_find_deepest(&entity->rb_node);
4777 -+ bfq_extract(&st->active, entity);
4778 -+
4779 -+ if (node != NULL)
4780 -+ bfq_update_active_tree(node);
4781 -+
4782 -+ if (bfqq != NULL)
4783 -+ list_del(&bfqq->bfqq_list);
4784 -+}
4785 -+
4786 -+/**
4787 -+ * bfq_idle_insert - insert an entity into the idle tree.
4788 -+ * @st: the service tree containing the tree.
4789 -+ * @entity: the entity to insert.
4790 -+ */
4791 -+static void bfq_idle_insert(struct bfq_service_tree *st,
4792 -+ struct bfq_entity *entity)
4793 -+{
4794 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4795 -+ struct bfq_entity *first_idle = st->first_idle;
4796 -+ struct bfq_entity *last_idle = st->last_idle;
4797 -+
4798 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
4799 -+ st->first_idle = entity;
4800 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
4801 -+ st->last_idle = entity;
4802 -+
4803 -+ bfq_insert(&st->idle, entity);
4804 -+
4805 -+ if (bfqq != NULL)
4806 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
4807 -+}
4808 -+
4809 -+/**
4810 -+ * bfq_forget_entity - remove an entity from the wfq trees.
4811 -+ * @st: the service tree.
4812 -+ * @entity: the entity being removed.
4813 -+ *
4814 -+ * Update the device status and forget everything about @entity, putting
4815 -+ * the device reference to it, if it is a queue. Entities belonging to
4816 -+ * groups are not refcounted.
4817 -+ */
4818 -+static void bfq_forget_entity(struct bfq_service_tree *st,
4819 -+ struct bfq_entity *entity)
4820 -+{
4821 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4822 -+ struct bfq_sched_data *sd;
4823 -+
4824 -+ BUG_ON(!entity->on_st);
4825 -+
4826 -+ entity->on_st = 0;
4827 -+ st->wsum -= entity->weight;
4828 -+ if (bfqq != NULL) {
4829 -+ sd = entity->sched_data;
4830 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
4831 -+ bfqq, atomic_read(&bfqq->ref));
4832 -+ bfq_put_queue(bfqq);
4833 -+ }
4834 -+}
4835 -+
4836 -+/**
4837 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
4838 -+ * @st: service tree for the entity.
4839 -+ * @entity: the entity being released.
4840 -+ */
4841 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
4842 -+ struct bfq_entity *entity)
4843 -+{
4844 -+ bfq_idle_extract(st, entity);
4845 -+ bfq_forget_entity(st, entity);
4846 -+}
4847 -+
4848 -+/**
4849 -+ * bfq_forget_idle - update the idle tree if necessary.
4850 -+ * @st: the service tree to act upon.
4851 -+ *
4852 -+ * To preserve the global O(log N) complexity we only remove one entry here;
4853 -+ * as the idle tree will not grow indefinitely this can be done safely.
4854 -+ */
4855 -+static void bfq_forget_idle(struct bfq_service_tree *st)
4856 -+{
4857 -+ struct bfq_entity *first_idle = st->first_idle;
4858 -+ struct bfq_entity *last_idle = st->last_idle;
4859 -+
4860 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
4861 -+ !bfq_gt(last_idle->finish, st->vtime)) {
4862 -+ /*
4863 -+ * Forget the whole idle tree, increasing the vtime past
4864 -+ * the last finish time of idle entities.
4865 -+ */
4866 -+ st->vtime = last_idle->finish;
4867 -+ }
4868 -+
4869 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
4870 -+ bfq_put_idle_entity(st, first_idle);
4871 -+}
4872 -+
4873 -+static struct bfq_service_tree *
4874 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
4875 -+ struct bfq_entity *entity)
4876 -+{
4877 -+ struct bfq_service_tree *new_st = old_st;
4878 -+
4879 -+ if (entity->ioprio_changed) {
4880 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4881 -+
4882 -+ BUG_ON(old_st->wsum < entity->weight);
4883 -+ old_st->wsum -= entity->weight;
4884 -+
4885 -+ if (entity->new_weight != entity->orig_weight) {
4886 -+ entity->orig_weight = entity->new_weight;
4887 -+ entity->ioprio =
4888 -+ bfq_weight_to_ioprio(entity->orig_weight);
4889 -+ } else if (entity->new_ioprio != entity->ioprio) {
4890 -+ entity->ioprio = entity->new_ioprio;
4891 -+ entity->orig_weight =
4892 -+ bfq_ioprio_to_weight(entity->ioprio);
4893 -+ } else
4894 -+ entity->new_weight = entity->orig_weight =
4895 -+ bfq_ioprio_to_weight(entity->ioprio);
4896 -+
4897 -+ entity->ioprio_class = entity->new_ioprio_class;
4898 -+ entity->ioprio_changed = 0;
4899 -+
4900 -+ /*
4901 -+ * NOTE: here we may be changing the weight too early,
4902 -+ * this will cause unfairness. The correct approach
4903 -+ * would have required additional complexity to defer
4904 -+ * weight changes to the proper time instants (i.e.,
4905 -+ * when entity->finish <= old_st->vtime).
4906 -+ */
4907 -+ new_st = bfq_entity_service_tree(entity);
4908 -+ entity->weight = entity->orig_weight *
4909 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
4910 -+ new_st->wsum += entity->weight;
4911 -+
4912 -+ if (new_st != old_st)
4913 -+ entity->start = new_st->vtime;
4914 -+ }
4915 -+
4916 -+ return new_st;
4917 -+}
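The conversion helpers bfq_weight_to_ioprio() and bfq_ioprio_to_weight() used above are defined earlier in the patch and do not appear in this hunk. A minimal sketch of the kind of linear mapping they implement, with an assumed name prefix and an assumed number of priority levels, purely for illustration:

/* Illustrative only: a linear ioprio <-> weight mapping of the BFQ kind. */
#define SKETCH_IOPRIO_LEVELS 8	/* assumed: number of best-effort ioprio levels */

static inline unsigned short sketch_ioprio_to_weight(unsigned short ioprio)
{
	/* a lower ioprio value means a larger weight, hence more service */
	return SKETCH_IOPRIO_LEVELS - ioprio;
}

static inline unsigned short sketch_weight_to_ioprio(unsigned short weight)
{
	/* inverse mapping, clamped so the result remains a valid ioprio */
	return weight >= SKETCH_IOPRIO_LEVELS ? 0 : SKETCH_IOPRIO_LEVELS - weight;
}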
4918 -+
4919 -+/**
4920 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
4921 -+ * @bfqq: the queue being served.
4922 -+ * @served: bytes to transfer.
4923 -+ *
4924 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
4925 -+ * are synchronized every time a new bfqq is selected for service. For now,
4926 -+ * we keep it this way to better check consistency.
4927 -+ */
4928 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
4929 -+{
4930 -+ struct bfq_entity *entity = &bfqq->entity;
4931 -+ struct bfq_service_tree *st;
4932 -+
4933 -+ for_each_entity(entity) {
4934 -+ st = bfq_entity_service_tree(entity);
4935 -+
4936 -+ entity->service += served;
4937 -+ BUG_ON(entity->service > entity->budget);
4938 -+ BUG_ON(st->wsum == 0);
4939 -+
4940 -+ st->vtime += bfq_delta(served, st->wsum);
4941 -+ bfq_forget_idle(st);
4942 -+ }
4943 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
4944 -+}
4945 -+
4946 -+/**
4947 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
4948 -+ * @bfqq: the queue that needs a service update.
4949 -+ *
4950 -+ * When it's not possible to be fair in the service domain, because
4951 -+ * a queue is not consuming its budget fast enough (the meaning of
4952 -+ * fast depends on the timeout parameter), we charge it a full
4953 -+ * budget. In this way we should obtain a sort of time-domain
4954 -+ * fairness among all the seeky/slow queues.
4955 -+ */
4956 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
4957 -+{
4958 -+ struct bfq_entity *entity = &bfqq->entity;
4959 -+
4960 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
4961 -+
4962 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
4963 -+}
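A short worked example of the charge above: with entity->budget = 8192, entity->service = 2048 and st->wsum = 64, the remaining 6144 service units are fed to bfq_bfqq_served(), which advances st->vtime in proportion to served/wsum (bfq_delta() is defined earlier in the patch and not shown in this hunk; plain division is used below only as an approximation of its scaling):

/* Hedged sketch of the vtime bookkeeping done when a full budget is charged. */
static unsigned long sketch_charge_full_budget(unsigned long budget,
					       unsigned long service,
					       unsigned long wsum,
					       unsigned long *vtime)
{
	unsigned long remaining = budget - service;	/* service still charged */

	*vtime += remaining / wsum;	/* with the numbers above: 6144 / 64 = 96 */
	return remaining;
}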
4964 -+
4965 -+/**
4966 -+ * __bfq_activate_entity - activate an entity.
4967 -+ * @entity: the entity being activated.
4968 -+ *
4969 -+ * Called whenever an entity is activated, i.e., it is not active and one
4970 -+ * of its children receives a new request, or has to be reactivated due to
4971 -+ * budget exhaustion. It uses the current budget of the entity (and the
4972 -+ * service received if @entity is active) of the queue to calculate its
4973 -+ * timestamps.
4974 -+ */
4975 -+static void __bfq_activate_entity(struct bfq_entity *entity)
4976 -+{
4977 -+ struct bfq_sched_data *sd = entity->sched_data;
4978 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
4979 -+
4980 -+ if (entity == sd->active_entity) {
4981 -+ BUG_ON(entity->tree != NULL);
4982 -+ /*
4983 -+ * If we are requeueing the current entity we have
4984 -+ * to take care of not charging to it service it has
4985 -+ * not received.
4986 -+ */
4987 -+ bfq_calc_finish(entity, entity->service);
4988 -+ entity->start = entity->finish;
4989 -+ sd->active_entity = NULL;
4990 -+ } else if (entity->tree == &st->active) {
4991 -+ /*
4992 -+ * Requeueing an entity due to a change of some
4993 -+ * next_active entity below it. We reuse the old
4994 -+ * start time.
4995 -+ */
4996 -+ bfq_active_extract(st, entity);
4997 -+ } else if (entity->tree == &st->idle) {
4998 -+ /*
4999 -+ * Must be on the idle tree, bfq_idle_extract() will
5000 -+ * check for that.
5001 -+ */
5002 -+ bfq_idle_extract(st, entity);
5003 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
5004 -+ st->vtime : entity->finish;
5005 -+ } else {
5006 -+ /*
5007 -+ * The finish time of the entity may be invalid, and
5008 -+ * it is in the past for sure, otherwise the queue
5009 -+ * would have been on the idle tree.
5010 -+ */
5011 -+ entity->start = st->vtime;
5012 -+ st->wsum += entity->weight;
5013 -+ bfq_get_entity(entity);
5014 -+
5015 -+ BUG_ON(entity->on_st);
5016 -+ entity->on_st = 1;
5017 -+ }
5018 -+
5019 -+ st = __bfq_entity_update_weight_prio(st, entity);
5020 -+ bfq_calc_finish(entity, entity->budget);
5021 -+ bfq_active_insert(st, entity);
5022 -+}
5023 -+
5024 -+/**
5025 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5026 -+ * @entity: the entity to activate.
5027 -+ *
5028 -+ * Activate @entity and all the entities on the path from it to the root.
5029 -+ */
5030 -+static void bfq_activate_entity(struct bfq_entity *entity)
5031 -+{
5032 -+ struct bfq_sched_data *sd;
5033 -+
5034 -+ for_each_entity(entity) {
5035 -+ __bfq_activate_entity(entity);
5036 -+
5037 -+ sd = entity->sched_data;
5038 -+ if (!bfq_update_next_active(sd))
5039 -+ /*
5040 -+ * No need to propagate the activation to the
5041 -+ * upper entities, as they will be updated when
5042 -+ * the active entity is rescheduled.
5043 -+ */
5044 -+ break;
5045 -+ }
5046 -+}
5047 -+
5048 -+/**
5049 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5050 -+ * @entity: the entity to deactivate.
5051 -+ * @requeue: if false, the entity will not be put into the idle tree.
5052 -+ *
5053 -+ * Deactivate an entity, independently from its previous state. If the
5054 -+ * entity was not on a service tree just return, otherwise if it is on
5055 -+ * any scheduler tree, extract it from that tree, and if necessary
5056 -+ * and if the caller did not specify @requeue, put it on the idle tree.
5057 -+ *
5058 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
5059 -+ * if the entity was under service or if it was the next_active for
5060 -+ * its sched_data; return %0 otherwise.
5061 -+ */
5062 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5063 -+{
5064 -+ struct bfq_sched_data *sd = entity->sched_data;
5065 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5066 -+ int was_active = entity == sd->active_entity;
5067 -+ int ret = 0;
5068 -+
5069 -+ if (!entity->on_st)
5070 -+ return 0;
5071 -+
5072 -+ BUG_ON(was_active && entity->tree != NULL);
5073 -+
5074 -+ if (was_active) {
5075 -+ bfq_calc_finish(entity, entity->service);
5076 -+ sd->active_entity = NULL;
5077 -+ } else if (entity->tree == &st->active)
5078 -+ bfq_active_extract(st, entity);
5079 -+ else if (entity->tree == &st->idle)
5080 -+ bfq_idle_extract(st, entity);
5081 -+ else if (entity->tree != NULL)
5082 -+ BUG();
5083 -+
5084 -+ if (was_active || sd->next_active == entity)
5085 -+ ret = bfq_update_next_active(sd);
5086 -+
5087 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5088 -+ bfq_forget_entity(st, entity);
5089 -+ else
5090 -+ bfq_idle_insert(st, entity);
5091 -+
5092 -+ BUG_ON(sd->active_entity == entity);
5093 -+ BUG_ON(sd->next_active == entity);
5094 -+
5095 -+ return ret;
5096 -+}
5097 -+
5098 -+/**
5099 -+ * bfq_deactivate_entity - deactivate an entity.
5100 -+ * @entity: the entity to deactivate.
5101 -+ * @requeue: true if the entity can be put on the idle tree
5102 -+ */
5103 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5104 -+{
5105 -+ struct bfq_sched_data *sd;
5106 -+ struct bfq_entity *parent;
5107 -+
5108 -+ for_each_entity_safe(entity, parent) {
5109 -+ sd = entity->sched_data;
5110 -+
5111 -+ if (!__bfq_deactivate_entity(entity, requeue))
5112 -+ /*
5113 -+ * The parent entity is still backlogged, and
5114 -+ * we don't need to update it as it is still
5115 -+ * under service.
5116 -+ */
5117 -+ break;
5118 -+
5119 -+ if (sd->next_active != NULL)
5120 -+ /*
5121 -+ * The parent entity is still backlogged and
5122 -+ * the budgets on the path towards the root
5123 -+ * need to be updated.
5124 -+ */
5125 -+ goto update;
5126 -+
5127 -+ /*
5128 -+ * If we reach here, the parent is no longer backlogged and
5129 -+ * we want to propagate the dequeue upwards.
5130 -+ */
5131 -+ requeue = 1;
5132 -+ }
5133 -+
5134 -+ return;
5135 -+
5136 -+update:
5137 -+ entity = parent;
5138 -+ for_each_entity(entity) {
5139 -+ __bfq_activate_entity(entity);
5140 -+
5141 -+ sd = entity->sched_data;
5142 -+ if (!bfq_update_next_active(sd))
5143 -+ break;
5144 -+ }
5145 -+}
5146 -+
5147 -+/**
5148 -+ * bfq_update_vtime - update vtime if necessary.
5149 -+ * @st: the service tree to act upon.
5150 -+ *
5151 -+ * If necessary update the service tree vtime to have at least one
5152 -+ * eligible entity, skipping to its start time. Assumes that the
5153 -+ * active tree of the device is not empty.
5154 -+ *
5155 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
5156 -+ * we may end up with reactivated tasks getting timestamps after a
5157 -+ * vtime skip done because we needed a ->first_active entity on some
5158 -+ * intermediate node.
5159 -+ */
5160 -+static void bfq_update_vtime(struct bfq_service_tree *st)
5161 -+{
5162 -+ struct bfq_entity *entry;
5163 -+ struct rb_node *node = st->active.rb_node;
5164 -+
5165 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5166 -+ if (bfq_gt(entry->min_start, st->vtime)) {
5167 -+ st->vtime = entry->min_start;
5168 -+ bfq_forget_idle(st);
5169 -+ }
5170 -+}
5171 -+
5172 -+/**
5173 -+ * bfq_first_active - find the eligible entity with the smallest finish time
5174 -+ * @st: the service tree to select from.
5175 -+ *
5176 -+ * This function searches the first schedulable entity, starting from the
5177 -+ * root of the tree and going on the left every time on this side there is
5178 -+ * a subtree with at least one eligible (start <= vtime) entity. The path
5179 -+ * on the right is followed only if a) the left subtree contains no eligible
5180 -+ * entities and b) no eligible entity has been found yet.
5181 -+ */
5182 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5183 -+{
5184 -+ struct bfq_entity *entry, *first = NULL;
5185 -+ struct rb_node *node = st->active.rb_node;
5186 -+
5187 -+ while (node != NULL) {
5188 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5189 -+left:
5190 -+ if (!bfq_gt(entry->start, st->vtime))
5191 -+ first = entry;
5192 -+
5193 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5194 -+
5195 -+ if (node->rb_left != NULL) {
5196 -+ entry = rb_entry(node->rb_left,
5197 -+ struct bfq_entity, rb_node);
5198 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
5199 -+ node = node->rb_left;
5200 -+ goto left;
5201 -+ }
5202 -+ }
5203 -+ if (first != NULL)
5204 -+ break;
5205 -+ node = node->rb_right;
5206 -+ }
5207 -+
5208 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5209 -+ return first;
5210 -+}
5211 -+
5212 -+/**
5213 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5214 -+ * @st: the service tree.
5215 -+ *
5216 -+ * Update the virtual time in @st and return the first eligible entity
5217 -+ * it contains.
5218 -+ */
5219 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5220 -+ bool force)
5221 -+{
5222 -+ struct bfq_entity *entity, *new_next_active = NULL;
5223 -+
5224 -+ if (RB_EMPTY_ROOT(&st->active))
5225 -+ return NULL;
5226 -+
5227 -+ bfq_update_vtime(st);
5228 -+ entity = bfq_first_active_entity(st);
5229 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
5230 -+
5231 -+ /*
5232 -+ * If the chosen entity does not match with the sched_data's
5233 -+ * next_active and we are forcedly serving the IDLE priority
5234 -+ * class tree, bubble up budget update.
5235 -+ */
5236 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
5237 -+ new_next_active = entity;
5238 -+ for_each_entity(new_next_active)
5239 -+ bfq_update_budget(new_next_active);
5240 -+ }
5241 -+
5242 -+ return entity;
5243 -+}
5244 -+
5245 -+/**
5246 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5247 -+ * @sd: the sched_data.
5248 -+ * @extract: if true the returned entity will be also extracted from @sd.
5249 -+ *
5250 -+ * NOTE: since we cache the next_active entity at each level of the
5251 -+ * hierarchy, the complexity of the lookup can be decreased with
5252 -+ * absolutely no effort just returning the cached next_active value;
5253 -+ * we prefer to do full lookups to test the consistency of the data
5254 -+ * structures.
5255 -+ */
5256 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5257 -+ int extract,
5258 -+ struct bfq_data *bfqd)
5259 -+{
5260 -+ struct bfq_service_tree *st = sd->service_tree;
5261 -+ struct bfq_entity *entity;
5262 -+ int i = 0;
5263 -+
5264 -+ BUG_ON(sd->active_entity != NULL);
5265 -+
5266 -+ if (bfqd != NULL &&
5267 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5268 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true);
5269 -+ if (entity != NULL) {
5270 -+ i = BFQ_IOPRIO_CLASSES - 1;
5271 -+ bfqd->bfq_class_idle_last_service = jiffies;
5272 -+ sd->next_active = entity;
5273 -+ }
5274 -+ }
5275 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5276 -+ entity = __bfq_lookup_next_entity(st + i, false);
5277 -+ if (entity != NULL) {
5278 -+ if (extract) {
5279 -+ bfq_check_next_active(sd, entity);
5280 -+ bfq_active_extract(st + i, entity);
5281 -+ sd->active_entity = entity;
5282 -+ sd->next_active = NULL;
5283 -+ }
5284 -+ break;
5285 -+ }
5286 -+ }
5287 -+
5288 -+ return entity;
5289 -+}
5290 -+
5291 -+/*
5292 -+ * Get next queue for service.
5293 -+ */
5294 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5295 -+{
5296 -+ struct bfq_entity *entity = NULL;
5297 -+ struct bfq_sched_data *sd;
5298 -+ struct bfq_queue *bfqq;
5299 -+
5300 -+ BUG_ON(bfqd->active_queue != NULL);
5301 -+
5302 -+ if (bfqd->busy_queues == 0)
5303 -+ return NULL;
5304 -+
5305 -+ sd = &bfqd->root_group->sched_data;
5306 -+ for (; sd != NULL; sd = entity->my_sched_data) {
5307 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5308 -+ BUG_ON(entity == NULL);
5309 -+ entity->service = 0;
5310 -+ }
5311 -+
5312 -+ bfqq = bfq_entity_to_bfqq(entity);
5313 -+ BUG_ON(bfqq == NULL);
5314 -+
5315 -+ return bfqq;
5316 -+}
5317 -+
5318 -+/*
5319 -+ * Forced extraction of the given queue.
5320 -+ */
5321 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5322 -+ struct bfq_queue *bfqq)
5323 -+{
5324 -+ struct bfq_entity *entity;
5325 -+ struct bfq_sched_data *sd;
5326 -+
5327 -+ BUG_ON(bfqd->active_queue != NULL);
5328 -+
5329 -+ entity = &bfqq->entity;
5330 -+ /*
5331 -+ * Bubble up extraction/update from the leaf to the root.
5332 -+ */
5333 -+ for_each_entity(entity) {
5334 -+ sd = entity->sched_data;
5335 -+ bfq_update_budget(entity);
5336 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
5337 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5338 -+ sd->active_entity = entity;
5339 -+ sd->next_active = NULL;
5340 -+ entity->service = 0;
5341 -+ }
5342 -+
5343 -+ return;
5344 -+}
5345 -+
5346 -+static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
5347 -+{
5348 -+ if (bfqd->active_bic != NULL) {
5349 -+ put_io_context(bfqd->active_bic->icq.ioc);
5350 -+ bfqd->active_bic = NULL;
5351 -+ }
5352 -+
5353 -+ bfqd->active_queue = NULL;
5354 -+ del_timer(&bfqd->idle_slice_timer);
5355 -+}
5356 -+
5357 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5358 -+ int requeue)
5359 -+{
5360 -+ struct bfq_entity *entity = &bfqq->entity;
5361 -+
5362 -+ if (bfqq == bfqd->active_queue)
5363 -+ __bfq_bfqd_reset_active(bfqd);
5364 -+
5365 -+ bfq_deactivate_entity(entity, requeue);
5366 -+}
5367 -+
5368 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5369 -+{
5370 -+ struct bfq_entity *entity = &bfqq->entity;
5371 -+
5372 -+ bfq_activate_entity(entity);
5373 -+}
5374 -+
5375 -+/*
5376 -+ * Called when the bfqq no longer has requests pending, remove it from
5377 -+ * the service tree.
5378 -+ */
5379 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5380 -+ int requeue)
5381 -+{
5382 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
5383 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5384 -+
5385 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5386 -+
5387 -+ bfq_clear_bfqq_busy(bfqq);
5388 -+
5389 -+ BUG_ON(bfqd->busy_queues == 0);
5390 -+ bfqd->busy_queues--;
5391 -+
5392 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5393 -+}
5394 -+
5395 -+/*
5396 -+ * Called when an inactive queue receives a new request.
5397 -+ */
5398 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5399 -+{
5400 -+ BUG_ON(bfq_bfqq_busy(bfqq));
5401 -+ BUG_ON(bfqq == bfqd->active_queue);
5402 -+
5403 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5404 -+
5405 -+ bfq_activate_bfqq(bfqd, bfqq);
5406 -+
5407 -+ bfq_mark_bfqq_busy(bfqq);
5408 -+ bfqd->busy_queues++;
5409 -+}
5410 -diff --git a/block/bfq.h b/block/bfq.h
5411 -new file mode 100644
5412 -index 0000000..48ecde9
5413 ---- /dev/null
5414 -+++ b/block/bfq.h
5415 -@@ -0,0 +1,603 @@
5416 -+/*
5417 -+ * BFQ-v6r2 for 3.10.0: data structures and common functions prototypes.
5418 -+ *
5419 -+ * Based on ideas and code from CFQ:
5420 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5421 -+ *
5422 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5423 -+ * Paolo Valente <paolo.valente@×××××××.it>
5424 -+ *
5425 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5426 -+ */
5427 -+
5428 -+#ifndef _BFQ_H
5429 -+#define _BFQ_H
5430 -+
5431 -+#include <linux/blktrace_api.h>
5432 -+#include <linux/hrtimer.h>
5433 -+#include <linux/ioprio.h>
5434 -+#include <linux/rbtree.h>
5435 -+
5436 -+#define BFQ_IOPRIO_CLASSES 3
5437 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5438 -+
5439 -+#define BFQ_MIN_WEIGHT 1
5440 -+#define BFQ_MAX_WEIGHT 1000
5441 -+
5442 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
5443 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
5444 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5445 -+
5446 -+struct bfq_entity;
5447 -+
5448 -+/**
5449 -+ * struct bfq_service_tree - per ioprio_class service tree.
5450 -+ * @active: tree for active entities (i.e., those backlogged).
5451 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5452 -+ * @first_idle: idle entity with minimum F_i.
5453 -+ * @last_idle: idle entity with maximum F_i.
5454 -+ * @vtime: scheduler virtual time.
5455 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5456 -+ *
5457 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5458 -+ * ioprio_class has its own independent scheduler, and so its own
5459 -+ * bfq_service_tree. All the fields are protected by the queue lock
5460 -+ * of the containing bfqd.
5461 -+ */
5462 -+struct bfq_service_tree {
5463 -+ struct rb_root active;
5464 -+ struct rb_root idle;
5465 -+
5466 -+ struct bfq_entity *first_idle;
5467 -+ struct bfq_entity *last_idle;
5468 -+
5469 -+ u64 vtime;
5470 -+ unsigned long wsum;
5471 -+};
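To make the timestamps concrete: as documented below for struct bfq_entity, F_i = S_i + budget/weight. Two backlogged entities that both start at S = 0 with a budget of 4096 service units, one with weight 100 and one with weight 10, obtain finish times of roughly 41 and 410 respectively, so B-WF2Q+ selects the heavier entity about ten times as often before the lighter one becomes the minimum-finish choice again.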
5472 -+
5473 -+/**
5474 -+ * struct bfq_sched_data - multi-class scheduler.
5475 -+ * @active_entity: entity under service.
5476 -+ * @next_active: head-of-the-line entity in the scheduler.
5477 -+ * @service_tree: array of service trees, one per ioprio_class.
5478 -+ *
5479 -+ * bfq_sched_data is the basic scheduler queue. It supports three
5480 -+ * ioprio_classes, and can be used either as a toplevel queue or as
5481 -+ * an intermediate queue on a hierarchical setup.
5482 -+ * @next_active points to the active entity of the sched_data service
5483 -+ * trees that will be scheduled next.
5484 -+ *
5485 -+ * The supported ioprio_classes are the same as in CFQ, in descending
5486 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5487 -+ * Requests from higher priority queues are served before all the
5488 -+ * requests from lower priority queues; among requests of the same
5489 -+ * queue requests are served according to B-WF2Q+.
5490 -+ * All the fields are protected by the queue lock of the containing bfqd.
5491 -+ */
5492 -+struct bfq_sched_data {
5493 -+ struct bfq_entity *active_entity;
5494 -+ struct bfq_entity *next_active;
5495 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5496 -+};
5497 -+
5498 -+/**
5499 -+ * struct bfq_entity - schedulable entity.
5500 -+ * @rb_node: service_tree member.
5501 -+ * @on_st: flag, true if the entity is on a tree (either the active or
5502 -+ * the idle one of its service_tree).
5503 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5504 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
5505 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5506 -+ * @min_start: minimum start time of the (active) subtree rooted at
5507 -+ * this entity; used for O(log N) lookups into active trees.
5508 -+ * @service: service received during the last round of service.
5509 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5510 -+ * @weight: weight of the queue
5511 -+ * @parent: parent entity, for hierarchical scheduling.
5512 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5513 -+ * associated scheduler queue, %NULL on leaf nodes.
5514 -+ * @sched_data: the scheduler queue this entity belongs to.
5515 -+ * @ioprio: the ioprio in use.
5516 -+ * @new_weight: when a weight change is requested, the new weight value.
5517 -+ * @orig_weight: original weight, used to implement weight boosting
5518 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5519 -+ * @ioprio_class: the ioprio_class in use.
5520 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
5521 -+ * ioprio_class value.
5522 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5523 -+ * ioprio_class change.
5524 -+ *
5525 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5526 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5527 -+ * entity belongs to the sched_data of the parent group in the cgroup
5528 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
5529 -+ * in @my_sched_data.
5530 -+ *
5531 -+ * Each entity stores independently its priority values; this would
5532 -+ * allow different weights on different devices, but this
5533 -+ * functionality is not exported to userspace by now. Priorities and
5534 -+ * weights are updated lazily, first storing the new values into the
5535 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
5536 -+ * there is a transition in the entity state that allows the priority
5537 -+ * update to take place the effective and the requested priority
5538 -+ * values are synchronized.
5539 -+ *
5540 -+ * Unless cgroups are used, the weight value is calculated from the
5541 -+ * ioprio to export the same interface as CFQ. When dealing with
5542 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
5543 -+ * time consuming their budget and have true sequential behavior, and
5544 -+ * when there are no external factors breaking anticipation) the
5545 -+ * relative weights at each level of the cgroups hierarchy should be
5546 -+ * guaranteed. All the fields are protected by the queue lock of the
5547 -+ * containing bfqd.
5548 -+ */
5549 -+struct bfq_entity {
5550 -+ struct rb_node rb_node;
5551 -+
5552 -+ int on_st;
5553 -+
5554 -+ u64 finish;
5555 -+ u64 start;
5556 -+
5557 -+ struct rb_root *tree;
5558 -+
5559 -+ u64 min_start;
5560 -+
5561 -+ unsigned long service, budget;
5562 -+ unsigned short weight, new_weight;
5563 -+ unsigned short orig_weight;
5564 -+
5565 -+ struct bfq_entity *parent;
5566 -+
5567 -+ struct bfq_sched_data *my_sched_data;
5568 -+ struct bfq_sched_data *sched_data;
5569 -+
5570 -+ unsigned short ioprio, new_ioprio;
5571 -+ unsigned short ioprio_class, new_ioprio_class;
5572 -+
5573 -+ int ioprio_changed;
5574 -+};
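The lazy-update protocol described above boils down to writing the new_* fields and raising the flag; __bfq_entity_update_weight_prio() then applies the change on the next (re)activation. A minimal sketch, with an illustrative helper name that is not part of the patch:

/* Illustrative sketch: stage an ioprio change, to be applied lazily. */
static void sketch_request_ioprio_change(struct bfq_entity *entity,
					 unsigned short ioprio,
					 unsigned short ioprio_class)
{
	entity->new_ioprio = ioprio;
	entity->new_ioprio_class = ioprio_class;
	entity->ioprio_changed = 1;	/* consumed at the next activation */
}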
5575 -+
5576 -+struct bfq_group;
5577 -+
5578 -+/**
5579 -+ * struct bfq_queue - leaf schedulable entity.
5580 -+ * @ref: reference counter.
5581 -+ * @bfqd: parent bfq_data.
5582 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
5583 -+ * one or more other queues.
5584 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5585 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5586 -+ * @sort_list: sorted list of pending requests.
5587 -+ * @next_rq: if fifo isn't expired, next request to serve.
5588 -+ * @queued: nr of requests queued in @sort_list.
5589 -+ * @allocated: currently allocated requests.
5590 -+ * @meta_pending: pending metadata requests.
5591 -+ * @fifo: fifo list of requests in sort_list.
5592 -+ * @entity: entity representing this queue in the scheduler.
5593 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
5594 -+ * @budget_timeout: budget expiration (in jiffies).
5595 -+ * @dispatched: number of requests on the dispatch list or inside driver.
5596 -+ * @org_ioprio: saved ioprio during boosted periods.
5597 -+ * @flags: status flags.
5598 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5599 -+ * @seek_samples: number of seeks sampled
5600 -+ * @seek_total: sum of the distances of the seeks sampled
5601 -+ * @seek_mean: mean seek distance
5602 -+ * @last_request_pos: position of the last request enqueued
5603 -+ * @pid: pid of the process owning the queue, used for logging purposes.
5604 -+ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
5605 -+ * @raising_cur_max_time: current max raising time for this queue
5606 -+ *
5607 -+ * A bfq_queue is a leaf request queue; it can be associated with one or
5608 -+ * more io_contexts (if it is an async one). @cgroup holds a reference to the
5609 -+ * cgroup, to be sure that it does not disappear while a bfqq still
5610 -+ * references it (mostly to avoid races between request issuing and task
5611 -+ * migration followed by cgroup destruction).
5612 -+ * All the fields are protected by the queue lock of the containing bfqd.
5613 -+ */
5614 -+struct bfq_queue {
5615 -+ atomic_t ref;
5616 -+ struct bfq_data *bfqd;
5617 -+
5618 -+ /* fields for cooperating queues handling */
5619 -+ struct bfq_queue *new_bfqq;
5620 -+ struct rb_node pos_node;
5621 -+ struct rb_root *pos_root;
5622 -+
5623 -+ struct rb_root sort_list;
5624 -+ struct request *next_rq;
5625 -+ int queued[2];
5626 -+ int allocated[2];
5627 -+ int meta_pending;
5628 -+ struct list_head fifo;
5629 -+
5630 -+ struct bfq_entity entity;
5631 -+
5632 -+ unsigned long max_budget;
5633 -+ unsigned long budget_timeout;
5634 -+
5635 -+ int dispatched;
5636 -+
5637 -+ unsigned short org_ioprio;
5638 -+
5639 -+ unsigned int flags;
5640 -+
5641 -+ struct list_head bfqq_list;
5642 -+
5643 -+ unsigned int seek_samples;
5644 -+ u64 seek_total;
5645 -+ sector_t seek_mean;
5646 -+ sector_t last_request_pos;
5647 -+
5648 -+ pid_t pid;
5649 -+
5650 -+ /* weight-raising fields */
5651 -+ unsigned int raising_cur_max_time;
5652 -+ u64 last_rais_start_finish, soft_rt_next_start;
5653 -+ unsigned int raising_coeff;
5654 -+};
5655 -+
5656 -+/**
5657 -+ * struct bfq_ttime - per process thinktime stats.
5658 -+ * @ttime_total: total process thinktime
5659 -+ * @ttime_samples: number of thinktime samples
5660 -+ * @ttime_mean: average process thinktime
5661 -+ */
5662 -+struct bfq_ttime {
5663 -+ unsigned long last_end_request;
5664 -+
5665 -+ unsigned long ttime_total;
5666 -+ unsigned long ttime_samples;
5667 -+ unsigned long ttime_mean;
5668 -+};
5669 -+
5670 -+/**
5671 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
5672 -+ * @icq: associated io_cq structure
5673 -+ * @bfqq: array of two process queues, the sync and the async
5674 -+ * @ttime: associated @bfq_ttime struct
5675 -+ */
5676 -+struct bfq_io_cq {
5677 -+ struct io_cq icq; /* must be the first member */
5678 -+ struct bfq_queue *bfqq[2];
5679 -+ struct bfq_ttime ttime;
5680 -+ int ioprio;
5681 -+};
5682 -+
5683 -+/**
5684 -+ * struct bfq_data - per device data structure.
5685 -+ * @queue: request queue for the managed device.
5686 -+ * @root_group: root bfq_group for the device.
5687 -+ * @rq_pos_tree: rbtree sorted by next_request position,
5688 -+ * used when determining if two or more queues
5689 -+ * have interleaving requests (see bfq_close_cooperator).
5690 -+ * @busy_queues: number of bfq_queues containing requests (including the
5691 -+ * queue under service, even if it is idling).
5692 -+ * @queued: number of queued requests.
5693 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
5694 -+ * @sync_flight: number of sync requests in the driver.
5695 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5696 -+ * completed requests.
5697 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5698 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5699 -+ * @budgets_assigned: number of budgets assigned.
5700 -+ * @idle_slice_timer: timer set when idling for the next sequential request
5701 -+ * from the queue under service.
5702 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
5703 -+ * @active_queue: bfq_queue under service.
5704 -+ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
5705 -+ * @last_position: on-disk position of the last served request.
5706 -+ * @last_budget_start: beginning of the last budget.
5707 -+ * @last_idling_start: beginning of the last idle slice.
5708 -+ * @peak_rate: peak transfer rate observed for a budget.
5709 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5710 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5711 -+ * @group_list: list of all the bfq_groups active on the device.
5712 -+ * @active_list: list of all the bfq_queues active on the device.
5713 -+ * @idle_list: list of all the bfq_queues idle on the device.
5714 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
5715 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5716 -+ * requests are served in fifo order.
5717 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5718 -+ * @bfq_back_max: maximum allowed backward seek.
5719 -+ * @bfq_slice_idle: maximum idling time.
5720 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5721 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5722 -+ * async queues.
5723 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5724 -+ * prevent seeky queues from imposing long latencies on
5725 -+ * well-behaved ones (this also implies that seeky queues cannot
5726 -+ * receive guarantees in the service domain; after a timeout
5727 -+ * they are charged for the whole allocated budget, to try
5728 -+ * to preserve a behavior reasonably fair among them, but
5729 -+ * without service-domain guarantees).
5730 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5731 -+ * queue is multiplied
5732 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5733 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5734 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5735 -+ * may be reactivated for a queue (in jiffies)
5736 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5737 -+ * after which weight-raising may be
5738 -+ * reactivated for an already busy queue
5739 -+ * (in jiffies)
5740 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5741 -+ * in sectors per second
5742 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
5743 -+ * duration of the weight raising automatically
5744 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5745 -+ *
5746 -+ * All the fields are protected by the @queue lock.
5747 -+ */
5748 -+struct bfq_data {
5749 -+ struct request_queue *queue;
5750 -+
5751 -+ struct bfq_group *root_group;
5752 -+
5753 -+ struct rb_root rq_pos_tree;
5754 -+
5755 -+ int busy_queues;
5756 -+ int queued;
5757 -+ int rq_in_driver;
5758 -+ int sync_flight;
5759 -+
5760 -+ int max_rq_in_driver;
5761 -+ int hw_tag_samples;
5762 -+ int hw_tag;
5763 -+
5764 -+ int budgets_assigned;
5765 -+
5766 -+ struct timer_list idle_slice_timer;
5767 -+ struct work_struct unplug_work;
5768 -+
5769 -+ struct bfq_queue *active_queue;
5770 -+ struct bfq_io_cq *active_bic;
5771 -+
5772 -+ sector_t last_position;
5773 -+
5774 -+ ktime_t last_budget_start;
5775 -+ ktime_t last_idling_start;
5776 -+ int peak_rate_samples;
5777 -+ u64 peak_rate;
5778 -+ unsigned long bfq_max_budget;
5779 -+
5780 -+ struct hlist_head group_list;
5781 -+ struct list_head active_list;
5782 -+ struct list_head idle_list;
5783 -+
5784 -+ unsigned int bfq_quantum;
5785 -+ unsigned int bfq_fifo_expire[2];
5786 -+ unsigned int bfq_back_penalty;
5787 -+ unsigned int bfq_back_max;
5788 -+ unsigned int bfq_slice_idle;
5789 -+ u64 bfq_class_idle_last_service;
5790 -+
5791 -+ unsigned int bfq_user_max_budget;
5792 -+ unsigned int bfq_max_budget_async_rq;
5793 -+ unsigned int bfq_timeout[2];
5794 -+
5795 -+ bool low_latency;
5796 -+
5797 -+ /* parameters of the low_latency heuristics */
5798 -+ unsigned int bfq_raising_coeff;
5799 -+ unsigned int bfq_raising_max_time;
5800 -+ unsigned int bfq_raising_rt_max_time;
5801 -+ unsigned int bfq_raising_min_idle_time;
5802 -+ unsigned int bfq_raising_min_inter_arr_async;
5803 -+ unsigned int bfq_raising_max_softrt_rate;
5804 -+ u64 RT_prod;
5805 -+
5806 -+ struct bfq_queue oom_bfqq;
5807 -+};
5808 -+
5809 -+enum bfqq_state_flags {
5810 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
5811 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
5812 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
5813 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
5814 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
5815 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
5816 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
5817 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
5818 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
5819 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
5820 -+ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
5821 -+};
5822 -+
5823 -+#define BFQ_BFQQ_FNS(name) \
5824 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
5825 -+{ \
5826 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
5827 -+} \
5828 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
5829 -+{ \
5830 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
5831 -+} \
5832 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
5833 -+{ \
5834 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
5835 -+}
5836 -+
5837 -+BFQ_BFQQ_FNS(busy);
5838 -+BFQ_BFQQ_FNS(wait_request);
5839 -+BFQ_BFQQ_FNS(must_alloc);
5840 -+BFQ_BFQQ_FNS(fifo_expire);
5841 -+BFQ_BFQQ_FNS(idle_window);
5842 -+BFQ_BFQQ_FNS(prio_changed);
5843 -+BFQ_BFQQ_FNS(sync);
5844 -+BFQ_BFQQ_FNS(budget_new);
5845 -+BFQ_BFQQ_FNS(coop);
5846 -+BFQ_BFQQ_FNS(split_coop);
5847 -+BFQ_BFQQ_FNS(some_coop_idle);
5848 -+#undef BFQ_BFQQ_FNS
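For readers unfamiliar with the idiom, each BFQ_BFQQ_FNS(name) invocation above expands to three small helpers; BFQ_BFQQ_FNS(busy), for instance, yields (up to whitespace):

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}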
5849 -+
5850 -+/* Logging facilities. */
5851 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
5852 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
5853 -+
5854 -+#define bfq_log(bfqd, fmt, args...) \
5855 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
5856 -+
5857 -+/* Expiration reasons. */
5858 -+enum bfqq_expiration {
5859 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
5860 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
5861 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
5862 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
5863 -+};
5864 -+
5865 -+#ifdef CONFIG_CGROUP_BFQIO
5866 -+/**
5867 -+ * struct bfq_group - per (device, cgroup) data structure.
5868 -+ * @entity: schedulable entity to insert into the parent group sched_data.
5869 -+ * @sched_data: own sched_data, to contain child entities (they may be
5870 -+ * both bfq_queues and bfq_groups).
5871 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
5872 -+ * list of the containing cgroup's bfqio_cgroup.
5873 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
5874 -+ * of the groups active on the same device; used for cleanup.
5875 -+ * @bfqd: the bfq_data for the device this group acts upon.
5876 -+ * @async_bfqq: array of async queues for all the tasks belonging to
5877 -+ * the group, one queue per ioprio value per ioprio_class,
5878 -+ * except for the idle class that has only one queue.
5879 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
5880 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
5881 -+ * to avoid too many special cases during group creation/migration.
5882 -+ *
5883 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
5884 -+ * there is a set of bfq_groups, each one collecting the lower-level
5885 -+ * entities belonging to the group that are acting on the same device.
5886 -+ *
5887 -+ * Locking works as follows:
5888 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
5889 -+ * via RCU from its readers.
5890 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
5891 -+ * from the readers.
5892 -+ * o All the other fields are protected by the @bfqd queue lock.
5893 -+ */
5894 -+struct bfq_group {
5895 -+ struct bfq_entity entity;
5896 -+ struct bfq_sched_data sched_data;
5897 -+
5898 -+ struct hlist_node group_node;
5899 -+ struct hlist_node bfqd_node;
5900 -+
5901 -+ void *bfqd;
5902 -+
5903 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5904 -+ struct bfq_queue *async_idle_bfqq;
5905 -+
5906 -+ struct bfq_entity *my_entity;
5907 -+};
5908 -+
5909 -+/**
5910 -+ * struct bfqio_cgroup - bfq cgroup data structure.
5911 -+ * @css: subsystem state for bfq in the containing cgroup.
5912 -+ * @weight: cgroup weight.
5913 -+ * @ioprio: cgroup ioprio.
5914 -+ * @ioprio_class: cgroup ioprio_class.
5915 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
5916 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
5917 -+ *
5918 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
5919 -+ * @ioprio and @ioprio_class are protected by @lock.
5920 -+ */
5921 -+struct bfqio_cgroup {
5922 -+ struct cgroup_subsys_state css;
5923 -+
5924 -+ unsigned short weight, ioprio, ioprio_class;
5925 -+
5926 -+ spinlock_t lock;
5927 -+ struct hlist_head group_data;
5928 -+};
5929 -+#else
5930 -+struct bfq_group {
5931 -+ struct bfq_sched_data sched_data;
5932 -+
5933 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5934 -+ struct bfq_queue *async_idle_bfqq;
5935 -+};
5936 -+#endif
5937 -+
5938 -+static inline struct bfq_service_tree *
5939 -+bfq_entity_service_tree(struct bfq_entity *entity)
5940 -+{
5941 -+ struct bfq_sched_data *sched_data = entity->sched_data;
5942 -+ unsigned int idx = entity->ioprio_class - 1;
5943 -+
5944 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
5945 -+ BUG_ON(sched_data == NULL);
5946 -+
5947 -+ return sched_data->service_tree + idx;
5948 -+}
5949 -+
5950 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
5951 -+ int is_sync)
5952 -+{
5953 -+ return bic->bfqq[!!is_sync];
5954 -+}
5955 -+
5956 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
5957 -+ struct bfq_queue *bfqq, int is_sync)
5958 -+{
5959 -+ bic->bfqq[!!is_sync] = bfqq;
5960 -+}
5961 -+
5962 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
5963 -+{
5964 -+ return bic->icq.q->elevator->elevator_data;
5965 -+}
5966 -+
5967 -+/**
5968 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
5969 -+ * @ptr: a pointer to a bfqd.
5970 -+ * @flags: storage for the flags to be saved.
5971 -+ *
5972 -+ * This function allows bfqg->bfqd to be protected by the
5973 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
5974 -+ * under RCU, so the storage for bfqd is assured to be safe as long
5975 -+ * as the RCU read side critical section does not end. After the
5976 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
5977 -+ * sure that no other writer accessed it. If we raced with a writer,
5978 -+ * the function returns NULL, with the queue unlocked, otherwise it
5979 -+ * returns the dereferenced pointer, with the queue locked.
5980 -+ */
5981 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
5982 -+ unsigned long *flags)
5983 -+{
5984 -+ struct bfq_data *bfqd;
5985 -+
5986 -+ rcu_read_lock();
5987 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
5988 -+
5989 -+ if (bfqd != NULL) {
5990 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
5991 -+ if (*ptr == bfqd)
5992 -+ goto out;
5993 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
5994 -+ }
5995 -+
5996 -+ bfqd = NULL;
5997 -+out:
5998 -+ rcu_read_unlock();
5999 -+ return bfqd;
6000 -+}
6001 -+
6002 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6003 -+ unsigned long *flags)
6004 -+{
6005 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6006 -+}
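The intended calling pattern for the two helpers above is, in a hedged sketch (the caller name and the work done under the lock are illustrative):

/* Illustrative caller: look up the bfqd behind an RCU-protected pointer. */
static void sketch_with_bfqd_locked(void **bfqd_ptr)
{
	unsigned long flags;
	struct bfq_data *bfqd = bfq_get_bfqd_locked(bfqd_ptr, &flags);

	if (bfqd == NULL)
		return;		/* raced with a writer; queue left unlocked */

	/* ... operate on bfqd with bfqd->queue->queue_lock held ... */

	bfq_put_bfqd_unlock(bfqd, &flags);
}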
6007 -+
6008 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6009 -+static void bfq_put_queue(struct bfq_queue *bfqq);
6010 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6011 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6012 -+ struct bfq_group *bfqg, int is_sync,
6013 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
6014 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6015 -+ struct bfq_group *bfqg);
6016 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6017 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6018 -+#endif
6019 ---
6020 -1.8.1.4
6021 -
6022
6023 Deleted: genpatches-2.6/trunk/3.12/5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
6024 ===================================================================
6025 --- genpatches-2.6/trunk/3.12/5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2013-11-04 00:52:35 UTC (rev 2564)
6026 +++ genpatches-2.6/trunk/3.12/5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2013-11-04 10:09:31 UTC (rev 2565)
6027 @@ -1,1049 +0,0 @@
6028 -From 9acaa783ecab69925d38c6aca7252ff565a093d0 Mon Sep 17 00:00:00 2001
6029 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
6030 -Date: Fri, 14 Jun 2013 13:46:47 +0200
6031 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for
6032 - 3.11.0
6033 -
6034 -A set of processes may happen to perform interleaved reads, i.e., requests
6035 -whose union would give rise to a sequential read pattern. There are two
6036 -typical cases: in the first case, processes read fixed-size chunks of
6037 -data at a fixed distance from each other, while in the second case processes
6038 -may read variable-size chunks at variable distances. The latter case occurs
6039 -for example with KVM, which splits the I/O generated by the guest into
6040 -multiple chunks, and lets these chunks be served by a pool of cooperating
6041 -processes, iteratively assigning the next chunk of I/O to the first
6042 -available process. CFQ uses actual queue merging for the first type of
6043 -processes, whereas it uses preemption to get a sequential read pattern out
6044 -of the read requests performed by the second type of processes. In the end
6045 -it uses two different mechanisms to achieve the same goal: boosting the
6046 -throughput with interleaved I/O.
6047 -
6048 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
6049 -sequential read pattern with both types of processes. The main idea is
6050 -checking newly arrived requests against the next request of the active queue
6051 -both in case of actual request insert and in case of request merge. By doing
6052 -so, both the types of processes can be handled by just merging their queues.
6053 -EQM is then simpler and more compact than the pair of mechanisms used in
6054 -CFQ.
6055 -
6056 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
6057 -properly restoring the weight-raising state of a queue when it gets back to
6058 -a non-merged state.
6059 -
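The first case described above, fixed-size chunks read at a fixed distance, can be pictured with a small userspace sketch, purely illustrative and not part of the patch: N readers each read every N-th chunk of the same file, so the union of their requests forms one sequential stream.

/* Illustrative only: readers whose requests interleave into a sequential
 * pattern, the kind of workload EQM is meant to detect and merge. */
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

#define CHUNK	(128 * 1024)

static void interleaved_reader(const char *path, int reader, int nr_readers)
{
	static char buf[CHUNK];
	int fd = open(path, O_RDONLY);
	off_t off = (off_t)reader * CHUNK;

	if (fd < 0)
		return;
	/* reader 0 reads chunks 0, N, 2N, ...; reader 1 reads 1, N+1, ... */
	while (pread(fd, buf, CHUNK, off) == CHUNK)
		off += (off_t)nr_readers * CHUNK;
	close(fd);
}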
6060 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
6061 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6062 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
6063 ----
6064 - block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++----------------
6065 - block/bfq-sched.c | 28 ---
6066 - block/bfq.h | 16 ++
6067 - 3 files changed, 466 insertions(+), 231 deletions(-)
6068 -
6069 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
6070 -index 0ed2746..bbe79fb 100644
6071 ---- a/block/bfq-iosched.c
6072 -+++ b/block/bfq-iosched.c
6073 -@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
6074 - return dur;
6075 - }
6076 -
6077 -+static inline void
6078 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
6079 -+{
6080 -+ if (bic->saved_idle_window)
6081 -+ bfq_mark_bfqq_idle_window(bfqq);
6082 -+ else
6083 -+ bfq_clear_bfqq_idle_window(bfqq);
6084 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
6085 -+ /*
6086 -+ * Start a weight raising period with the duration given by
6087 -+ * the raising_time_left snapshot.
6088 -+ */
6089 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
6090 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
6091 -+ bfqq->last_rais_start_finish = jiffies;
6092 -+ }
6093 -+ /*
6094 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
6095 -+ * getting confused about the queue's need for a weight-raising
6096 -+ * period.
6097 -+ */
6098 -+ bic->raising_time_left = 0;
6099 -+}
6100 -+
6101 -+/*
6102 -+ * Must be called with the queue_lock held.
6103 -+ */
6104 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
6105 -+{
6106 -+ int process_refs, io_refs;
6107 -+
6108 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
6109 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
6110 -+ BUG_ON(process_refs < 0);
6111 -+ return process_refs;
6112 -+}
6113 -+
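As a quick check of the arithmetic above: with atomic ref = 5, allocated[READ] + allocated[WRITE] = 3 and entity.on_st = 1, the queue retains 5 - 3 - 1 = 1 process reference, i.e. exactly one process still owns it.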
6114 - static void bfq_add_rq_rb(struct request *rq)
6115 - {
6116 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
6117 -@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq)
6118 - if (! bfqd->low_latency)
6119 - goto add_bfqq_busy;
6120 -
6121 -+ if (bfq_bfqq_just_split(bfqq))
6122 -+ goto set_ioprio_changed;
6123 -+
6124 - /*
6125 -- * If the queue is not being boosted and has been idle
6126 -- * for enough time, start a weight-raising period
6127 -+ * If the queue:
6128 -+ * - is not being boosted,
6129 -+ * - has been idle for enough time,
6130 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
6131 -+ * shared by nature, or it is not shared and its
6132 -+ * requests have not been redirected to a shared queue)
6133 -+ * start a weight-raising period.
6134 - */
6135 -- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
6136 -+ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
6137 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
6138 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
6139 - if (idle_for_long_time)
6140 - bfqq->raising_cur_max_time =
6141 -@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq)
6142 - raising_cur_max_time));
6143 - }
6144 - }
6145 -+set_ioprio_changed:
6146 - if (old_raising_coeff != bfqq->raising_coeff)
6147 - entity->ioprio_changed = 1;
6148 - add_bfqq_busy:
6149 -@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
6150 - spin_unlock_irq(bfqd->queue->queue_lock);
6151 - }
6152 -
6153 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
6154 -- struct bio *bio)
6155 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
6156 - {
6157 -- struct bfq_data *bfqd = q->elevator->elevator_data;
6158 -- struct bfq_io_cq *bic;
6159 -- struct bfq_queue *bfqq;
6160 --
6161 -- /*
6162 -- * Disallow merge of a sync bio into an async request.
6163 -- */
6164 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
6165 -- return 0;
6166 --
6167 -- /*
6168 -- * Lookup the bfqq that this bio will be queued with. Allow
6169 -- * merge only if rq is queued there.
6170 -- * Queue lock is held here.
6171 -- */
6172 -- bic = bfq_bic_lookup(bfqd, current->io_context);
6173 -- if (bic == NULL)
6174 -- return 0;
6175 --
6176 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
6177 -- return bfqq == RQ_BFQQ(rq);
6178 --}
6179 --
6180 --static void __bfq_set_active_queue(struct bfq_data *bfqd,
6181 -- struct bfq_queue *bfqq)
6182 --{
6183 -- if (bfqq != NULL) {
6184 -- bfq_mark_bfqq_must_alloc(bfqq);
6185 -- bfq_mark_bfqq_budget_new(bfqq);
6186 -- bfq_clear_bfqq_fifo_expire(bfqq);
6187 --
6188 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
6189 --
6190 -- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
6191 -- bfqq->entity.budget);
6192 -- }
6193 --
6194 -- bfqd->active_queue = bfqq;
6195 --}
6196 --
6197 --/*
6198 -- * Get and set a new active queue for service.
6199 -- */
6200 --static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
6201 -- struct bfq_queue *bfqq)
6202 --{
6203 -- if (!bfqq)
6204 -- bfqq = bfq_get_next_queue(bfqd);
6205 -+ if (request)
6206 -+ return blk_rq_pos(io_struct);
6207 - else
6208 -- bfq_get_next_queue_forced(bfqd, bfqq);
6209 --
6210 -- __bfq_set_active_queue(bfqd, bfqq);
6211 -- return bfqq;
6212 -+ return ((struct bio *)io_struct)->bi_sector;
6213 - }
6214 -
6215 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
6216 -- struct request *rq)
6217 -+static inline sector_t bfq_dist_from(sector_t pos1,
6218 -+ sector_t pos2)
6219 - {
6220 -- if (blk_rq_pos(rq) >= bfqd->last_position)
6221 -- return blk_rq_pos(rq) - bfqd->last_position;
6222 -+ if (pos1 >= pos2)
6223 -+ return pos1 - pos2;
6224 - else
6225 -- return bfqd->last_position - blk_rq_pos(rq);
6226 -+ return pos2 - pos1;
6227 - }
6228 -
6229 --/*
6230 -- * Return true if bfqq has no request pending and rq is close enough to
6231 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
6232 -- * bfqq->next_rq
6233 -- */
6234 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
6235 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
6236 -+ sector_t sector)
6237 - {
6238 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
6239 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
6240 -+ BFQQ_SEEK_THR;
6241 - }
6242 -
6243 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6244 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
6245 - {
6246 - struct rb_root *root = &bfqd->rq_pos_tree;
6247 - struct rb_node *parent, *node;
6248 - struct bfq_queue *__bfqq;
6249 -- sector_t sector = bfqd->last_position;
6250 -
6251 - if (RB_EMPTY_ROOT(root))
6252 - return NULL;
6253 -@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6254 - * position).
6255 - */
6256 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
6257 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
6258 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
6259 - return __bfqq;
6260 -
6261 - if (blk_rq_pos(__bfqq->next_rq) < sector)
6262 -@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6263 - return NULL;
6264 -
6265 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
6266 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
6267 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
6268 - return __bfqq;
6269 -
6270 - return NULL;
6271 -@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6272 - /*
6273 - * bfqd - obvious
6274 - * cur_bfqq - passed in so that we don't decide that the current queue
6275 -- * is closely cooperating with itself.
6276 -- *
6277 -- * We are assuming that cur_bfqq has dispatched at least one request,
6278 -- * and that bfqd->last_position reflects a position on the disk associated
6279 -- * with the I/O issued by cur_bfqq.
6280 -+ * is closely cooperating with itself
6281 -+ * sector - used as a reference point to search for a close queue
6282 - */
6283 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6284 -- struct bfq_queue *cur_bfqq)
6285 -+ struct bfq_queue *cur_bfqq,
6286 -+ sector_t sector)
6287 - {
6288 - struct bfq_queue *bfqq;
6289 -
6290 -@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6291 - * working closely on the same area of the disk. In that case,
6292 - * we can group them together and don't waste time idling.
6293 - */
6294 -- bfqq = bfqq_close(bfqd);
6295 -+ bfqq = bfqq_close(bfqd, sector);
6296 - if (bfqq == NULL || bfqq == cur_bfqq)
6297 - return NULL;
6298 -
6299 -@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6300 - return bfqq;
6301 - }
6302 -
6303 -+static struct bfq_queue *
6304 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6305 -+{
6306 -+ int process_refs, new_process_refs;
6307 -+ struct bfq_queue *__bfqq;
6308 -+
6309 -+ /*
6310 -+ * If there are no process references on the new_bfqq, then it is
6311 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
6312 -+ * may have dropped their last reference (not just their last process
6313 -+ * reference).
6314 -+ */
6315 -+ if (!bfqq_process_refs(new_bfqq))
6316 -+ return NULL;
6317 -+
6318 -+ /* Avoid a circular list and skip interim queue merges. */
6319 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
6320 -+ if (__bfqq == bfqq)
6321 -+ return NULL;
6322 -+ new_bfqq = __bfqq;
6323 -+ }
6324 -+
6325 -+ process_refs = bfqq_process_refs(bfqq);
6326 -+ new_process_refs = bfqq_process_refs(new_bfqq);
6327 -+ /*
6328 -+ * If the process for the bfqq has gone away, there is no
6329 -+ * sense in merging the queues.
6330 -+ */
6331 -+ if (process_refs == 0 || new_process_refs == 0)
6332 -+ return NULL;
6333 -+
6334 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
6335 -+ new_bfqq->pid);
6336 -+
6337 -+ /*
6338 -+ * Merging is just a redirection: the requests of the process owning
6339 -+ * one of the two queues are redirected to the other queue. The latter
6340 -+ * queue, in its turn, is set as shared if this is the first time that
6341 -+ * the requests of some process are redirected to it.
6342 -+ *
6343 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
6344 -+ * are in the context of the process owning bfqq, hence we have the
6345 -+ * io_cq of this process. So we can immediately configure this io_cq
6346 -+ * to redirect the requests of the process to new_bfqq.
6347 -+ *
6348 -+ * NOTE: even if new_bfqq coincides with the active queue, the io_cq of
6349 -+ * new_bfqq is not available, because, if the active queue is shared,
6350 -+ * bfqd->active_bic may not point to the io_cq of the active queue.
6351 -+ * Redirecting the requests of the process owning bfqq to the currently
6352 -+ * active queue is in any case the best option, as we feed the active queue
6353 -+ * with new requests close to the last request served and, by doing so,
6354 -+ * hopefully increase the throughput.
6355 -+ */
6356 -+ bfqq->new_bfqq = new_bfqq;
6357 -+ atomic_add(process_refs, &new_bfqq->ref);
6358 -+ return new_bfqq;
6359 -+}
6360 -+
6361 -+/*
6362 -+ * Attempt to schedule a merge of bfqq with the currently active queue or
6363 -+ * with a close queue among the scheduled queues.
6364 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
6365 -+ * structure otherwise.
6366 -+ */
6367 -+static struct bfq_queue *
6368 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6369 -+ void *io_struct, bool request)
6370 -+{
6371 -+ struct bfq_queue *active_bfqq, *new_bfqq;
6372 -+
6373 -+ if (bfqq->new_bfqq)
6374 -+ return bfqq->new_bfqq;
6375 -+
6376 -+ if (!io_struct)
6377 -+ return NULL;
6378 -+
6379 -+ active_bfqq = bfqd->active_queue;
6380 -+
6381 -+ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic)
6382 -+ goto check_scheduled;
6383 -+
6384 -+ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq))
6385 -+ goto check_scheduled;
6386 -+
6387 -+ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq))
6388 -+ goto check_scheduled;
6389 -+
6390 -+ if (active_bfqq->entity.parent != bfqq->entity.parent)
6391 -+ goto check_scheduled;
6392 -+
6393 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
6394 -+ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq))
6395 -+ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq)))
6396 -+ return new_bfqq; /* Merge with the active queue */
6397 -+
6398 -+ /*
6399 -+ * Check whether there is a cooperator among currently scheduled
6400 -+ * queues. The only thing we need is that the bio/request is not
6401 -+ * NULL, as we need it to establish whether a cooperator exists.
6402 -+ */
6403 -+check_scheduled:
6404 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
6405 -+ bfq_io_struct_pos(io_struct, request));
6406 -+ if (new_bfqq)
6407 -+ return bfq_setup_merge(bfqq, new_bfqq);
6408 -+
6409 -+ return NULL;
6410 -+}
6411 -+
6412 -+static inline void
6413 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
6414 -+{
6415 -+ /*
6416 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
6417 -+ * have already been redirected to a shared queue; both idle window
6418 -+ * and weight raising state have already been saved. Do nothing.
6419 -+ */
6420 -+ if (bfqq->bic == NULL)
6421 -+ return;
6422 -+ if (bfqq->bic->raising_time_left)
6423 -+ /*
6424 -+ * This is the queue of a just-started process, and would
6425 -+ * deserve weight raising: we set raising_time_left to the full
6426 -+ * weight-raising duration to trigger weight-raising when and
6427 -+ * if the queue is split and the first request of the queue
6428 -+ * is enqueued.
6429 -+ */
6430 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
6431 -+ else if (bfqq->raising_coeff > 1) {
6432 -+ unsigned long wrais_duration =
6433 -+ jiffies - bfqq->last_rais_start_finish;
6434 -+ /*
6435 -+ * It may happen that a queue's weight raising period lasts
6436 -+ * longer than its raising_cur_max_time, as weight raising is
6437 -+ * handled only when a request is enqueued or dispatched (it
6438 -+ * does not use any timer). If the weight raising period is
6439 -+ * about to end, don't save it.
6440 -+ */
6441 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
6442 -+ bfqq->bic->raising_time_left = 0;
6443 -+ else
6444 -+ bfqq->bic->raising_time_left =
6445 -+ bfqq->raising_cur_max_time - wrais_duration;
6446 -+ /*
6447 -+ * The bfq_queue is becoming shared or the requests of the
6448 -+ * process owning the queue are being redirected to a shared
6449 -+ * queue. Stop the weight raising period of the queue, as in
6450 -+ * both cases it should not be owned by an interactive or soft
6451 -+ * real-time application.
6452 -+ */
6453 -+ bfq_bfqq_end_raising(bfqq);
6454 -+ } else
6455 -+ bfqq->bic->raising_time_left = 0;
6456 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
6457 -+}
6458 -+
6459 -+static inline void
6460 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
6461 -+{
6462 -+ /*
6463 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
6464 -+ * is about to begin using a shared bfq_queue.
6465 -+ */
6466 -+ if (bfqq->bic)
6467 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
6468 -+}
6469 -+
6470 -+static void
6471 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
6472 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6473 -+{
6474 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
6475 -+ (long unsigned)new_bfqq->pid);
6476 -+ /* Save weight raising and idle window of the merged queues */
6477 -+ bfq_bfqq_save_state(bfqq);
6478 -+ bfq_bfqq_save_state(new_bfqq);
6479 -+ /*
6480 -+ * Grab a reference to the bic, to prevent it from being destroyed
6481 -+ * before being possibly touched by a bfq_split_bfqq().
6482 -+ */
6483 -+ bfq_get_bic_reference(bfqq);
6484 -+ bfq_get_bic_reference(new_bfqq);
6485 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
6486 -+ bic_set_bfqq(bic, new_bfqq, 1);
6487 -+ bfq_mark_bfqq_coop(new_bfqq);
6488 -+ /*
6489 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
6490 -+ * new_bfqq->bic to NULL. bfqq either:
6491 -+ * - does not belong to any bic any more, and hence bfqq->bic must
6492 -+ * be set to NULL, or
6493 -+ * - is a queue whose owning bics have already been redirected to a
6494 -+ * different queue, hence the queue is destined to not belong to any
6495 -+ * bic soon and bfqq->bic is already NULL (therefore the next
6496 -+ * assignment causes no harm).
6497 -+ */
6498 -+ new_bfqq->bic = NULL;
6499 -+ bfqq->bic = NULL;
6500 -+ bfq_put_queue(bfqq);
6501 -+}
6502 -+
6503 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
6504 -+ struct bio *bio)
6505 -+{
6506 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
6507 -+ struct bfq_io_cq *bic;
6508 -+ struct bfq_queue *bfqq, *new_bfqq;
6509 -+
6510 -+ /*
6511 -+ * Disallow merge of a sync bio into an async request.
6512 -+ */
6513 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
6514 -+ return 0;
6515 -+
6516 -+ /*
6517 -+ * Lookup the bfqq that this bio will be queued with. Allow
6518 -+ * merge only if rq is queued there.
6519 -+ * Queue lock is held here.
6520 -+ */
6521 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
6522 -+ if (bic == NULL)
6523 -+ return 0;
6524 -+
6525 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
6526 -+ /*
6527 -+ * We take advantage of this function to perform an early merge
6528 -+ * of the queues of possible cooperating processes.
6529 -+ */
6530 -+ if (bfqq != NULL &&
6531 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) {
6532 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
6533 -+ /*
6534 -+ * If we get here, the bio will be queued in the shared queue,
6535 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
6536 -+ * rq can be merged.
6537 -+ */
6538 -+ bfqq = new_bfqq;
6539 -+ }
6540 -+
6541 -+ return bfqq == RQ_BFQQ(rq);
6542 -+}
6543 -+
6544 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
6545 -+ struct bfq_queue *bfqq)
6546 -+{
6547 -+ if (bfqq != NULL) {
6548 -+ bfq_mark_bfqq_must_alloc(bfqq);
6549 -+ bfq_mark_bfqq_budget_new(bfqq);
6550 -+ bfq_clear_bfqq_fifo_expire(bfqq);
6551 -+
6552 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
6553 -+
6554 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
6555 -+ bfqq->entity.budget);
6556 -+ }
6557 -+
6558 -+ bfqd->active_queue = bfqq;
6559 -+}
6560 -+
6561 -+/*
6562 -+ * Get and set a new active queue for service.
6563 -+ */
6564 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
6565 -+{
6566 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
6567 -+
6568 -+ __bfq_set_active_queue(bfqd, bfqq);
6569 -+ return bfqq;
6570 -+}
6571 -+
6572 - /*
6573 - * If enough samples have been computed, return the current max budget
6574 - * stored in bfqd, which is dynamically updated according to the
6575 -@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
6576 - return rq;
6577 - }
6578 -
6579 --/*
6580 -- * Must be called with the queue_lock held.
6581 -- */
6582 --static int bfqq_process_refs(struct bfq_queue *bfqq)
6583 --{
6584 -- int process_refs, io_refs;
6585 --
6586 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
6587 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
6588 -- BUG_ON(process_refs < 0);
6589 -- return process_refs;
6590 --}
6591 --
6592 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6593 --{
6594 -- int process_refs, new_process_refs;
6595 -- struct bfq_queue *__bfqq;
6596 --
6597 -- /*
6598 -- * If there are no process references on the new_bfqq, then it is
6599 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
6600 -- * may have dropped their last reference (not just their last process
6601 -- * reference).
6602 -- */
6603 -- if (!bfqq_process_refs(new_bfqq))
6604 -- return;
6605 --
6606 -- /* Avoid a circular list and skip interim queue merges. */
6607 -- while ((__bfqq = new_bfqq->new_bfqq)) {
6608 -- if (__bfqq == bfqq)
6609 -- return;
6610 -- new_bfqq = __bfqq;
6611 -- }
6612 --
6613 -- process_refs = bfqq_process_refs(bfqq);
6614 -- new_process_refs = bfqq_process_refs(new_bfqq);
6615 -- /*
6616 -- * If the process for the bfqq has gone away, there is no
6617 -- * sense in merging the queues.
6618 -- */
6619 -- if (process_refs == 0 || new_process_refs == 0)
6620 -- return;
6621 --
6622 -- /*
6623 -- * Merge in the direction of the lesser amount of work.
6624 -- */
6625 -- if (new_process_refs >= process_refs) {
6626 -- bfqq->new_bfqq = new_bfqq;
6627 -- atomic_add(process_refs, &new_bfqq->ref);
6628 -- } else {
6629 -- new_bfqq->new_bfqq = bfqq;
6630 -- atomic_add(new_process_refs, &bfqq->ref);
6631 -- }
6632 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
6633 -- new_bfqq->pid);
6634 --}
6635 --
6636 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
6637 - {
6638 - struct bfq_entity *entity = &bfqq->entity;
6639 -@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
6640 - * is likely to boost the disk throughput);
6641 - * - the queue is weight-raised (waiting for the request is necessary for
6642 - * providing the queue with fairness and latency guarantees).
6643 -+ *
6644 -+ * In any case, idling can be disabled for cooperation issues, if
6645 -+ * 1) there is a close cooperator for the queue, or
6646 -+ * 2) the queue is shared and some cooperator is likely to be idle (in this
6647 -+ * case, by not arming the idle timer, we try to slow down the queue, to
6648 -+ * prevent the zones of the disk accessed by the active cooperators to
6649 -+ * become too distant from the zone that will be accessed by the currently
6650 -+ * idle cooperators).
6651 - */
6652 - static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6653 - int budg_timeout)
6654 -@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6655 - (bfqd->rq_in_driver == 0 ||
6656 - budg_timeout ||
6657 - bfqq->raising_coeff > 1) &&
6658 -- !bfq_close_cooperator(bfqd, bfqq) &&
6659 -+ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) &&
6660 - (!bfq_bfqq_coop(bfqq) ||
6661 - !bfq_bfqq_some_coop_idle(bfqq)) &&
6662 - !bfq_queue_nonrot_noidle(bfqd, bfqq));
6663 -@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6664 - */
6665 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6666 - {
6667 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
6668 -+ struct bfq_queue *bfqq;
6669 - struct request *next_rq;
6670 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
6671 - int budg_timeout;
6672 -@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6673 -
6674 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
6675 -
6676 -- /*
6677 -- * If another queue has a request waiting within our mean seek
6678 -- * distance, let it run. The expire code will check for close
6679 -- * cooperators and put the close queue at the front of the
6680 -- * service tree. If possible, merge the expiring queue with the
6681 -- * new bfqq.
6682 -- */
6683 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
6684 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
6685 -- bfq_setup_merge(bfqq, new_bfqq);
6686 --
6687 - budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
6688 - if (budg_timeout &&
6689 - !bfq_bfqq_must_idle(bfqq, budg_timeout))
6690 -@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6691 - bfq_clear_bfqq_wait_request(bfqq);
6692 - del_timer(&bfqd->idle_slice_timer);
6693 - }
6694 -- if (new_bfqq == NULL)
6695 -- goto keep_queue;
6696 -- else
6697 -- goto expire;
6698 -+ goto keep_queue;
6699 - }
6700 - }
6701 -
6702 -@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6703 - * queue still has requests in flight or is idling for a new request,
6704 - * then keep it.
6705 - */
6706 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
6707 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
6708 - (bfqq->dispatched != 0 &&
6709 - (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
6710 -- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
6711 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
6712 - bfqq = NULL;
6713 - goto keep_queue;
6714 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
6715 -- /*
6716 -- * Expiring the queue because there is a close cooperator,
6717 -- * cancel timer.
6718 -- */
6719 -- bfq_clear_bfqq_wait_request(bfqq);
6720 -- del_timer(&bfqd->idle_slice_timer);
6721 - }
6722 -
6723 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
6724 - expire:
6725 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
6726 - new_queue:
6727 -- bfqq = bfq_set_active_queue(bfqd, new_bfqq);
6728 -+ bfqq = bfq_set_active_queue(bfqd);
6729 - bfq_log(bfqd, "select_queue: new queue %d returned",
6730 - bfqq != NULL ? bfqq->pid : 0);
6731 - keep_queue:
6732 -@@ -1617,9 +1807,8 @@ keep_queue:
6733 -
6734 - static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6735 - {
6736 -+ struct bfq_entity *entity = &bfqq->entity;
6737 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
6738 -- struct bfq_entity *entity = &bfqq->entity;
6739 --
6740 - bfq_log_bfqq(bfqd, bfqq,
6741 - "raising period dur %u/%u msec, "
6742 - "old raising coeff %u, w %d(%d)",
6743 -@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6744 - jiffies_to_msecs(bfqq->
6745 - raising_cur_max_time));
6746 - bfq_bfqq_end_raising(bfqq);
6747 -- __bfq_entity_update_weight_prio(
6748 -- bfq_entity_service_tree(entity),
6749 -- entity);
6750 - }
6751 - }
6752 - }
6753 -+ /* Update weight both if it must be raised and if it must be lowered */
6754 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
6755 -+ __bfq_entity_update_weight_prio(
6756 -+ bfq_entity_service_tree(entity),
6757 -+ entity);
6758 - }
6759 -
6760 - /*
6761 -@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq)
6762 - struct bfq_io_cq *bic = icq_to_bic(icq);
6763 -
6764 - bic->ttime.last_end_request = jiffies;
6765 -+ /*
6766 -+ * A newly created bic indicates that the process has just
6767 -+ * started doing I/O, and is probably mapping into memory its
6768 -+ * executable and libraries: it definitely needs weight raising.
6769 -+ * There is however the possibility that the process performs,
6770 -+ * for a while, I/O close to some other process. EQM intercepts
6771 -+ * this behavior and may merge the queue corresponding to the
6772 -+ * process with some other queue, BEFORE the weight of the queue
6773 -+ * is raised. Merged queues are not weight-raised (they are assumed
6774 -+ * to belong to processes that benefit only from high throughput).
6775 -+ * If the merge is basically the consequence of an accident, then
6776 -+ * the queue will be split soon and will get back its old weight.
6777 -+ * It is then important to write down somewhere that this queue
6778 -+ * does need weight raising, even if it did not make it to get its
6779 -+ * weight raised before being merged. To this purpose, we overload
6780 -+ * the field raising_time_left and assign 1 to it, to mark the queue
6781 -+ * as needing weight raising.
6782 -+ */
6783 -+ bic->raising_time_left = 1;
6784 - }
6785 -
6786 - static void bfq_exit_icq(struct io_cq *icq)
6787 -@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq)
6788 - }
6789 -
6790 - if (bic->bfqq[BLK_RW_SYNC]) {
6791 -+ /*
6792 -+ * If the bic is using a shared queue, put the reference
6793 -+ * taken on the io_context when the bic started using a
6794 -+ * shared bfq_queue.
6795 -+ */
6796 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
6797 -+ put_io_context(icq->ioc);
6798 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
6799 - bic->bfqq[BLK_RW_SYNC] = NULL;
6800 - }
6801 -@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
6802 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
6803 - return;
6804 -
6805 -+ /* Idle window just restored, statistics are meaningless. */
6806 -+ if (bfq_bfqq_just_split(bfqq))
6807 -+ return;
6808 -+
6809 - enable_idle = bfq_bfqq_idle_window(bfqq);
6810 -
6811 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
6812 -@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6813 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
6814 - !BFQQ_SEEKY(bfqq))
6815 - bfq_update_idle_window(bfqd, bfqq, bic);
6816 -+ bfq_clear_bfqq_just_split(bfqq);
6817 -
6818 - bfq_log_bfqq(bfqd, bfqq,
6819 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
6820 -@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6821 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
6822 - {
6823 - struct bfq_data *bfqd = q->elevator->elevator_data;
6824 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
6825 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
6826 -
6827 - assert_spin_locked(bfqd->queue->queue_lock);
6828 -+
6829 -+ /*
6830 -+ * An unplug may trigger a requeue of a request from the device
6831 -+ * driver: make sure we are in process context while trying to
6832 -+ * merge two bfq_queues.
6833 -+ */
6834 -+ if (!in_interrupt() &&
6835 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) {
6836 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
6837 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
6838 -+ /*
6839 -+ * Release the request's reference to the old bfqq
6840 -+ * and make sure one is taken to the shared queue.
6841 -+ */
6842 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
6843 -+ bfqq->allocated[rq_data_dir(rq)]--;
6844 -+ atomic_inc(&new_bfqq->ref);
6845 -+ bfq_put_queue(bfqq);
6846 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
6847 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq);
6848 -+ rq->elv.priv[1] = new_bfqq;
6849 -+ bfqq = new_bfqq;
6850 -+ }
6851 -+
6852 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
6853 -
6854 - bfq_add_rq_rb(rq);
6855 -
6856 -+ /*
6857 -+ * Here a newly-created bfq_queue has already started a weight-raising
6858 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
6859 -+ * from assigning it a full weight-raising period. See the detailed
6860 -+ * comments about this field in bfq_init_icq().
6861 -+ */
6862 -+ if (bfqq->bic != NULL)
6863 -+ bfqq->bic->raising_time_left = 0;
6864 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
6865 - list_add_tail(&rq->queuelist, &bfqq->fifo);
6866 -
6867 -@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
6868 - if (bfq_bfqq_budget_new(bfqq))
6869 - bfq_set_budget_timeout(bfqd);
6870 -
6871 -- /* Idling is disabled also for cooperation issues:
6872 -- * 1) there is a close cooperator for the queue, or
6873 -- * 2) the queue is shared and some cooperator is likely
6874 -- * to be idle (in this case, by not arming the idle timer,
6875 -- * we try to slow down the queue, to prevent the zones
6876 -- * of the disk accessed by the active cooperators to become
6877 -- * too distant from the zone that will be accessed by the
6878 -- * currently idle cooperators)
6879 -- */
6880 - if (bfq_bfqq_must_idle(bfqq, budg_timeout))
6881 - bfq_arm_slice_timer(bfqd);
6882 - else if (budg_timeout)
6883 -@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq)
6884 - }
6885 - }
6886 -
6887 --static struct bfq_queue *
6888 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
6889 -- struct bfq_queue *bfqq)
6890 --{
6891 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
6892 -- (long unsigned)bfqq->new_bfqq->pid);
6893 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
6894 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
6895 -- bfq_put_queue(bfqq);
6896 -- return bic_to_bfqq(bic, 1);
6897 --}
6898 --
6899 - /*
6900 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
6901 - * was the last process referring to said bfqq.
6902 -@@ -2469,6 +2702,9 @@ static struct bfq_queue *
6903 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
6904 - {
6905 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
6906 -+
6907 -+ put_io_context(bic->icq.ioc);
6908 -+
6909 - if (bfqq_process_refs(bfqq) == 1) {
6910 - bfqq->pid = current->pid;
6911 - bfq_clear_bfqq_some_coop_idle(bfqq);
6912 -@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
6913 - struct bfq_queue *bfqq;
6914 - struct bfq_group *bfqg;
6915 - unsigned long flags;
6916 -+ bool split = false;
6917 -
6918 - might_sleep_if(gfp_mask & __GFP_WAIT);
6919 -
6920 -@@ -2516,24 +2753,14 @@ new_queue:
6921 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
6922 - bic_set_bfqq(bic, bfqq, is_sync);
6923 - } else {
6924 -- /*
6925 -- * If the queue was seeky for too long, break it apart.
6926 -- */
6927 -+ /* If the queue was seeky for too long, break it apart. */
6928 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
6929 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
6930 - bfqq = bfq_split_bfqq(bic, bfqq);
6931 -+ split = true;
6932 - if (!bfqq)
6933 - goto new_queue;
6934 - }
6935 --
6936 -- /*
6937 -- * Check to see if this queue is scheduled to merge with
6938 -- * another closely cooperating queue. The merging of queues
6939 -- * happens here as it must be done in process context.
6940 -- * The reference on new_bfqq was taken in merge_bfqqs.
6941 -- */
6942 -- if (bfqq->new_bfqq != NULL)
6943 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
6944 - }
6945 -
6946 - bfqq->allocated[rw]++;
6947 -@@ -2544,6 +2771,26 @@ new_queue:
6948 - rq->elv.priv[0] = bic;
6949 - rq->elv.priv[1] = bfqq;
6950 -
6951 -+ /*
6952 -+ * If a bfq_queue has only one process reference, it is owned
6953 -+ * by only one bfq_io_cq: we can set the bic field of the
6954 -+ * bfq_queue to the address of that structure. Also, if the
6955 -+ * queue has just been split, mark a flag so that the
6956 -+ * information is available to the other scheduler hooks.
6957 -+ */
6958 -+ if (bfqq_process_refs(bfqq) == 1) {
6959 -+ bfqq->bic = bic;
6960 -+ if (split) {
6961 -+ bfq_mark_bfqq_just_split(bfqq);
6962 -+ /*
6963 -+ * If the queue has just been split from a shared queue,
6964 -+ * restore the idle window and the possible weight
6965 -+ * raising period.
6966 -+ */
6967 -+ bfq_bfqq_resume_state(bfqq, bic);
6968 -+ }
6969 -+ }
6970 -+
6971 - spin_unlock_irqrestore(q->queue_lock, flags);
6972 -
6973 - return 0;
6974 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
6975 -index 03f8061..a0edaa2 100644
6976 ---- a/block/bfq-sched.c
6977 -+++ b/block/bfq-sched.c
6978 -@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
6979 - return bfqq;
6980 - }
6981 -
6982 --/*
6983 -- * Forced extraction of the given queue.
6984 -- */
6985 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
6986 -- struct bfq_queue *bfqq)
6987 --{
6988 -- struct bfq_entity *entity;
6989 -- struct bfq_sched_data *sd;
6990 --
6991 -- BUG_ON(bfqd->active_queue != NULL);
6992 --
6993 -- entity = &bfqq->entity;
6994 -- /*
6995 -- * Bubble up extraction/update from the leaf to the root.
6996 -- */
6997 -- for_each_entity(entity) {
6998 -- sd = entity->sched_data;
6999 -- bfq_update_budget(entity);
7000 -- bfq_update_vtime(bfq_entity_service_tree(entity));
7001 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
7002 -- sd->active_entity = entity;
7003 -- sd->next_active = NULL;
7004 -- entity->service = 0;
7005 -- }
7006 --
7007 -- return;
7008 --}
7009 --
7010 - static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
7011 - {
7012 - if (bfqd->active_bic != NULL) {
7013 -diff --git a/block/bfq.h b/block/bfq.h
7014 -index 48ecde9..bb52975 100644
7015 ---- a/block/bfq.h
7016 -+++ b/block/bfq.h
7017 -@@ -188,6 +188,8 @@ struct bfq_group;
7018 - * @pid: pid of the process owning the queue, used for logging purposes.
7019 - * @last_rais_start_time: last (idle -> weight-raised) transition attempt
7020 - * @raising_cur_max_time: current max raising time for this queue
7021 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
7022 -+ * queue is shared
7023 - *
7024 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
7025 - * or more (if it is an async one). @cgroup holds a reference to the
7026 -@@ -231,6 +233,7 @@ struct bfq_queue {
7027 - sector_t last_request_pos;
7028 -
7029 - pid_t pid;
7030 -+ struct bfq_io_cq *bic;
7031 -
7032 - /* weight-raising fields */
7033 - unsigned int raising_cur_max_time;
7034 -@@ -257,12 +260,23 @@ struct bfq_ttime {
7035 - * @icq: associated io_cq structure
7036 - * @bfqq: array of two process queues, the sync and the async
7037 - * @ttime: associated @bfq_ttime struct
7038 -+ * @raising_time_left: snapshot of the time left before weight raising ends
7039 -+ * for the sync queue associated to this process; this
7040 -+ * snapshot is taken to remember this value while the weight
7041 -+ * raising is suspended because the queue is merged with a
7042 -+ * shared queue, and is used to set @raising_cur_max_time
7043 -+ * when the queue is split from the shared queue and its
7044 -+ * weight is raised again
7045 -+ * @saved_idle_window: same purpose as the previous field for the idle window
7046 - */
7047 - struct bfq_io_cq {
7048 - struct io_cq icq; /* must be the first member */
7049 - struct bfq_queue *bfqq[2];
7050 - struct bfq_ttime ttime;
7051 - int ioprio;
7052 -+
7053 -+ unsigned int raising_time_left;
7054 -+ unsigned int saved_idle_window;
7055 - };
7056 -
7057 - /**
7058 -@@ -403,6 +417,7 @@ enum bfqq_state_flags {
7059 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
7060 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
7061 - BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
7062 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
7063 - };
7064 -
7065 - #define BFQ_BFQQ_FNS(name) \
7066 -@@ -430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new);
7067 - BFQ_BFQQ_FNS(coop);
7068 - BFQ_BFQQ_FNS(split_coop);
7069 - BFQ_BFQQ_FNS(some_coop_idle);
7070 -+BFQ_BFQQ_FNS(just_split);
7071 - #undef BFQ_BFQQ_FNS
7072 -
7073 - /* Logging facilities. */
7074 ---
7075 -1.8.1.4
7076 -
7077
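A note for readers skimming the queue-merging (EQM) hunks above, which belong to the old dash-named BFQ patch file being replaced by the underscore-named copies added below: the new bfq_setup_merge() always redirects bfqq towards the final target of the ->new_bfqq chain, instead of merging in the direction of the lesser amount of work as the deleted helper did. The following is a standalone, simplified C sketch of that chain walk; the struct and field names are illustrative stand-ins, not the kernel's bfq_queue.

/*
 * Simplified sketch of the merge scheduling performed by bfq_setup_merge()
 * in the hunks above: follow the ->new_bfqq chain to its final target,
 * refusing to merge on a cycle or when either queue has lost all of its
 * process references.
 */
#include <stddef.h>

struct toy_queue {
        int refs;                    /* total references held on the queue */
        int process_refs;            /* references held by processes doing I/O */
        struct toy_queue *new_bfqq;  /* queue this one is scheduled to merge into */
};

static struct toy_queue *schedule_merge(struct toy_queue *bfqq,
                                        struct toy_queue *new_bfqq)
{
        struct toy_queue *next;

        /* A target without process references may already be going away. */
        if (new_bfqq->process_refs == 0)
                return NULL;

        /* Skip interim merges; give up if the chain loops back to bfqq. */
        while ((next = new_bfqq->new_bfqq) != NULL) {
                if (next == bfqq)
                        return NULL;
                new_bfqq = next;
        }

        /* If either process has gone away, merging makes no sense. */
        if (bfqq->process_refs == 0 || new_bfqq->process_refs == 0)
                return NULL;

        /* Redirect bfqq and take one reference per redirected process. */
        bfqq->new_bfqq = new_bfqq;
        new_bfqq->refs += bfqq->process_refs;
        return new_bfqq;
}

The actual redirection of the process's bic to the shared queue still happens separately, in bfq_merge_bfqqs(), as shown earlier in the same hunks.
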
7078 Added: genpatches-2.6/trunk/3.12/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
7079 ===================================================================
7080 --- genpatches-2.6/trunk/3.12/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch (rev 0)
7081 +++ genpatches-2.6/trunk/3.12/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2013-11-04 10:09:31 UTC (rev 2565)
7082 @@ -0,0 +1,97 @@
7083 +From 3728677b4d3cd39d83be87f9939328201b871c48 Mon Sep 17 00:00:00 2001
7084 +From: Arianna Avanzini <avanzini.arianna@×××××.com>
7085 +Date: Tue, 3 Sep 2013 16:50:42 +0200
7086 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.11
7087 +
7088 +Update Kconfig.iosched and do the related Makefile changes to include
7089 +kernel configuration options for BFQ. Also add the bfqio controller
7090 +to the cgroups subsystem.
7091 +
7092 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
7093 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
7094 +---
7095 + block/Kconfig.iosched | 25 +++++++++++++++++++++++++
7096 + block/Makefile | 1 +
7097 + include/linux/cgroup_subsys.h | 4 ++++
7098 + 3 files changed, 30 insertions(+)
7099 +
7100 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
7101 +index 421bef9..695e064 100644
7102 +--- a/block/Kconfig.iosched
7103 ++++ b/block/Kconfig.iosched
7104 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
7105 + ---help---
7106 + Enable group IO scheduling in CFQ.
7107 +
7108 ++config IOSCHED_BFQ
7109 ++ tristate "BFQ I/O scheduler"
7110 ++ default n
7111 ++ ---help---
7112 ++ The BFQ I/O scheduler tries to distribute bandwidth among
7113 ++ all processes according to their weights.
7114 ++ It aims at distributing the bandwidth as desired, independently of
7115 ++ the disk parameters and with any workload. It also tries to
7116 ++ guarantee low latency to interactive and soft real-time
7117 ++ applications. If compiled built-in (saying Y here), BFQ can
7118 ++ be configured to support hierarchical scheduling.
7119 ++
7120 ++config CGROUP_BFQIO
7121 ++ bool "BFQ hierarchical scheduling support"
7122 ++ depends on CGROUPS && IOSCHED_BFQ=y
7123 ++ default n
7124 ++ ---help---
7125 ++ Enable hierarchical scheduling in BFQ, using the cgroups
7126 ++ filesystem interface. The name of the subsystem will be
7127 ++ bfqio.
7128 ++
7129 + choice
7130 + prompt "Default I/O scheduler"
7131 + default DEFAULT_CFQ
7132 +@@ -52,6 +73,9 @@ choice
7133 + config DEFAULT_CFQ
7134 + bool "CFQ" if IOSCHED_CFQ=y
7135 +
7136 ++ config DEFAULT_BFQ
7137 ++ bool "BFQ" if IOSCHED_BFQ=y
7138 ++
7139 + config DEFAULT_NOOP
7140 + bool "No-op"
7141 +
7142 +@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED
7143 + string
7144 + default "deadline" if DEFAULT_DEADLINE
7145 + default "cfq" if DEFAULT_CFQ
7146 ++ default "bfq" if DEFAULT_BFQ
7147 + default "noop" if DEFAULT_NOOP
7148 +
7149 + endmenu
7150 +diff --git a/block/Makefile b/block/Makefile
7151 +index 39b76ba..c0d20fa 100644
7152 +--- a/block/Makefile
7153 ++++ b/block/Makefile
7154 +@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
7155 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
7156 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
7157 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
7158 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
7159 +
7160 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
7161 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
7162 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
7163 +index b613ffd..43c5dc9 100644
7164 +--- a/include/linux/cgroup_subsys.h
7165 ++++ b/include/linux/cgroup_subsys.h
7166 +@@ -39,6 +39,10 @@ SUBSYS(net_cls)
7167 + SUBSYS(blkio)
7168 + #endif
7169 +
7170 ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
7171 ++SUBSYS(bfqio)
7172 ++#endif
7173 ++
7174 + #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
7175 + SUBSYS(perf)
7176 + #endif
7177 +--
7178 +1.8.1.4
7179 +
7180
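The patch above only wires BFQ into the build: the Kconfig options, a Makefile rule, and a SUBSYS(bfqio) entry in cgroup_subsys.h. That entry is an x-macro: the kernel includes cgroup_subsys.h several times with different definitions of SUBSYS() to generate, among other things, the subsystem-id enum that the bfq-cgroup.c code added by the next patch looks up as bfqio_subsys_id. Below is a rough, self-contained C sketch of that pattern; the subsystem list and the surrounding names are illustrative only, not the real kernel headers.

#include <stdio.h>

/* Stand-in for the SUBSYS(...) entries of include/linux/cgroup_subsys.h. */
#define SUBSYS_LIST \
        SUBSYS(blkio)  \
        SUBSYS(bfqio)  /* the entry added above, under CONFIG_CGROUP_BFQIO */ \
        SUBSYS(perf)

/* First expansion: build the subsystem id enum. */
#define SUBSYS(name) name ## _subsys_id,
enum cgroup_subsys_id { SUBSYS_LIST CGROUP_SUBSYS_COUNT };
#undef SUBSYS

/* Second expansion: build a matching table of subsystem names. */
#define SUBSYS(name) #name,
static const char *subsys_names[] = { SUBSYS_LIST };
#undef SUBSYS

int main(void)
{
        printf("%s -> id %d (of %d subsystems)\n",
               subsys_names[bfqio_subsys_id], (int)bfqio_subsys_id,
               (int)CGROUP_SUBSYS_COUNT);
        return 0;
}

As the Kconfig hunk above shows, selecting CONFIG_IOSCHED_BFQ=y together with CONFIG_DEFAULT_BFQ=y makes CONFIG_DEFAULT_IOSCHED resolve to "bfq", while CONFIG_CGROUP_BFQIO additionally exposes the bfqio controller through the cgroups filesystem interface.
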
7181 Added: genpatches-2.6/trunk/3.12/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
7182 ===================================================================
7183 --- genpatches-2.6/trunk/3.12/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 (rev 0)
7184 +++ genpatches-2.6/trunk/3.12/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2013-11-04 10:09:31 UTC (rev 2565)
7185 @@ -0,0 +1,5773 @@
7186 +From 009b78bafe1763f71e6bdbb4f536b564a73b7db5 Mon Sep 17 00:00:00 2001
7187 +From: Arianna Avanzini <avanzini.arianna@×××××.com>
7188 +Date: Thu, 9 May 2013 19:10:02 +0200
7189 +Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.11
7190 +
7191 +Add the BFQ-v6r2 I/O scheduler to 3.11.
7192 +The general structure is borrowed from CFQ, as is much of the code. A (bfq_)queue
7193 +is associated with each task doing I/O on a device, and each time a
7194 +scheduling decision has to be made a queue is selected and served until
7195 +it expires.
7196 +
7197 + - Slices are given in the service domain: tasks are assigned
7198 + budgets, measured in number of sectors. Once granted the disk, a task
7199 + must however consume its assigned budget within a configurable
7200 + maximum time (by default, the maximum possible value of the
7201 + budgets is automatically computed to comply with this timeout).
7202 + This allows the desired latency vs "throughput boosting" tradeoff
7203 + to be set.
7204 +
7205 + - Budgets are scheduled according to a variant of WF2Q+, implemented
7206 + using an augmented rb-tree to take eligibility into account while
7207 + preserving an O(log N) overall complexity.
7208 +
7209 + - A low-latency tunable is provided; if enabled, both interactive
7210 + and soft real-time applications are guaranteed very low latency.
7211 +
7212 + - Latency guarantees are preserved also in the presence of NCQ.
7213 +
7214 + - Also with flash-based devices, a high throughput is achieved while
7215 + still preserving latency guarantees.
7216 +
7217 + - Useful features borrowed from CFQ: cooperating-queues merging (with
7218 + some additional optimizations with respect to the original CFQ version),
7219 + static fallback queue for OOM.
7220 +
7221 + - BFQ supports full hierarchical scheduling, exporting a cgroups
7222 + interface. Each node has a full scheduler, so each group can
7223 + be assigned its own ioprio (mapped to a weight, see next point)
7224 + and an ioprio_class.
7225 +
7226 + - If the cgroups interface is used, weights can be explicitly
7227 + assigned, otherwise ioprio values are mapped to weights using the
7228 + relation weight = IOPRIO_BE_NR - ioprio.
7229 +
7230 + - ioprio classes are served in strict priority order, i.e., lower
7231 + priority queues are not served as long as there are higher
7232 + priority queues. Among queues in the same class the bandwidth is
7233 + distributed in proportion to the weight of each queue. A very
7234 + thin extra bandwidth is however guaranteed to the Idle class, to
7235 + prevent it from starving.
7236 +
7237 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
7238 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
7239 +---
7240 + block/bfq-cgroup.c | 881 +++++++++++++++
7241 + block/bfq-ioc.c | 36 +
7242 + block/bfq-iosched.c | 3082 +++++++++++++++++++++++++++++++++++++++++++++++++++
7243 + block/bfq-sched.c | 1072 ++++++++++++++++++
7244 + block/bfq.h | 603 ++++++++++
7245 + 5 files changed, 5674 insertions(+)
7246 + create mode 100644 block/bfq-cgroup.c
7247 + create mode 100644 block/bfq-ioc.c
7248 + create mode 100644 block/bfq-iosched.c
7249 + create mode 100644 block/bfq-sched.c
7250 + create mode 100644 block/bfq.h
7251 +
7252 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
7253 +new file mode 100644
7254 +index 0000000..bb9b851
7255 +--- /dev/null
7256 ++++ b/block/bfq-cgroup.c
7257 +@@ -0,0 +1,881 @@
7258 ++/*
7259 ++ * BFQ: CGROUPS support.
7260 ++ *
7261 ++ * Based on ideas and code from CFQ:
7262 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7263 ++ *
7264 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7265 ++ * Paolo Valente <paolo.valente@×××××××.it>
7266 ++ *
7267 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7268 ++ *
7269 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7270 ++ */
7271 ++
7272 ++#ifdef CONFIG_CGROUP_BFQIO
7273 ++
7274 ++static DEFINE_MUTEX(bfqio_mutex);
7275 ++
7276 ++static bool bfqio_is_removed(struct cgroup *cgroup)
7277 ++{
7278 ++ return test_bit(CGRP_DEAD, &cgroup->flags);
7279 ++}
7280 ++
7281 ++static struct bfqio_cgroup bfqio_root_cgroup = {
7282 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
7283 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
7284 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
7285 ++};
7286 ++
7287 ++static inline void bfq_init_entity(struct bfq_entity *entity,
7288 ++ struct bfq_group *bfqg)
7289 ++{
7290 ++ entity->weight = entity->new_weight;
7291 ++ entity->orig_weight = entity->new_weight;
7292 ++ entity->ioprio = entity->new_ioprio;
7293 ++ entity->ioprio_class = entity->new_ioprio_class;
7294 ++ entity->parent = bfqg->my_entity;
7295 ++ entity->sched_data = &bfqg->sched_data;
7296 ++}
7297 ++
7298 ++static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
7299 ++{
7300 ++ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
7301 ++ struct bfqio_cgroup, css);
7302 ++}
7303 ++
7304 ++/*
7305 ++ * Search the bfq_group for bfqd into the hash table (by now only a list)
7306 ++ * of bgrp. Must be called under rcu_read_lock().
7307 ++ */
7308 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
7309 ++ struct bfq_data *bfqd)
7310 ++{
7311 ++ struct bfq_group *bfqg;
7312 ++ void *key;
7313 ++
7314 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
7315 ++ key = rcu_dereference(bfqg->bfqd);
7316 ++ if (key == bfqd)
7317 ++ return bfqg;
7318 ++ }
7319 ++
7320 ++ return NULL;
7321 ++}
7322 ++
7323 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
7324 ++ struct bfq_group *bfqg)
7325 ++{
7326 ++ struct bfq_entity *entity = &bfqg->entity;
7327 ++
7328 ++ /*
7329 ++ * If the weight of the entity has never been set via the sysfs
7330 ++ * interface, then bgrp->weight == 0. In this case we initialize
7331 ++ * the weight from the current ioprio value. Otherwise, the group
7332 ++ * weight, if set, has priority over the ioprio value.
7333 ++ */
7334 ++ if (bgrp->weight == 0) {
7335 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
7336 ++ entity->new_ioprio = bgrp->ioprio;
7337 ++ } else {
7338 ++ entity->new_weight = bgrp->weight;
7339 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
7340 ++ }
7341 ++ entity->orig_weight = entity->weight = entity->new_weight;
7342 ++ entity->ioprio = entity->new_ioprio;
7343 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
7344 ++ entity->my_sched_data = &bfqg->sched_data;
7345 ++}
7346 ++
7347 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
7348 ++ struct bfq_group *parent)
7349 ++{
7350 ++ struct bfq_entity *entity;
7351 ++
7352 ++ BUG_ON(parent == NULL);
7353 ++ BUG_ON(bfqg == NULL);
7354 ++
7355 ++ entity = &bfqg->entity;
7356 ++ entity->parent = parent->my_entity;
7357 ++ entity->sched_data = &parent->sched_data;
7358 ++}
7359 ++
7360 ++/**
7361 ++ * bfq_group_chain_alloc - allocate a chain of groups.
7362 ++ * @bfqd: queue descriptor.
7363 ++ * @cgroup: the leaf cgroup this chain starts from.
7364 ++ *
7365 ++ * Allocate a chain of groups starting from the one belonging to
7366 ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
7367 ++ * to the root has already an allocated group on @bfqd.
7368 ++ */
7369 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
7370 ++ struct cgroup *cgroup)
7371 ++{
7372 ++ struct bfqio_cgroup *bgrp;
7373 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
7374 ++
7375 ++ for (; cgroup != NULL; cgroup = cgroup->parent) {
7376 ++ bgrp = cgroup_to_bfqio(cgroup);
7377 ++
7378 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
7379 ++ if (bfqg != NULL) {
7380 ++ /*
7381 ++ * All the cgroups in the path from there to the
7382 ++ * root must have a bfq_group for bfqd, so we don't
7383 ++ * need any more allocations.
7384 ++ */
7385 ++ break;
7386 ++ }
7387 ++
7388 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
7389 ++ if (bfqg == NULL)
7390 ++ goto cleanup;
7391 ++
7392 ++ bfq_group_init_entity(bgrp, bfqg);
7393 ++ bfqg->my_entity = &bfqg->entity;
7394 ++
7395 ++ if (leaf == NULL) {
7396 ++ leaf = bfqg;
7397 ++ prev = leaf;
7398 ++ } else {
7399 ++ bfq_group_set_parent(prev, bfqg);
7400 ++ /*
7401 ++ * Build a list of allocated nodes using the bfqd
7402 ++ * field, which is still unused and will be initialized
7403 ++ * only after the node is connected.
7404 ++ */
7405 ++ prev->bfqd = bfqg;
7406 ++ prev = bfqg;
7407 ++ }
7408 ++ }
7409 ++
7410 ++ return leaf;
7411 ++
7412 ++cleanup:
7413 ++ while (leaf != NULL) {
7414 ++ prev = leaf;
7415 ++ leaf = leaf->bfqd;
7416 ++ kfree(prev);
7417 ++ }
7418 ++
7419 ++ return NULL;
7420 ++}
7421 ++
7422 ++/**
7423 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
7424 ++ * @bfqd: the queue descriptor.
7425 ++ * @cgroup: the leaf cgroup to start from.
7426 ++ * @leaf: the leaf group (to be associated to @cgroup).
7427 ++ *
7428 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
7429 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
7430 ++ * hierarchy that already as a group associated to @bfqd all the nodes
7431 ++ * in the path to the root cgroup have one too.
7432 ++ *
7433 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
7434 ++ * per device) while the bfqio_cgroup lock protects the list of groups
7435 ++ * belonging to the same cgroup.
7436 ++ */
7437 ++static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
7438 ++ struct bfq_group *leaf)
7439 ++{
7440 ++ struct bfqio_cgroup *bgrp;
7441 ++ struct bfq_group *bfqg, *next, *prev = NULL;
7442 ++ unsigned long flags;
7443 ++
7444 ++ assert_spin_locked(bfqd->queue->queue_lock);
7445 ++
7446 ++ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
7447 ++ bgrp = cgroup_to_bfqio(cgroup);
7448 ++ next = leaf->bfqd;
7449 ++
7450 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
7451 ++ BUG_ON(bfqg != NULL);
7452 ++
7453 ++ spin_lock_irqsave(&bgrp->lock, flags);
7454 ++
7455 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
7456 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
7457 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
7458 ++
7459 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
7460 ++
7461 ++ prev = leaf;
7462 ++ leaf = next;
7463 ++ }
7464 ++
7465 ++ BUG_ON(cgroup == NULL && leaf != NULL);
7466 ++ if (cgroup != NULL && prev != NULL) {
7467 ++ bgrp = cgroup_to_bfqio(cgroup);
7468 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
7469 ++ bfq_group_set_parent(prev, bfqg);
7470 ++ }
7471 ++}
7472 ++
7473 ++/**
7474 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
7475 ++ * @bfqd: queue descriptor.
7476 ++ * @cgroup: cgroup being searched for.
7477 ++ *
7478 ++ * Return a group associated to @bfqd in @cgroup, allocating one if
7479 ++ * necessary. When a group is returned all the cgroups in the path
7480 ++ * to the root have a group associated to @bfqd.
7481 ++ *
7482 ++ * If the allocation fails, return the root group: this breaks guarantees
7483 ++ * but is a safe fallback. If this loss becomes a problem it can be
7484 ++ * mitigated using the equivalent weight (given by the product of the
7485 ++ * weights of the groups in the path from @group to the root) in the
7486 ++ * root scheduler.
7487 ++ *
7488 ++ * We allocate all the missing nodes in the path from the leaf cgroup
7489 ++ * to the root and we connect the nodes only after all the allocations
7490 ++ * have been successful.
7491 ++ */
7492 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
7493 ++ struct cgroup *cgroup)
7494 ++{
7495 ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
7496 ++ struct bfq_group *bfqg;
7497 ++
7498 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
7499 ++ if (bfqg != NULL)
7500 ++ return bfqg;
7501 ++
7502 ++ bfqg = bfq_group_chain_alloc(bfqd, cgroup);
7503 ++ if (bfqg != NULL)
7504 ++ bfq_group_chain_link(bfqd, cgroup, bfqg);
7505 ++ else
7506 ++ bfqg = bfqd->root_group;
7507 ++
7508 ++ return bfqg;
7509 ++}
7510 ++
7511 ++/**
7512 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
7513 ++ * @bfqd: queue descriptor.
7514 ++ * @bfqq: the queue to move.
7515 ++ * @entity: @bfqq's entity.
7516 ++ * @bfqg: the group to move to.
7517 ++ *
7518 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
7519 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
7520 ++ *
7521 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
7522 ++ * not disappear (by now this just means that we are called under
7523 ++ * rcu_read_lock()).
7524 ++ */
7525 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
7526 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
7527 ++{
7528 ++ int busy, resume;
7529 ++
7530 ++ busy = bfq_bfqq_busy(bfqq);
7531 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
7532 ++
7533 ++ BUG_ON(resume && !entity->on_st);
7534 ++ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
7535 ++
7536 ++ if (busy) {
7537 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
7538 ++
7539 ++ if (!resume)
7540 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
7541 ++ else
7542 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
7543 ++ } else if (entity->on_st)
7544 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
7545 ++
7546 ++ /*
7547 ++ * Here we use a reference to bfqg. We don't need a refcounter
7548 ++ * as the cgroup reference will not be dropped, so that its
7549 ++ * destroy() callback will not be invoked.
7550 ++ */
7551 ++ entity->parent = bfqg->my_entity;
7552 ++ entity->sched_data = &bfqg->sched_data;
7553 ++
7554 ++ if (busy && resume)
7555 ++ bfq_activate_bfqq(bfqd, bfqq);
7556 ++
7557 ++ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver)
7558 ++ bfq_schedule_dispatch(bfqd);
7559 ++}
7560 ++
7561 ++/**
7562 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
7563 ++ * @bfqd: the queue descriptor.
7564 ++ * @bic: the bic to move.
7565 ++ * @cgroup: the cgroup to move to.
7566 ++ *
7567 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
7568 ++ * has to make sure that the reference to cgroup is valid across the call.
7569 ++ *
7570 ++ * NOTE: an alternative approach might have been to store the current
7571 ++ * cgroup in bfqq and getting a reference to it, reducing the lookup
7572 ++ * time here, at the price of slightly more complex code.
7573 ++ */
7574 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
7575 ++ struct bfq_io_cq *bic,
7576 ++ struct cgroup *cgroup)
7577 ++{
7578 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
7579 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
7580 ++ struct bfq_entity *entity;
7581 ++ struct bfq_group *bfqg;
7582 ++ struct bfqio_cgroup *bgrp;
7583 ++
7584 ++ bgrp = cgroup_to_bfqio(cgroup);
7585 ++
7586 ++ bfqg = bfq_find_alloc_group(bfqd, cgroup);
7587 ++ if (async_bfqq != NULL) {
7588 ++ entity = &async_bfqq->entity;
7589 ++
7590 ++ if (entity->sched_data != &bfqg->sched_data) {
7591 ++ bic_set_bfqq(bic, NULL, 0);
7592 ++ bfq_log_bfqq(bfqd, async_bfqq,
7593 ++ "bic_change_group: %p %d",
7594 ++ async_bfqq, atomic_read(&async_bfqq->ref));
7595 ++ bfq_put_queue(async_bfqq);
7596 ++ }
7597 ++ }
7598 ++
7599 ++ if (sync_bfqq != NULL) {
7600 ++ entity = &sync_bfqq->entity;
7601 ++ if (entity->sched_data != &bfqg->sched_data)
7602 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
7603 ++ }
7604 ++
7605 ++ return bfqg;
7606 ++}
7607 ++
7608 ++/**
7609 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
7610 ++ * @bic: the bic being migrated.
7611 ++ * @cgroup: the destination cgroup.
7612 ++ *
7613 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
7614 ++ * moved into its new parent group.
7615 ++ */
7616 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
7617 ++ struct cgroup *cgroup)
7618 ++{
7619 ++ struct bfq_data *bfqd;
7620 ++ unsigned long uninitialized_var(flags);
7621 ++
7622 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
7623 ++ if (bfqd != NULL) {
7624 ++ __bfq_bic_change_cgroup(bfqd, bic, cgroup);
7625 ++ bfq_put_bfqd_unlock(bfqd, &flags);
7626 ++ }
7627 ++}
7628 ++
7629 ++/**
7630 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
7631 ++ * @bic: the @bic to update.
7632 ++ *
7633 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
7634 ++ * We need this in addition to moving bics during the cgroup attach
7635 ++ * phase because the task owning @bic could be at its first disk
7636 ++ * access or we may end up in the root cgroup as the result of a
7637 ++ * memory allocation failure and here we try to move to the right
7638 ++ * group.
7639 ++ *
7640 ++ * Must be called under the queue lock. It is safe to use the returned
7641 ++ * value even after the rcu_read_unlock() as the migration/destruction
7642 ++ * paths act under the queue lock too. IOW it is impossible to race with
7643 ++ * group migration/destruction and end up with an invalid group as:
7644 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
7645 ++ * has started execution, as current holds a reference to it,
7646 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
7647 ++ * migrated to a different cgroup] its attach() callback will have
7648 ++ * taken care of remove all the references to the old cgroup data.
7649 ++ */
7650 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7651 ++{
7652 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
7653 ++ struct bfq_group *bfqg;
7654 ++ struct cgroup *cgroup;
7655 ++
7656 ++ BUG_ON(bfqd == NULL);
7657 ++
7658 ++ rcu_read_lock();
7659 ++ cgroup = task_cgroup(current, bfqio_subsys_id);
7660 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
7661 ++ rcu_read_unlock();
7662 ++
7663 ++ return bfqg;
7664 ++}
7665 ++
7666 ++/**
7667 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
7668 ++ * @st: the service tree being flushed.
7669 ++ */
7670 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
7671 ++{
7672 ++ struct bfq_entity *entity = st->first_idle;
7673 ++
7674 ++ for (; entity != NULL; entity = st->first_idle)
7675 ++ __bfq_deactivate_entity(entity, 0);
7676 ++}
7677 ++
7678 ++/**
7679 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
7680 ++ * @bfqd: the device data structure with the root group.
7681 ++ * @entity: the entity to move.
7682 ++ */
7683 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
7684 ++ struct bfq_entity *entity)
7685 ++{
7686 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
7687 ++
7688 ++ BUG_ON(bfqq == NULL);
7689 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
7690 ++ return;
7691 ++}
7692 ++
7693 ++/**
7694 ++ * bfq_reparent_active_entities - move to the root group all active entities.
7695 ++ * @bfqd: the device data structure with the root group.
7696 ++ * @bfqg: the group to move from.
7697 ++ * @st: the service tree with the entities.
7698 ++ *
7699 ++ * Needs queue_lock to be taken and reference to be valid over the call.
7700 ++ */
7701 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
7702 ++ struct bfq_group *bfqg,
7703 ++ struct bfq_service_tree *st)
7704 ++{
7705 ++ struct rb_root *active = &st->active;
7706 ++ struct bfq_entity *entity = NULL;
7707 ++
7708 ++ if (!RB_EMPTY_ROOT(&st->active))
7709 ++ entity = bfq_entity_of(rb_first(active));
7710 ++
7711 ++ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active)))
7712 ++ bfq_reparent_leaf_entity(bfqd, entity);
7713 ++
7714 ++ if (bfqg->sched_data.active_entity != NULL)
7715 ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
7716 ++
7717 ++ return;
7718 ++}
7719 ++
7720 ++/**
7721 ++ * bfq_destroy_group - destroy @bfqg.
7722 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
7723 ++ * @bfqg: the group being destroyed.
7724 ++ *
7725 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
7726 ++ */
7727 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
7728 ++{
7729 ++ struct bfq_data *bfqd;
7730 ++ struct bfq_service_tree *st;
7731 ++ struct bfq_entity *entity = bfqg->my_entity;
7732 ++ unsigned long uninitialized_var(flags);
7733 ++ int i;
7734 ++
7735 ++ hlist_del(&bfqg->group_node);
7736 ++
7737 ++ /*
7738 ++ * Empty all service_trees belonging to this group before deactivating
7739 ++ * the group itself.
7740 ++ */
7741 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
7742 ++ st = bfqg->sched_data.service_tree + i;
7743 ++
7744 ++ /*
7745 ++ * The idle tree may still contain bfq_queues belonging
7746 ++ * to exited tasks because they never migrated to a different
7747 ++ * cgroup from the one being destroyed now. No one else
7748 ++ * can access them so it's safe to act without any lock.
7749 ++ */
7750 ++ bfq_flush_idle_tree(st);
7751 ++
7752 ++ /*
7753 ++ * It may happen that some queues are still active
7754 ++ * (busy) upon group destruction (if the corresponding
7755 ++ * processes have been forced to terminate). We move
7756 ++ * all the leaf entities corresponding to these queues
7757 ++ * to the root_group.
7758 ++ * Also, it may happen that the group has an entity
7759 ++ * under service, which is disconnected from the active
7760 ++ * tree: it must be moved, too.
7761 ++ * There is no need to put the sync queues, as the
7762 ++ * scheduler has taken no reference.
7763 ++ */
7764 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
7765 ++ if (bfqd != NULL) {
7766 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
7767 ++ bfq_put_bfqd_unlock(bfqd, &flags);
7768 ++ }
7769 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
7770 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
7771 ++ }
7772 ++ BUG_ON(bfqg->sched_data.next_active != NULL);
7773 ++ BUG_ON(bfqg->sched_data.active_entity != NULL);
7774 ++
7775 ++ /*
7776 ++ * We may race with device destruction, take extra care when
7777 ++ * dereferencing bfqg->bfqd.
7778 ++ */
7779 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
7780 ++ if (bfqd != NULL) {
7781 ++ hlist_del(&bfqg->bfqd_node);
7782 ++ __bfq_deactivate_entity(entity, 0);
7783 ++ bfq_put_async_queues(bfqd, bfqg);
7784 ++ bfq_put_bfqd_unlock(bfqd, &flags);
7785 ++ }
7786 ++ BUG_ON(entity->tree != NULL);
7787 ++
7788 ++ /*
7789 ++ * No need to defer the kfree() to the end of the RCU grace
7790 ++ * period: we are called from the destroy() callback of our
7791 ++ * cgroup, so we can be sure that no one is a) still using
7792 ++ * this cgroup or b) doing lookups in it.
7793 ++ */
7794 ++ kfree(bfqg);
7795 ++}
7796 ++
7797 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
7798 ++{
7799 ++ struct hlist_node *tmp;
7800 ++ struct bfq_group *bfqg;
7801 ++
7802 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
7803 ++ bfq_end_raising_async_queues(bfqd, bfqg);
7804 ++}
7805 ++
7806 ++/**
7807 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
7808 ++ * @bfqd: the device descriptor being exited.
7809 ++ *
7810 ++ * When the device exits we just make sure that no lookup can return
7811 ++ * the now unused group structures. They will be deallocated on cgroup
7812 ++ * destruction.
7813 ++ */
7814 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
7815 ++{
7816 ++ struct hlist_node *tmp;
7817 ++ struct bfq_group *bfqg;
7818 ++
7819 ++ bfq_log(bfqd, "disconnect_groups beginning") ;
7820 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
7821 ++ hlist_del(&bfqg->bfqd_node);
7822 ++
7823 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
7824 ++
7825 ++ /*
7826 ++ * Don't remove from the group hash, just set an
7827 ++ * invalid key. No lookups can race with the
7828 ++ * assignment as bfqd is being destroyed; this
7829 ++ * implies also that new elements cannot be added
7830 ++ * to the list.
7831 ++ */
7832 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
7833 ++
7834 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
7835 ++ bfqg) ;
7836 ++ bfq_put_async_queues(bfqd, bfqg);
7837 ++ }
7838 ++}
7839 ++
7840 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
7841 ++{
7842 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
7843 ++ struct bfq_group *bfqg = bfqd->root_group;
7844 ++
7845 ++ bfq_put_async_queues(bfqd, bfqg);
7846 ++
7847 ++ spin_lock_irq(&bgrp->lock);
7848 ++ hlist_del_rcu(&bfqg->group_node);
7849 ++ spin_unlock_irq(&bgrp->lock);
7850 ++
7851 ++ /*
7852 ++ * No need to synchronize_rcu() here: since the device is gone
7853 ++ * there cannot be any read-side access to its root_group.
7854 ++ */
7855 ++ kfree(bfqg);
7856 ++}
7857 ++
7858 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7859 ++{
7860 ++ struct bfq_group *bfqg;
7861 ++ struct bfqio_cgroup *bgrp;
7862 ++ int i;
7863 ++
7864 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7865 ++ if (bfqg == NULL)
7866 ++ return NULL;
7867 ++
7868 ++ bfqg->entity.parent = NULL;
7869 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7870 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7871 ++
7872 ++ bgrp = &bfqio_root_cgroup;
7873 ++ spin_lock_irq(&bgrp->lock);
7874 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
7875 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
7876 ++ spin_unlock_irq(&bgrp->lock);
7877 ++
7878 ++ return bfqg;
7879 ++}
7880 ++
7881 ++#define SHOW_FUNCTION(__VAR) \
7882 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
7883 ++ struct cftype *cftype) \
7884 ++{ \
7885 ++ struct bfqio_cgroup *bgrp; \
7886 ++ u64 ret = -ENODEV; \
7887 ++ \
7888 ++ mutex_lock(&bfqio_mutex); \
7889 ++ if (bfqio_is_removed(cgroup)) \
7890 ++ goto out_unlock; \
7891 ++ \
7892 ++ bgrp = cgroup_to_bfqio(cgroup); \
7893 ++ spin_lock_irq(&bgrp->lock); \
7894 ++ ret = bgrp->__VAR; \
7895 ++ spin_unlock_irq(&bgrp->lock); \
7896 ++ \
7897 ++out_unlock: \
7898 ++ mutex_unlock(&bfqio_mutex); \
7899 ++ return ret; \
7900 ++}
7901 ++
7902 ++SHOW_FUNCTION(weight);
7903 ++SHOW_FUNCTION(ioprio);
7904 ++SHOW_FUNCTION(ioprio_class);
7905 ++#undef SHOW_FUNCTION
7906 ++
7907 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
7908 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
7909 ++ struct cftype *cftype, \
7910 ++ u64 val) \
7911 ++{ \
7912 ++ struct bfqio_cgroup *bgrp; \
7913 ++ struct bfq_group *bfqg; \
7914 ++ int ret = -EINVAL; \
7915 ++ \
7916 ++ if (val < (__MIN) || val > (__MAX)) \
7917 ++ return ret; \
7918 ++ \
7919 ++ ret = -ENODEV; \
7920 ++ mutex_lock(&bfqio_mutex); \
7921 ++ if (bfqio_is_removed(cgroup)) \
7922 ++ goto out_unlock; \
7923 ++ ret = 0; \
7924 ++ \
7925 ++ bgrp = cgroup_to_bfqio(cgroup); \
7926 ++ \
7927 ++ spin_lock_irq(&bgrp->lock); \
7928 ++ bgrp->__VAR = (unsigned short)val; \
7929 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
7930 ++ /* \
7931 ++ * Setting the ioprio_changed flag of the entity \
7932 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
7933 ++ * the value of the weight to its ioprio mapping. \
7934 ++ * Set the flag only if necessary. \
7935 ++ */ \
7936 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
7937 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
7938 ++ smp_wmb(); \
7939 ++ bfqg->entity.ioprio_changed = 1; \
7940 ++ } \
7941 ++ } \
7942 ++ spin_unlock_irq(&bgrp->lock); \
7943 ++ \
7944 ++out_unlock: \
7945 ++ mutex_unlock(&bfqio_mutex); \
7946 ++ return ret; \
7947 ++}
7948 ++
7949 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
7950 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
7951 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
7952 ++#undef STORE_FUNCTION
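++/*
++ * Reading aid (illustrative, not functional code): for __VAR == weight the
++ * macro above expands to bfqio_cgroup_weight_write(), which rejects values
++ * outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] with -EINVAL, stores the new
++ * value in the bfqio_cgroup under bgrp->lock, and marks ioprio_changed only
++ * on entities whose new_weight actually changes, so the update takes effect
++ * on the next (re)activation of each entity.
++ */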
7953 ++
7954 ++static struct cftype bfqio_files[] = {
7955 ++ {
7956 ++ .name = "weight",
7957 ++ .read_u64 = bfqio_cgroup_weight_read,
7958 ++ .write_u64 = bfqio_cgroup_weight_write,
7959 ++ },
7960 ++ {
7961 ++ .name = "ioprio",
7962 ++ .read_u64 = bfqio_cgroup_ioprio_read,
7963 ++ .write_u64 = bfqio_cgroup_ioprio_write,
7964 ++ },
7965 ++ {
7966 ++ .name = "ioprio_class",
7967 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
7968 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
7969 ++ },
7970 ++ { }, /* terminate */
7971 ++};
7972 ++
7973 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
7974 ++{
7975 ++ struct bfqio_cgroup *bgrp;
7976 ++
7977 ++ if (cgroup->parent != NULL) {
7978 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
7979 ++ if (bgrp == NULL)
7980 ++ return ERR_PTR(-ENOMEM);
7981 ++ } else
7982 ++ bgrp = &bfqio_root_cgroup;
7983 ++
7984 ++ spin_lock_init(&bgrp->lock);
7985 ++ INIT_HLIST_HEAD(&bgrp->group_data);
7986 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
7987 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
7988 ++
7989 ++ return &bgrp->css;
7990 ++}
7991 ++
7992 ++/*
7993 ++ * We cannot support shared io contexts, as we have no means to support
7994 ++ * two tasks with the same ioc in two different groups without major rework
7995 ++ * of the main bic/bfqq data structures. For now we allow a task to change
7996 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
7997 ++ * behavior is that a group containing a task that forked using CLONE_IO
7998 ++ * will not be destroyed until the tasks sharing the ioc die.
7999 ++ */
8000 ++static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
8001 ++{
8002 ++ struct task_struct *task;
8003 ++ struct io_context *ioc;
8004 ++ int ret = 0;
8005 ++
8006 ++ cgroup_taskset_for_each(task, cgroup, tset) {
8007 ++ /* task_lock() is needed to avoid races with exit_io_context() */
8008 ++ task_lock(task);
8009 ++ ioc = task->io_context;
8010 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
8011 ++ /*
8012 ++ * ioc == NULL means that the task is either too young or
8013 ++			 * exiting: if it still has no ioc the ioc can't be shared,
8014 ++ * if the task is exiting the attach will fail anyway, no
8015 ++ * matter what we return here.
8016 ++ */
8017 ++ ret = -EINVAL;
8018 ++ task_unlock(task);
8019 ++ if (ret)
8020 ++ break;
8021 ++ }
8022 ++
8023 ++ return ret;
8024 ++}
8025 ++
8026 ++static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
8027 ++{
8028 ++ struct task_struct *task;
8029 ++ struct io_context *ioc;
8030 ++ struct io_cq *icq;
8031 ++
8032 ++ /*
8033 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
8034 ++ * new group has not yet been tested.
8035 ++ */
8036 ++ cgroup_taskset_for_each(task, cgroup, tset) {
8037 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
8038 ++ if (ioc) {
8039 ++ /*
8040 ++ * Handle cgroup change here.
8041 ++ */
8042 ++ rcu_read_lock();
8043 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
8044 ++ if (!strncmp(icq->q->elevator->type->elevator_name,
8045 ++ "bfq", ELV_NAME_MAX))
8046 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
8047 ++ cgroup);
8048 ++ rcu_read_unlock();
8049 ++ put_io_context(ioc);
8050 ++ }
8051 ++ }
8052 ++}
8053 ++
8054 ++static void bfqio_destroy(struct cgroup *cgroup)
8055 ++{
8056 ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
8057 ++ struct hlist_node *tmp;
8058 ++ struct bfq_group *bfqg;
8059 ++
8060 ++ /*
8061 ++ * Since we are destroying the cgroup, there are no more tasks
8062 ++ * referencing it, and all the RCU grace periods that may have
8063 ++ * referenced it are ended (as the destruction of the parent
8064 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
8065 ++ * anything else and we don't need any synchronization.
8066 ++ */
8067 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
8068 ++ bfq_destroy_group(bgrp, bfqg);
8069 ++
8070 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
8071 ++
8072 ++ kfree(bgrp);
8073 ++}
8074 ++
8075 ++struct cgroup_subsys bfqio_subsys = {
8076 ++ .name = "bfqio",
8077 ++ .css_alloc = bfqio_create,
8078 ++ .can_attach = bfqio_can_attach,
8079 ++ .attach = bfqio_attach,
8080 ++ .css_free = bfqio_destroy,
8081 ++ .subsys_id = bfqio_subsys_id,
8082 ++ .base_cftypes = bfqio_files,
8083 ++};
8084 ++#else
8085 ++static inline void bfq_init_entity(struct bfq_entity *entity,
8086 ++ struct bfq_group *bfqg)
8087 ++{
8088 ++ entity->weight = entity->new_weight;
8089 ++ entity->orig_weight = entity->new_weight;
8090 ++ entity->ioprio = entity->new_ioprio;
8091 ++ entity->ioprio_class = entity->new_ioprio_class;
8092 ++ entity->sched_data = &bfqg->sched_data;
8093 ++}
8094 ++
8095 ++static inline struct bfq_group *
8096 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
8097 ++{
8098 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
8099 ++ return bfqd->root_group;
8100 ++}
8101 ++
8102 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
8103 ++ struct bfq_queue *bfqq,
8104 ++ struct bfq_entity *entity,
8105 ++ struct bfq_group *bfqg)
8106 ++{
8107 ++}
8108 ++
8109 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
8110 ++{
8111 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
8112 ++}
8113 ++
8114 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
8115 ++{
8116 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
8117 ++}
8118 ++
8119 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
8120 ++{
8121 ++ kfree(bfqd->root_group);
8122 ++}
8123 ++
8124 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
8125 ++{
8126 ++ struct bfq_group *bfqg;
8127 ++ int i;
8128 ++
8129 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
8130 ++ if (bfqg == NULL)
8131 ++ return NULL;
8132 ++
8133 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
8134 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
8135 ++
8136 ++ return bfqg;
8137 ++}
8138 ++#endif
8139 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
8140 +new file mode 100644
8141 +index 0000000..326e3ec
8142 +--- /dev/null
8143 ++++ b/block/bfq-ioc.c
8144 +@@ -0,0 +1,36 @@
8145 ++/*
8146 ++ * BFQ: I/O context handling.
8147 ++ *
8148 ++ * Based on ideas and code from CFQ:
8149 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
8150 ++ *
8151 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
8152 ++ * Paolo Valente <paolo.valente@×××××××.it>
8153 ++ *
8154 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
8155 ++ */
8156 ++
8157 ++/**
8158 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
8159 ++ * @icq: the iocontext queue.
8160 ++ */
8161 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
8162 ++{
8163 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
8164 ++ return container_of(icq, struct bfq_io_cq, icq);
8165 ++}
8166 ++
8167 ++/**
8168 ++ * bfq_bic_lookup - search @ioc for a bic associated with @bfqd.
8169 ++ * @bfqd: the lookup key.
8170 ++ * @ioc: the io_context of the process doing I/O.
8171 ++ *
8172 ++ * Queue lock must be held.
8173 ++ */
8174 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
8175 ++ struct io_context *ioc)
8176 ++{
8177 ++	if (ioc)
8178 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
8179 ++ return NULL;
8180 ++}
8181 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
8182 +new file mode 100644
8183 +index 0000000..0ed2746
8184 +--- /dev/null
8185 ++++ b/block/bfq-iosched.c
8186 +@@ -0,0 +1,3082 @@
8187 ++/*
8188 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
8189 ++ *
8190 ++ * Based on ideas and code from CFQ:
8191 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
8192 ++ *
8193 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
8194 ++ * Paolo Valente <paolo.valente@×××××××.it>
8195 ++ *
8196 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
8197 ++ *
8198 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
8199 ++ *
8200 ++ * BFQ is a proportional share disk scheduling algorithm based on the
8201 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
8202 ++ * measured in number of sectors, to tasks instead of time slices.
8203 ++ * The disk is not granted to the active task for a given time slice,
8204 ++ * but until it has exhausted its assigned budget. This change from
8205 ++ * the time to the service domain allows BFQ to distribute the disk
8206 ++ * bandwidth among tasks as desired, without any distortion due to
8207 ++ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
8208 ++ * internal scheduler, called B-WF2Q+, to schedule tasks according to
8209 ++ * their budgets. Thanks to this accurate scheduler, BFQ can afford
8210 ++ * to assign high budgets to disk-bound non-seeky tasks (to boost the
8211 ++ * throughput), and yet guarantee low latencies to interactive and
8212 ++ * soft real-time applications.
8213 ++ *
8214 ++ * BFQ has been introduced in [1], where the interested reader can
8215 ++ * find an accurate description of the algorithm, the bandwidth
8216 ++ * distribution and latency guarantees it provides, plus formal proofs
8217 ++ * of all the properties. With respect to the algorithm presented in
8218 ++ * the paper, this implementation adds several little heuristics, and
8219 ++ * a hierarchical extension, based on H-WF2Q+.
8220 ++ *
8221 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
8222 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
8223 ++ * complexity derives from the one introduced with EEVDF in [3].
8224 ++ *
8225 ++ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
8226 ++ * with Deterministic Guarantees on Bandwidth Distribution,'',
8227 ++ *     IEEE Transactions on Computers, May 2010.
8228 ++ *
8229 ++ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
8230 ++ *
8231 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
8232 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
8233 ++ * Oct 1997.
8234 ++ *
8235 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
8236 ++ *
8237 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
8238 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
8239 ++ * Resource Allocation,'' technical report.
8240 ++ *
8241 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
8242 ++ */
8243 ++#include <linux/module.h>
8244 ++#include <linux/slab.h>
8245 ++#include <linux/blkdev.h>
8246 ++#include <linux/cgroup.h>
8247 ++#include <linux/elevator.h>
8248 ++#include <linux/jiffies.h>
8249 ++#include <linux/rbtree.h>
8250 ++#include <linux/ioprio.h>
8251 ++#include "bfq.h"
8252 ++#include "blk.h"
8253 ++
8254 ++/* Max number of dispatches in one round of service. */
8255 ++static const int bfq_quantum = 4;
8256 ++
8257 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
8258 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
8259 ++
8260 ++/* Maximum backwards seek, in KiB. */
8261 ++static const int bfq_back_max = 16 * 1024;
8262 ++
8263 ++/* Penalty of a backwards seek, in number of sectors. */
8264 ++static const int bfq_back_penalty = 2;
8265 ++
8266 ++/* Idling period duration, in jiffies. */
8267 ++static int bfq_slice_idle = HZ / 125;
8268 ++
8269 ++/* Default maximum budget values, in sectors and number of requests. */
8270 ++static const int bfq_default_max_budget = 16 * 1024;
8271 ++static const int bfq_max_budget_async_rq = 4;
8272 ++
8273 ++/*
8274 ++ * Async to sync throughput distribution is controlled as follows:
8275 ++ * when an async request is served, the entity is charged the number
8276 ++ * of sectors of the request, multiplied by the factor below.
8277 ++ */
8278 ++static const int bfq_async_charge_factor = 10;
8279 ++
8280 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
8281 ++static const int bfq_timeout_sync = HZ / 8;
8282 ++static int bfq_timeout_async = HZ / 25;
8283 ++
8284 ++struct kmem_cache *bfq_pool;
8285 ++
8286 ++/* Below this threshold (in ms), we consider thinktime immediate. */
8287 ++#define BFQ_MIN_TT 2
8288 ++
8289 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
8290 ++#define BFQ_HW_QUEUE_THRESHOLD 4
8291 ++#define BFQ_HW_QUEUE_SAMPLES 32
8292 ++
8293 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
8294 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
8295 ++
8296 ++/* Min samples used for peak rate estimation (for autotuning). */
8297 ++#define BFQ_PEAK_RATE_SAMPLES 32
8298 ++
8299 ++/* Shift used for peak rate fixed precision calculations. */
8300 ++#define BFQ_RATE_SHIFT 16
8301 ++
8302 ++/*
8303 ++ * The duration of the weight raising for interactive applications is
8304 ++ * computed automatically (as default behaviour), using the following
8305 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
8306 ++ * disk, and R and T are two reference parameters. In particular, R is
8307 ++ * the peak rate of a reference disk, and T is about the maximum time
8308 ++ * for starting popular large applications on that disk, under BFQ and
8309 ++ * while reading two files in parallel. Finally, BFQ uses two
8310 ++ * different pairs (R, T) depending on whether the disk is rotational
8311 ++ * or non-rotational.
8312 ++ */
8313 ++#define T_rot (msecs_to_jiffies(5500))
8314 ++#define T_nonrot (msecs_to_jiffies(2000))
8315 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
8316 ++#define R_rot 17415
8317 ++#define R_nonrot 34791
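++/*
++ * Worked example (illustrative, not part of the algorithm): with
++ * duration = (R / r) * T as above, a rotational disk whose measured peak
++ * rate r equals the reference rate R_rot is granted exactly T_rot (5500 ms)
++ * of weight raising, while a disk twice as fast (r = 2 * R_rot) would get
++ * about half of that, i.e. roughly 2750 ms; non-rotational devices scale
++ * the same way around R_nonrot and T_nonrot.
++ */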
8318 ++
8319 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
8320 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
8321 ++
8322 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
8323 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
8324 ++
8325 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
8326 ++
8327 ++#include "bfq-ioc.c"
8328 ++#include "bfq-sched.c"
8329 ++#include "bfq-cgroup.c"
8330 ++
8331 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
8332 ++ IOPRIO_CLASS_IDLE)
8333 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
8334 ++ IOPRIO_CLASS_RT)
8335 ++
8336 ++#define bfq_sample_valid(samples) ((samples) > 80)
8337 ++
8338 ++/*
8339 ++ * We regard a request as SYNC if it is either a read or has the SYNC bit
8340 ++ * set (in which case it could also be a direct WRITE).
8341 ++ */
8342 ++static inline int bfq_bio_sync(struct bio *bio)
8343 ++{
8344 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
8345 ++ return 1;
8346 ++
8347 ++ return 0;
8348 ++}
8349 ++
8350 ++/*
8351 ++ * Scheduler run of queue, if there are requests pending and no one in the
8352 ++ * driver that will restart queueing.
8353 ++ */
8354 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
8355 ++{
8356 ++ if (bfqd->queued != 0) {
8357 ++ bfq_log(bfqd, "schedule dispatch");
8358 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
8359 ++ }
8360 ++}
8361 ++
8362 ++/*
8363 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
8364 ++ * We choose the request that is closest to the head right now. Distance
8365 ++ * behind the head is penalized and only allowed to a certain extent.
8366 ++ */
8367 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
8368 ++ struct request *rq1,
8369 ++ struct request *rq2,
8370 ++ sector_t last)
8371 ++{
8372 ++ sector_t s1, s2, d1 = 0, d2 = 0;
8373 ++ unsigned long back_max;
8374 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
8375 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
8376 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
8377 ++
8378 ++ if (rq1 == NULL || rq1 == rq2)
8379 ++ return rq2;
8380 ++ if (rq2 == NULL)
8381 ++ return rq1;
8382 ++
8383 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
8384 ++ return rq1;
8385 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
8386 ++ return rq2;
8387 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
8388 ++ return rq1;
8389 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
8390 ++ return rq2;
8391 ++
8392 ++ s1 = blk_rq_pos(rq1);
8393 ++ s2 = blk_rq_pos(rq2);
8394 ++
8395 ++ /*
8396 ++ * By definition, 1KiB is 2 sectors.
8397 ++ */
8398 ++ back_max = bfqd->bfq_back_max * 2;
8399 ++
8400 ++ /*
8401 ++ * Strict one way elevator _except_ in the case where we allow
8402 ++ * short backward seeks which are biased as twice the cost of a
8403 ++ * similar forward seek.
8404 ++ */
8405 ++ if (s1 >= last)
8406 ++ d1 = s1 - last;
8407 ++ else if (s1 + back_max >= last)
8408 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
8409 ++ else
8410 ++ wrap |= BFQ_RQ1_WRAP;
8411 ++
8412 ++ if (s2 >= last)
8413 ++ d2 = s2 - last;
8414 ++ else if (s2 + back_max >= last)
8415 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
8416 ++ else
8417 ++ wrap |= BFQ_RQ2_WRAP;
8418 ++
8419 ++ /* Found required data */
8420 ++
8421 ++ /*
8422 ++ * By doing switch() on the bit mask "wrap" we avoid having to
8423 ++ * check two variables for all permutations: --> faster!
8424 ++ */
8425 ++ switch (wrap) {
8426 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
8427 ++ if (d1 < d2)
8428 ++ return rq1;
8429 ++ else if (d2 < d1)
8430 ++ return rq2;
8431 ++ else {
8432 ++ if (s1 >= s2)
8433 ++ return rq1;
8434 ++ else
8435 ++ return rq2;
8436 ++ }
8437 ++
8438 ++ case BFQ_RQ2_WRAP:
8439 ++ return rq1;
8440 ++ case BFQ_RQ1_WRAP:
8441 ++ return rq2;
8442 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
8443 ++ default:
8444 ++ /*
8445 ++ * Since both rqs are wrapped,
8446 ++ * start with the one that's further behind head
8447 ++ * (--> only *one* back seek required),
8448 ++ * since back seek takes more time than forward.
8449 ++ */
8450 ++ if (s1 <= s2)
8451 ++ return rq1;
8452 ++ else
8453 ++ return rq2;
8454 ++ }
8455 ++}
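++/*
++ * Numeric illustration (not functional code): with the default bfq_back_max
++ * of 16 * 1024 KiB, back_max is 32768 sectors; all else being equal, a
++ * request 1000 sectors behind the head is weighted as if it were 2000
++ * sectors away (bfq_back_penalty = 2), so a forward request less than 2000
++ * sectors ahead of the head still wins.
++ */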
8456 ++
8457 ++static struct bfq_queue *
8458 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
8459 ++ sector_t sector, struct rb_node **ret_parent,
8460 ++ struct rb_node ***rb_link)
8461 ++{
8462 ++ struct rb_node **p, *parent;
8463 ++ struct bfq_queue *bfqq = NULL;
8464 ++
8465 ++ parent = NULL;
8466 ++ p = &root->rb_node;
8467 ++ while (*p) {
8468 ++ struct rb_node **n;
8469 ++
8470 ++ parent = *p;
8471 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8472 ++
8473 ++ /*
8474 ++ * Sort strictly based on sector. Smallest to the left,
8475 ++ * largest to the right.
8476 ++ */
8477 ++ if (sector > blk_rq_pos(bfqq->next_rq))
8478 ++ n = &(*p)->rb_right;
8479 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
8480 ++ n = &(*p)->rb_left;
8481 ++ else
8482 ++ break;
8483 ++ p = n;
8484 ++ bfqq = NULL;
8485 ++ }
8486 ++
8487 ++ *ret_parent = parent;
8488 ++ if (rb_link)
8489 ++ *rb_link = p;
8490 ++
8491 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
8492 ++ (long long unsigned)sector,
8493 ++ bfqq != NULL ? bfqq->pid : 0);
8494 ++
8495 ++ return bfqq;
8496 ++}
8497 ++
8498 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8499 ++{
8500 ++ struct rb_node **p, *parent;
8501 ++ struct bfq_queue *__bfqq;
8502 ++
8503 ++ if (bfqq->pos_root != NULL) {
8504 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
8505 ++ bfqq->pos_root = NULL;
8506 ++ }
8507 ++
8508 ++ if (bfq_class_idle(bfqq))
8509 ++ return;
8510 ++ if (!bfqq->next_rq)
8511 ++ return;
8512 ++
8513 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
8514 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
8515 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
8516 ++ if (__bfqq == NULL) {
8517 ++ rb_link_node(&bfqq->pos_node, parent, p);
8518 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
8519 ++ } else
8520 ++ bfqq->pos_root = NULL;
8521 ++}
8522 ++
8523 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
8524 ++ struct bfq_queue *bfqq,
8525 ++ struct request *last)
8526 ++{
8527 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
8528 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
8529 ++ struct request *next = NULL, *prev = NULL;
8530 ++
8531 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
8532 ++
8533 ++ if (rbprev != NULL)
8534 ++ prev = rb_entry_rq(rbprev);
8535 ++
8536 ++ if (rbnext != NULL)
8537 ++ next = rb_entry_rq(rbnext);
8538 ++ else {
8539 ++ rbnext = rb_first(&bfqq->sort_list);
8540 ++ if (rbnext && rbnext != &last->rb_node)
8541 ++ next = rb_entry_rq(rbnext);
8542 ++ }
8543 ++
8544 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
8545 ++}
8546 ++
8547 ++static void bfq_del_rq_rb(struct request *rq)
8548 ++{
8549 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8550 ++ struct bfq_data *bfqd = bfqq->bfqd;
8551 ++ const int sync = rq_is_sync(rq);
8552 ++
8553 ++ BUG_ON(bfqq->queued[sync] == 0);
8554 ++ bfqq->queued[sync]--;
8555 ++ bfqd->queued--;
8556 ++
8557 ++ elv_rb_del(&bfqq->sort_list, rq);
8558 ++
8559 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8560 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
8561 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8562 ++ /*
8563 ++ * Remove queue from request-position tree as it is empty.
8564 ++ */
8565 ++ if (bfqq->pos_root != NULL) {
8566 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
8567 ++ bfqq->pos_root = NULL;
8568 ++ }
8569 ++ }
8570 ++}
8571 ++
8572 ++/* see the definition of bfq_async_charge_factor for details */
8573 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
8574 ++ struct bfq_queue *bfqq)
8575 ++{
8576 ++ return blk_rq_sectors(rq) *
8577 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
8578 ++ bfq_async_charge_factor));
8579 ++}
8580 ++
8581 ++/**
8582 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
8583 ++ * @bfqd: the device data the queue belongs to.
8584 ++ * @bfqq: the queue to update.
8585 ++ *
8586 ++ * If the first request of a queue changes we make sure that the queue
8587 ++ * has enough budget to serve at least its first request (if the
8588 ++ * request has grown). We do this because if the queue does not have enough
8589 ++ * budget for its first request, it has to go through two dispatch
8590 ++ * rounds to actually get it dispatched.
8591 ++ */
8592 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
8593 ++ struct bfq_queue *bfqq)
8594 ++{
8595 ++ struct bfq_entity *entity = &bfqq->entity;
8596 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
8597 ++ struct request *next_rq = bfqq->next_rq;
8598 ++ unsigned long new_budget;
8599 ++
8600 ++ if (next_rq == NULL)
8601 ++ return;
8602 ++
8603 ++ if (bfqq == bfqd->active_queue)
8604 ++ /*
8605 ++ * In order not to break guarantees, budgets cannot be
8606 ++ * changed after an entity has been selected.
8607 ++ */
8608 ++ return;
8609 ++
8610 ++ BUG_ON(entity->tree != &st->active);
8611 ++ BUG_ON(entity == entity->sched_data->active_entity);
8612 ++
8613 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
8614 ++ bfq_serv_to_charge(next_rq, bfqq));
8615 ++ entity->budget = new_budget;
8616 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
8617 ++ bfq_activate_bfqq(bfqd, bfqq);
8618 ++}
8619 ++
8620 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
8621 ++{
8622 ++ u64 dur;
8623 ++
8624 ++ if (bfqd->bfq_raising_max_time > 0)
8625 ++ return bfqd->bfq_raising_max_time;
8626 ++
8627 ++ dur = bfqd->RT_prod;
8628 ++ do_div(dur, bfqd->peak_rate);
8629 ++
8630 ++ return dur;
8631 ++}
8632 ++
8633 ++static void bfq_add_rq_rb(struct request *rq)
8634 ++{
8635 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8636 ++ struct bfq_entity *entity = &bfqq->entity;
8637 ++ struct bfq_data *bfqd = bfqq->bfqd;
8638 ++ struct request *next_rq, *prev;
8639 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
8640 ++ int idle_for_long_time = bfqq->budget_timeout +
8641 ++ bfqd->bfq_raising_min_idle_time < jiffies;
8642 ++
8643 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
8644 ++ bfqq->queued[rq_is_sync(rq)]++;
8645 ++ bfqd->queued++;
8646 ++
8647 ++ elv_rb_add(&bfqq->sort_list, rq);
8648 ++
8649 ++ /*
8650 ++ * Check if this request is a better next-serve candidate.
8651 ++ */
8652 ++ prev = bfqq->next_rq;
8653 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
8654 ++ BUG_ON(next_rq == NULL);
8655 ++ bfqq->next_rq = next_rq;
8656 ++
8657 ++ /*
8658 ++ * Adjust priority tree position, if next_rq changes.
8659 ++ */
8660 ++ if (prev != bfqq->next_rq)
8661 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
8662 ++
8663 ++ if (!bfq_bfqq_busy(bfqq)) {
8664 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
8665 ++ bfqq->soft_rt_next_start < jiffies;
8666 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
8667 ++ bfq_serv_to_charge(next_rq, bfqq));
8668 ++
8669 ++		if (!bfqd->low_latency)
8670 ++ goto add_bfqq_busy;
8671 ++
8672 ++ /*
8673 ++ * If the queue is not being boosted and has been idle
8674 ++ * for enough time, start a weight-raising period
8675 ++ */
8676 ++		if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
8677 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
8678 ++ if (idle_for_long_time)
8679 ++ bfqq->raising_cur_max_time =
8680 ++ bfq_wrais_duration(bfqd);
8681 ++ else
8682 ++ bfqq->raising_cur_max_time =
8683 ++ bfqd->bfq_raising_rt_max_time;
8684 ++ bfq_log_bfqq(bfqd, bfqq,
8685 ++				     "wrais starting at %llu msec, "
8686 ++ "rais_max_time %u",
8687 ++ bfqq->last_rais_start_finish,
8688 ++ jiffies_to_msecs(bfqq->
8689 ++ raising_cur_max_time));
8690 ++ } else if (old_raising_coeff > 1) {
8691 ++ if (idle_for_long_time)
8692 ++ bfqq->raising_cur_max_time =
8693 ++ bfq_wrais_duration(bfqd);
8694 ++ else if (bfqq->raising_cur_max_time ==
8695 ++ bfqd->bfq_raising_rt_max_time &&
8696 ++ !soft_rt) {
8697 ++ bfqq->raising_coeff = 1;
8698 ++ bfq_log_bfqq(bfqd, bfqq,
8699 ++					     "wrais ending at %llu msec, "
8700 ++ "rais_max_time %u",
8701 ++ bfqq->last_rais_start_finish,
8702 ++ jiffies_to_msecs(bfqq->
8703 ++ raising_cur_max_time));
8704 ++ }
8705 ++ }
8706 ++ if (old_raising_coeff != bfqq->raising_coeff)
8707 ++ entity->ioprio_changed = 1;
8708 ++add_bfqq_busy:
8709 ++ bfq_add_bfqq_busy(bfqd, bfqq);
8710 ++ } else {
8711 ++		if (bfqd->low_latency && old_raising_coeff == 1 &&
8712 ++ !rq_is_sync(rq) &&
8713 ++ bfqq->last_rais_start_finish +
8714 ++ bfqd->bfq_raising_min_inter_arr_async < jiffies) {
8715 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
8716 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
8717 ++
8718 ++ entity->ioprio_changed = 1;
8719 ++ bfq_log_bfqq(bfqd, bfqq,
8720 ++				     "non-idle wrais starting at %llu msec, "
8721 ++ "rais_max_time %u",
8722 ++ bfqq->last_rais_start_finish,
8723 ++ jiffies_to_msecs(bfqq->
8724 ++ raising_cur_max_time));
8725 ++ }
8726 ++ bfq_updated_next_req(bfqd, bfqq);
8727 ++ }
8728 ++
8729 ++	if (bfqd->low_latency &&
8730 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
8731 ++ idle_for_long_time))
8732 ++ bfqq->last_rais_start_finish = jiffies;
8733 ++}
8734 ++
8735 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
8736 ++{
8737 ++ elv_rb_del(&bfqq->sort_list, rq);
8738 ++ bfqq->queued[rq_is_sync(rq)]--;
8739 ++ bfqq->bfqd->queued--;
8740 ++ bfq_add_rq_rb(rq);
8741 ++}
8742 ++
8743 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
8744 ++ struct bio *bio)
8745 ++{
8746 ++ struct task_struct *tsk = current;
8747 ++ struct bfq_io_cq *bic;
8748 ++ struct bfq_queue *bfqq;
8749 ++
8750 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
8751 ++ if (bic == NULL)
8752 ++ return NULL;
8753 ++
8754 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8755 ++ if (bfqq != NULL) {
8756 ++ sector_t sector = bio->bi_sector + bio_sectors(bio);
8757 ++
8758 ++ return elv_rb_find(&bfqq->sort_list, sector);
8759 ++ }
8760 ++
8761 ++ return NULL;
8762 ++}
8763 ++
8764 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
8765 ++{
8766 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8767 ++
8768 ++ bfqd->rq_in_driver++;
8769 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
8770 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
8771 ++ (long long unsigned)bfqd->last_position);
8772 ++}
8773 ++
8774 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
8775 ++{
8776 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8777 ++
8778 ++ WARN_ON(bfqd->rq_in_driver == 0);
8779 ++ bfqd->rq_in_driver--;
8780 ++}
8781 ++
8782 ++static void bfq_remove_request(struct request *rq)
8783 ++{
8784 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8785 ++ struct bfq_data *bfqd = bfqq->bfqd;
8786 ++
8787 ++ if (bfqq->next_rq == rq) {
8788 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
8789 ++ bfq_updated_next_req(bfqd, bfqq);
8790 ++ }
8791 ++
8792 ++ list_del_init(&rq->queuelist);
8793 ++ bfq_del_rq_rb(rq);
8794 ++
8795 ++ if (rq->cmd_flags & REQ_META) {
8796 ++ WARN_ON(bfqq->meta_pending == 0);
8797 ++ bfqq->meta_pending--;
8798 ++ }
8799 ++}
8800 ++
8801 ++static int bfq_merge(struct request_queue *q, struct request **req,
8802 ++ struct bio *bio)
8803 ++{
8804 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8805 ++ struct request *__rq;
8806 ++
8807 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
8808 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
8809 ++ *req = __rq;
8810 ++ return ELEVATOR_FRONT_MERGE;
8811 ++ }
8812 ++
8813 ++ return ELEVATOR_NO_MERGE;
8814 ++}
8815 ++
8816 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
8817 ++ int type)
8818 ++{
8819 ++ if (type == ELEVATOR_FRONT_MERGE) {
8820 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
8821 ++
8822 ++ bfq_reposition_rq_rb(bfqq, req);
8823 ++ }
8824 ++}
8825 ++
8826 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
8827 ++ struct request *next)
8828 ++{
8829 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8830 ++
8831 ++ /*
8832 ++ * Reposition in fifo if next is older than rq.
8833 ++ */
8834 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
8835 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
8836 ++ list_move(&rq->queuelist, &next->queuelist);
8837 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
8838 ++ }
8839 ++
8840 ++ if (bfqq->next_rq == next)
8841 ++ bfqq->next_rq = rq;
8842 ++
8843 ++ bfq_remove_request(next);
8844 ++}
8845 ++
8846 ++/* Must be called with bfqq != NULL */
8847 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
8848 ++{
8849 ++ BUG_ON(bfqq == NULL);
8850 ++ bfqq->raising_coeff = 1;
8851 ++ bfqq->raising_cur_max_time = 0;
8852 ++ /* Trigger a weight change on the next activation of the queue */
8853 ++ bfqq->entity.ioprio_changed = 1;
8854 ++}
8855 ++
8856 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
8857 ++ struct bfq_group *bfqg)
8858 ++{
8859 ++ int i, j;
8860 ++
8861 ++ for (i = 0; i < 2; i++)
8862 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
8863 ++ if (bfqg->async_bfqq[i][j] != NULL)
8864 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
8865 ++ if (bfqg->async_idle_bfqq != NULL)
8866 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
8867 ++}
8868 ++
8869 ++static void bfq_end_raising(struct bfq_data *bfqd)
8870 ++{
8871 ++ struct bfq_queue *bfqq;
8872 ++
8873 ++ spin_lock_irq(bfqd->queue->queue_lock);
8874 ++
8875 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
8876 ++ bfq_bfqq_end_raising(bfqq);
8877 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
8878 ++ bfq_bfqq_end_raising(bfqq);
8879 ++ bfq_end_raising_async(bfqd);
8880 ++
8881 ++ spin_unlock_irq(bfqd->queue->queue_lock);
8882 ++}
8883 ++
8884 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
8885 ++ struct bio *bio)
8886 ++{
8887 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8888 ++ struct bfq_io_cq *bic;
8889 ++ struct bfq_queue *bfqq;
8890 ++
8891 ++ /*
8892 ++ * Disallow merge of a sync bio into an async request.
8893 ++ */
8894 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
8895 ++ return 0;
8896 ++
8897 ++ /*
8898 ++	 * Look up the bfqq that this bio will be queued with. Allow
8899 ++ * merge only if rq is queued there.
8900 ++ * Queue lock is held here.
8901 ++ */
8902 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
8903 ++ if (bic == NULL)
8904 ++ return 0;
8905 ++
8906 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8907 ++ return bfqq == RQ_BFQQ(rq);
8908 ++}
8909 ++
8910 ++static void __bfq_set_active_queue(struct bfq_data *bfqd,
8911 ++ struct bfq_queue *bfqq)
8912 ++{
8913 ++ if (bfqq != NULL) {
8914 ++ bfq_mark_bfqq_must_alloc(bfqq);
8915 ++ bfq_mark_bfqq_budget_new(bfqq);
8916 ++ bfq_clear_bfqq_fifo_expire(bfqq);
8917 ++
8918 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
8919 ++
8920 ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
8921 ++ bfqq->entity.budget);
8922 ++ }
8923 ++
8924 ++ bfqd->active_queue = bfqq;
8925 ++}
8926 ++
8927 ++/*
8928 ++ * Get and set a new active queue for service.
8929 ++ */
8930 ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
8931 ++ struct bfq_queue *bfqq)
8932 ++{
8933 ++ if (!bfqq)
8934 ++ bfqq = bfq_get_next_queue(bfqd);
8935 ++ else
8936 ++ bfq_get_next_queue_forced(bfqd, bfqq);
8937 ++
8938 ++ __bfq_set_active_queue(bfqd, bfqq);
8939 ++ return bfqq;
8940 ++}
8941 ++
8942 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
8943 ++ struct request *rq)
8944 ++{
8945 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
8946 ++ return blk_rq_pos(rq) - bfqd->last_position;
8947 ++ else
8948 ++ return bfqd->last_position - blk_rq_pos(rq);
8949 ++}
8950 ++
8951 ++/*
8952 ++ * Return true if bfqq has no request pending and rq is close enough to
8953 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
8954 ++ * bfqq->next_rq
8955 ++ */
8956 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
8957 ++{
8958 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
8959 ++}
8960 ++
8961 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
8962 ++{
8963 ++ struct rb_root *root = &bfqd->rq_pos_tree;
8964 ++ struct rb_node *parent, *node;
8965 ++ struct bfq_queue *__bfqq;
8966 ++ sector_t sector = bfqd->last_position;
8967 ++
8968 ++ if (RB_EMPTY_ROOT(root))
8969 ++ return NULL;
8970 ++
8971 ++ /*
8972 ++ * First, if we find a request starting at the end of the last
8973 ++ * request, choose it.
8974 ++ */
8975 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8976 ++ if (__bfqq != NULL)
8977 ++ return __bfqq;
8978 ++
8979 ++ /*
8980 ++ * If the exact sector wasn't found, the parent of the NULL leaf
8981 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
8982 ++ * position).
8983 ++ */
8984 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8985 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8986 ++ return __bfqq;
8987 ++
8988 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
8989 ++ node = rb_next(&__bfqq->pos_node);
8990 ++ else
8991 ++ node = rb_prev(&__bfqq->pos_node);
8992 ++ if (node == NULL)
8993 ++ return NULL;
8994 ++
8995 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8996 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8997 ++ return __bfqq;
8998 ++
8999 ++ return NULL;
9000 ++}
9001 ++
9002 ++/*
9003 ++ * bfqd - obvious
9004 ++ * cur_bfqq - passed in so that we don't decide that the current queue
9005 ++ * is closely cooperating with itself.
9006 ++ *
9007 ++ * We are assuming that cur_bfqq has dispatched at least one request,
9008 ++ * and that bfqd->last_position reflects a position on the disk associated
9009 ++ * with the I/O issued by cur_bfqq.
9010 ++ */
9011 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
9012 ++ struct bfq_queue *cur_bfqq)
9013 ++{
9014 ++ struct bfq_queue *bfqq;
9015 ++
9016 ++ if (bfq_class_idle(cur_bfqq))
9017 ++ return NULL;
9018 ++ if (!bfq_bfqq_sync(cur_bfqq))
9019 ++ return NULL;
9020 ++ if (BFQQ_SEEKY(cur_bfqq))
9021 ++ return NULL;
9022 ++
9023 ++ /* If device has only one backlogged bfq_queue, don't search. */
9024 ++ if (bfqd->busy_queues == 1)
9025 ++ return NULL;
9026 ++
9027 ++ /*
9028 ++ * We should notice if some of the queues are cooperating, e.g.
9029 ++ * working closely on the same area of the disk. In that case,
9030 ++	 * we can group them together and avoid wasting time idling.
9031 ++ */
9032 ++ bfqq = bfqq_close(bfqd);
9033 ++ if (bfqq == NULL || bfqq == cur_bfqq)
9034 ++ return NULL;
9035 ++
9036 ++ /*
9037 ++ * Do not merge queues from different bfq_groups.
9038 ++ */
9039 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
9040 ++ return NULL;
9041 ++
9042 ++ /*
9043 ++ * It only makes sense to merge sync queues.
9044 ++ */
9045 ++ if (!bfq_bfqq_sync(bfqq))
9046 ++ return NULL;
9047 ++ if (BFQQ_SEEKY(bfqq))
9048 ++ return NULL;
9049 ++
9050 ++ /*
9051 ++ * Do not merge queues of different priority classes.
9052 ++ */
9053 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
9054 ++ return NULL;
9055 ++
9056 ++ return bfqq;
9057 ++}
9058 ++
9059 ++/*
9060 ++ * If enough samples have been computed, return the current max budget
9061 ++ * stored in bfqd, which is dynamically updated according to the
9062 ++ * estimated disk peak rate; otherwise return the default max budget
9063 ++ */
9064 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
9065 ++{
9066 ++ if (bfqd->budgets_assigned < 194)
9067 ++ return bfq_default_max_budget;
9068 ++ else
9069 ++ return bfqd->bfq_max_budget;
9070 ++}
9071 ++
9072 ++/*
9073 ++ * Return min budget, which is a fraction of the current or default
9074 ++ * max budget (trying with 1/32)
9075 ++ */
9076 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
9077 ++{
9078 ++ if (bfqd->budgets_assigned < 194)
9079 ++ return bfq_default_max_budget / 32;
9080 ++ else
9081 ++ return bfqd->bfq_max_budget / 32;
9082 ++}
9083 ++
9084 ++/*
9085 ++ * Decides whether idling should be done for given device and
9086 ++ * given active queue.
9087 ++ */
9088 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
9089 ++ struct bfq_queue *active_bfqq)
9090 ++{
9091 ++ if (active_bfqq == NULL)
9092 ++ return false;
9093 ++ /*
9094 ++	 * If the device is an SSD it has no seek penalty: disable idling, but
9095 ++ * do so only if:
9096 ++ * - device does not support queuing, otherwise we still have
9097 ++ * a problem with sync vs async workloads;
9098 ++ * - the queue is not weight-raised, to preserve guarantees.
9099 ++ */
9100 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
9101 ++ active_bfqq->raising_coeff == 1);
9102 ++}
9103 ++
9104 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
9105 ++{
9106 ++ struct bfq_queue *bfqq = bfqd->active_queue;
9107 ++ struct bfq_io_cq *bic;
9108 ++ unsigned long sl;
9109 ++
9110 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
9111 ++
9112 ++ /* Tasks have exited, don't wait. */
9113 ++ bic = bfqd->active_bic;
9114 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
9115 ++ return;
9116 ++
9117 ++ bfq_mark_bfqq_wait_request(bfqq);
9118 ++
9119 ++ /*
9120 ++ * We don't want to idle for seeks, but we do want to allow
9121 ++ * fair distribution of slice time for a process doing back-to-back
9122 ++	 * seeks. So allow a little bit of time for it to submit a new rq.
9123 ++ *
9124 ++ * To prevent processes with (partly) seeky workloads from
9125 ++ * being too ill-treated, grant them a small fraction of the
9126 ++ * assigned budget before reducing the waiting time to
9127 ++ * BFQ_MIN_TT. This happened to help reduce latency.
9128 ++ */
9129 ++ sl = bfqd->bfq_slice_idle;
9130 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
9131 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
9132 ++ bfqq->raising_coeff == 1)
9133 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
9134 ++ else if (bfqq->raising_coeff > 1)
9135 ++ sl = sl * 3;
9136 ++ bfqd->last_idling_start = ktime_get();
9137 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
9138 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
9139 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
9140 ++}
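++/*
++ * Worked example (illustrative): with the default bfq_slice_idle of
++ * HZ / 125 (about 8 ms), a weight-raised queue is allowed to idle for up to
++ * three times that, about 24 ms, while a non-raised seeky queue that has
++ * already received more than 1/8 of the max budget has its idle window
++ * clamped to BFQ_MIN_TT, i.e. 2 ms.
++ */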
9141 ++
9142 ++/*
9143 ++ * Set the maximum time for the active queue to consume its
9144 ++ * budget. This prevents seeky processes from lowering the disk
9145 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
9146 ++ */
9147 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
9148 ++{
9149 ++ struct bfq_queue *bfqq = bfqd->active_queue;
9150 ++ unsigned int timeout_coeff;
9151 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
9152 ++ timeout_coeff = 1;
9153 ++ else
9154 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
9155 ++
9156 ++ bfqd->last_budget_start = ktime_get();
9157 ++
9158 ++ bfq_clear_bfqq_budget_new(bfqq);
9159 ++ bfqq->budget_timeout = jiffies +
9160 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
9161 ++
9162 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
9163 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
9164 ++ timeout_coeff));
9165 ++}
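++/*
++ * Example of the scaling above (illustrative): with
++ * timeout_coeff = weight / orig_weight, a queue whose weight is currently
++ * raised to, say, 10 times its original weight gets a budget timeout 10
++ * times the base bfq_timeout value, whereas during the soft real-time
++ * raising period the coefficient stays at 1.
++ */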
9166 ++
9167 ++/*
9168 ++ * Move request from internal lists to the request queue dispatch list.
9169 ++ */
9170 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
9171 ++{
9172 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9173 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9174 ++
9175 ++ bfq_remove_request(rq);
9176 ++ bfqq->dispatched++;
9177 ++ elv_dispatch_sort(q, rq);
9178 ++
9179 ++ if (bfq_bfqq_sync(bfqq))
9180 ++ bfqd->sync_flight++;
9181 ++}
9182 ++
9183 ++/*
9184 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
9185 ++ */
9186 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
9187 ++{
9188 ++ struct request *rq = NULL;
9189 ++
9190 ++ if (bfq_bfqq_fifo_expire(bfqq))
9191 ++ return NULL;
9192 ++
9193 ++ bfq_mark_bfqq_fifo_expire(bfqq);
9194 ++
9195 ++ if (list_empty(&bfqq->fifo))
9196 ++ return NULL;
9197 ++
9198 ++ rq = rq_entry_fifo(bfqq->fifo.next);
9199 ++
9200 ++ if (time_before(jiffies, rq_fifo_time(rq)))
9201 ++ return NULL;
9202 ++
9203 ++ return rq;
9204 ++}
9205 ++
9206 ++/*
9207 ++ * Must be called with the queue_lock held.
9208 ++ */
9209 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
9210 ++{
9211 ++ int process_refs, io_refs;
9212 ++
9213 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
9214 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
9215 ++ BUG_ON(process_refs < 0);
9216 ++ return process_refs;
9217 ++}
9218 ++
9219 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
9220 ++{
9221 ++ int process_refs, new_process_refs;
9222 ++ struct bfq_queue *__bfqq;
9223 ++
9224 ++ /*
9225 ++ * If there are no process references on the new_bfqq, then it is
9226 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
9227 ++ * may have dropped their last reference (not just their last process
9228 ++ * reference).
9229 ++ */
9230 ++ if (!bfqq_process_refs(new_bfqq))
9231 ++ return;
9232 ++
9233 ++ /* Avoid a circular list and skip interim queue merges. */
9234 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
9235 ++ if (__bfqq == bfqq)
9236 ++ return;
9237 ++ new_bfqq = __bfqq;
9238 ++ }
9239 ++
9240 ++ process_refs = bfqq_process_refs(bfqq);
9241 ++ new_process_refs = bfqq_process_refs(new_bfqq);
9242 ++ /*
9243 ++ * If the process for the bfqq has gone away, there is no
9244 ++ * sense in merging the queues.
9245 ++ */
9246 ++ if (process_refs == 0 || new_process_refs == 0)
9247 ++ return;
9248 ++
9249 ++ /*
9250 ++ * Merge in the direction of the lesser amount of work.
9251 ++ */
9252 ++ if (new_process_refs >= process_refs) {
9253 ++ bfqq->new_bfqq = new_bfqq;
9254 ++ atomic_add(process_refs, &new_bfqq->ref);
9255 ++ } else {
9256 ++ new_bfqq->new_bfqq = bfqq;
9257 ++ atomic_add(new_process_refs, &bfqq->ref);
9258 ++ }
9259 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
9260 ++ new_bfqq->pid);
9261 ++}
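++/*
++ * Example of the bookkeeping above (illustrative): if bfqq currently holds
++ * one process reference and new_bfqq holds three, then new_process_refs >=
++ * process_refs, so bfqq->new_bfqq is set to new_bfqq and new_bfqq gains
++ * bfqq's single process reference; this function only records the intended
++ * merge by setting ->new_bfqq and adjusting reference counts.
++ */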
9262 ++
9263 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
9264 ++{
9265 ++ struct bfq_entity *entity = &bfqq->entity;
9266 ++ return entity->budget - entity->service;
9267 ++}
9268 ++
9269 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9270 ++{
9271 ++ BUG_ON(bfqq != bfqd->active_queue);
9272 ++
9273 ++ __bfq_bfqd_reset_active(bfqd);
9274 ++
9275 ++ /*
9276 ++ * If this bfqq is shared between multiple processes, check
9277 ++ * to make sure that those processes are still issuing I/Os
9278 ++ * within the mean seek distance. If not, it may be time to
9279 ++ * break the queues apart again.
9280 ++ */
9281 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
9282 ++ bfq_mark_bfqq_split_coop(bfqq);
9283 ++
9284 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
9285 ++ /*
9286 ++		 * overloading the budget_timeout field to store the time
9287 ++		 * at which the queue is left with no backlog; used by
9288 ++		 * the weight-raising mechanism
9289 ++ */
9290 ++		bfqq->budget_timeout = jiffies;
9291 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
9292 ++ } else {
9293 ++ bfq_activate_bfqq(bfqd, bfqq);
9294 ++ /*
9295 ++ * Resort priority tree of potential close cooperators.
9296 ++ */
9297 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
9298 ++ }
9299 ++}
9300 ++
9301 ++/**
9302 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
9303 ++ * @bfqd: device data.
9304 ++ * @bfqq: queue to update.
9305 ++ * @reason: reason for expiration.
9306 ++ *
9307 ++ * Handle the feedback on @bfqq budget. See the body for detailed
9308 ++ * comments.
9309 ++ */
9310 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
9311 ++ struct bfq_queue *bfqq,
9312 ++ enum bfqq_expiration reason)
9313 ++{
9314 ++ struct request *next_rq;
9315 ++ unsigned long budget, min_budget;
9316 ++
9317 ++ budget = bfqq->max_budget;
9318 ++ min_budget = bfq_min_budget(bfqd);
9319 ++
9320 ++ BUG_ON(bfqq != bfqd->active_queue);
9321 ++
9322 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
9323 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
9324 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
9325 ++ budget, bfq_min_budget(bfqd));
9326 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
9327 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
9328 ++
9329 ++ if (bfq_bfqq_sync(bfqq)) {
9330 ++ switch (reason) {
9331 ++ /*
9332 ++ * Caveat: in all the following cases we trade latency
9333 ++ * for throughput.
9334 ++ */
9335 ++ case BFQ_BFQQ_TOO_IDLE:
9336 ++ /*
9337 ++ * This is the only case where we may reduce
9338 ++			 * the budget: if there are no requests of the
9339 ++ * process still waiting for completion, then
9340 ++ * we assume (tentatively) that the timer has
9341 ++ * expired because the batch of requests of
9342 ++ * the process could have been served with a
9343 ++			 * smaller budget. Hence, betting that the
9344 ++ * process will behave in the same way when it
9345 ++ * becomes backlogged again, we reduce its
9346 ++ * next budget. As long as we guess right,
9347 ++ * this budget cut reduces the latency
9348 ++ * experienced by the process.
9349 ++ *
9350 ++ * However, if there are still outstanding
9351 ++ * requests, then the process may have not yet
9352 ++ * issued its next request just because it is
9353 ++ * still waiting for the completion of some of
9354 ++			 * the still outstanding ones. So in this
9355 ++ * subcase we do not reduce its budget, on the
9356 ++ * contrary we increase it to possibly boost
9357 ++ * the throughput, as discussed in the
9358 ++ * comments to the BUDGET_TIMEOUT case.
9359 ++ */
9360 ++			if (bfqq->dispatched > 0) /* still outstanding reqs */
9361 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
9362 ++ else {
9363 ++ if (budget > 5 * min_budget)
9364 ++ budget -= 4 * min_budget;
9365 ++ else
9366 ++ budget = min_budget;
9367 ++ }
9368 ++ break;
9369 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
9370 ++ /*
9371 ++ * We double the budget here because: 1) it
9372 ++ * gives the chance to boost the throughput if
9373 ++ * this is not a seeky process (which may have
9374 ++ * bumped into this timeout because of, e.g.,
9375 ++ * ZBR), 2) together with charge_full_budget
9376 ++ * it helps give seeky processes higher
9377 ++ * timestamps, and hence be served less
9378 ++ * frequently.
9379 ++ */
9380 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
9381 ++ break;
9382 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
9383 ++ /*
9384 ++ * The process still has backlog, and did not
9385 ++ * let either the budget timeout or the disk
9386 ++ * idling timeout expire. Hence it is not
9387 ++ * seeky, has a short thinktime and may be
9388 ++ * happy with a higher budget too. So
9389 ++ * definitely increase the budget of this good
9390 ++ * candidate to boost the disk throughput.
9391 ++ */
9392 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
9393 ++ break;
9394 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
9395 ++ /*
9396 ++ * Leave the budget unchanged.
9397 ++ */
9398 ++ default:
9399 ++ return;
9400 ++ }
9401 ++ } else /* async queue */
9402 ++		/* async queues always get the maximum possible budget
9403 ++ * (their ability to dispatch is limited by
9404 ++ * @bfqd->bfq_max_budget_async_rq).
9405 ++ */
9406 ++ budget = bfqd->bfq_max_budget;
9407 ++
9408 ++ bfqq->max_budget = budget;
9409 ++
9410 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
9411 ++ bfqq->max_budget > bfqd->bfq_max_budget)
9412 ++ bfqq->max_budget = bfqd->bfq_max_budget;
9413 ++
9414 ++ /*
9415 ++ * Make sure that we have enough budget for the next request.
9416 ++ * Since the finish time of the bfqq must be kept in sync with
9417 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
9418 ++ * update.
9419 ++ */
9420 ++ next_rq = bfqq->next_rq;
9421 ++ if (next_rq != NULL)
9422 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
9423 ++ bfq_serv_to_charge(next_rq, bfqq));
9424 ++ else
9425 ++ bfqq->entity.budget = bfqq->max_budget;
9426 ++
9427 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
9428 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
9429 ++ bfqq->entity.budget);
9430 ++}
9431 ++
9432 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
9433 ++{
9434 ++ unsigned long max_budget;
9435 ++
9436 ++ /*
9437 ++ * The max_budget calculated when autotuning is equal to the
9438 ++	 * number of sectors transferred in timeout_sync at the
9439 ++ * estimated peak rate.
9440 ++ */
9441 ++ max_budget = (unsigned long)(peak_rate * 1000 *
9442 ++ timeout >> BFQ_RATE_SHIFT);
9443 ++
9444 ++ return max_budget;
9445 ++}
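++/*
++ * Worked example (illustrative arithmetic): peak_rate is kept in
++ * sectors/usec left-shifted by BFQ_RATE_SHIFT and timeout is in ms, so the
++ * expression above yields plain sectors. Taking, hypothetically, the
++ * reference rate R_rot = 17415 as the estimated peak rate and the default
++ * sync timeout of about 125 ms (HZ / 8), the autotuned budget would be
++ * roughly (17415 * 1000 * 125) >> 16, i.e. about 33000 sectors (~16 MiB).
++ */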
9446 ++
9447 ++/*
9448 ++ * In addition to updating the peak rate, checks whether the process
9449 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
9450 ++ * to the budget timeout, to reduce the amount of service provided to
9451 ++ * seeky processes, and hence reduce their chances to lower the
9452 ++ * throughput. See the code for more details.
9453 ++ */
9454 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9455 ++ int compensate, enum bfqq_expiration reason)
9456 ++{
9457 ++ u64 bw, usecs, expected, timeout;
9458 ++ ktime_t delta;
9459 ++ int update = 0;
9460 ++
9461 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
9462 ++ return 0;
9463 ++
9464 ++ if (compensate)
9465 ++ delta = bfqd->last_idling_start;
9466 ++ else
9467 ++ delta = ktime_get();
9468 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
9469 ++ usecs = ktime_to_us(delta);
9470 ++
9471 ++ /* Don't trust short/unrealistic values. */
9472 ++ if (usecs < 100 || usecs >= LONG_MAX)
9473 ++ return 0;
9474 ++
9475 ++ /*
9476 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
9477 ++ * value to store the peak rate, in sectors per usec in fixed
9478 ++ * point math. We do so to have enough precision in the estimate
9479 ++ * and to avoid overflows.
9480 ++ */
9481 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
9482 ++ do_div(bw, (unsigned long)usecs);
9483 ++
9484 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
9485 ++
9486 ++ /*
9487 ++ * Use only long (> 20ms) intervals to filter out spikes for
9488 ++ * the peak rate estimation.
9489 ++ */
9490 ++ if (usecs > 20000) {
9491 ++ if (bw > bfqd->peak_rate ||
9492 ++ (!BFQQ_SEEKY(bfqq) &&
9493 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
9494 ++ bfq_log(bfqd, "measured bw =%llu", bw);
9495 ++ /*
9496 ++ * To smooth oscillations use a low-pass filter with
9497 ++ * alpha=7/8, i.e.,
9498 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
9499 ++ */
9500 ++ do_div(bw, 8);
9501 ++ if (bw == 0)
9502 ++ return 0;
9503 ++ bfqd->peak_rate *= 7;
9504 ++ do_div(bfqd->peak_rate, 8);
9505 ++ bfqd->peak_rate += bw;
9506 ++ update = 1;
9507 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
9508 ++ }
9509 ++
9510 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
9511 ++
9512 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
9513 ++ bfqd->peak_rate_samples++;
9514 ++
9515 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
9516 ++ update && bfqd->bfq_user_max_budget == 0) {
9517 ++ bfqd->bfq_max_budget =
9518 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
9519 ++ bfq_log(bfqd, "new max_budget=%lu",
9520 ++ bfqd->bfq_max_budget);
9521 ++ }
9522 ++ }
9523 ++
9524 ++ /*
9525 ++	 * If the process has been served for too short a time
9526 ++ * interval to let its possible sequential accesses prevail on
9527 ++ * the initial seek time needed to move the disk head on the
9528 ++ * first sector it requested, then give the process a chance
9529 ++ * and for the moment return false.
9530 ++ */
9531 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
9532 ++ return 0;
9533 ++
9534 ++ /*
9535 ++ * A process is considered ``slow'' (i.e., seeky, so that we
9536 ++ * cannot treat it fairly in the service domain, as it would
9537 ++ * slow down the other processes too much) if, when a slice
9538 ++ * ends for whatever reason, it has received service at a
9539 ++ * rate that would not be high enough to complete the budget
9540 ++ * before the budget timeout expiration.
9541 ++ */
9542 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
9543 ++
9544 ++ /*
9545 ++ * Caveat: processes doing IO in the slower disk zones will
9546 ++ * tend to be slow(er) even if not seeky. And the estimated
9547 ++ * peak rate will actually be an average over the disk
9548 ++ * surface. Hence, to not be too harsh with unlucky processes,
9549 ++ * we keep a budget/3 margin of safety before declaring a
9550 ++ * process slow.
9551 ++ */
9552 ++ return expected > (4 * bfqq->entity.budget) / 3;
9553 ++}
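++
++/*
++ * Worked example of the low-pass filter above (illustrative numbers, not
++ * taken from the patch): with an old peak_rate of 800 and a measured bw
++ * of 1600, both in the same fixed-point units, the update computes
++ * 800 * 7 / 8 + 1600 / 8 = 700 + 200 = 900, so a single sample moves the
++ * estimate only 1/8 of the way towards the new measurement.
++ */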
9554 ++
9555 ++/**
9556 ++ * bfq_bfqq_expire - expire a queue.
9557 ++ * @bfqd: device owning the queue.
9558 ++ * @bfqq: the queue to expire.
9559 ++ * @compensate: if true, compensate for the time spent idling.
9560 ++ * @reason: the reason causing the expiration.
9561 ++ *
9563 ++ * If the process associated to the queue is slow (i.e., seeky), or in
9564 ++ * case of budget timeout, or, finally, if it is async, we
9565 ++ * artificially charge it an entire budget (independently of the
9566 ++ * actual service it received). As a consequence, the queue will get
9567 ++ * higher timestamps than the correct ones upon reactivation, and
9568 ++ * hence it will be rescheduled as if it had received more service
9569 ++ * than what it actually received. In the end, this class of processes
9570 ++ * will receive less service in proportion to how slowly they consume
9571 ++ * their budgets (and hence how seriously they tend to lower the
9572 ++ * throughput).
9573 ++ *
9574 ++ * In contrast, when a queue expires because it has been idling for
9575 ++ * too much or because it exhausted its budget, we do not touch the
9576 ++ * too long or because it exhausted its budget, we do not touch the
9577 ++ * reactivated and its timestamps updated, the latter will be in sync
9578 ++ * with the actual service received by the queue until expiration.
9579 ++ *
9580 ++ * Charging a full budget to the first type of queues and the exact
9581 ++ * service to the others has the effect of using the WF2Q+ policy to
9582 ++ * schedule the former on a timeslice basis, without violating the
9583 ++ * service domain guarantees of the latter.
9584 ++ */
9585 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
9586 ++ struct bfq_queue *bfqq,
9587 ++ int compensate,
9588 ++ enum bfqq_expiration reason)
9589 ++{
9590 ++ int slow;
9591 ++ BUG_ON(bfqq != bfqd->active_queue);
9592 ++
9593 ++ /* Update disk peak rate for autotuning and check whether the
9594 ++ * process is slow (see bfq_update_peak_rate).
9595 ++ */
9596 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
9597 ++
9598 ++ /*
9599 ++ * As explained above, 'punish' slow (i.e., seeky), timed-out
9600 ++ * and async queues, to favor sequential sync workloads.
9601 ++ *
9602 ++ * Processes doing IO in the slower disk zones will tend to be
9603 ++ * slow(er) even if not seeky. Hence, since the estimated peak
9604 ++ * rate is actually an average over the disk surface, these
9605 ++ * processes may timeout just for bad luck. To avoid punishing
9606 ++ * them we do not charge a full budget to a process that
9607 ++ * succeeded in consuming at least 2/3 of its budget.
9608 ++ */
9609 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
9610 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
9611 ++ bfq_bfqq_charge_full_budget(bfqq);
9612 ++
9613 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
9614 ++ bfqq->last_rais_start_finish = jiffies;
9615 ++
9616 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
9617 ++		if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
9618 ++ bfqq->soft_rt_next_start =
9619 ++ jiffies +
9620 ++ HZ * bfqq->entity.service /
9621 ++ bfqd->bfq_raising_max_softrt_rate;
9622 ++ else
9623 ++ bfqq->soft_rt_next_start = -1; /* infinity */
9624 ++ }
9625 ++ bfq_log_bfqq(bfqd, bfqq,
9626 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
9627 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
9628 ++
9629 ++ /* Increase, decrease or leave budget unchanged according to reason */
9630 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
9631 ++ __bfq_bfqq_expire(bfqd, bfqq);
9632 ++}
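++
++/*
++ * Note on the soft_rt_next_start computation above: with the default
++ * bfq_raising_max_softrt_rate of 7000 sectors/sec set in bfq_init_queue()
++ * and, say, 2048 sectors of service received (an assumed figure), the
++ * queue becomes eligible again for soft real-time weight-raising about
++ * HZ * 2048 / 7000 jiffies, i.e. roughly 0.3 seconds, after expiration.
++ */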
9633 ++
9634 ++/*
9635 ++ * Budget timeout is not implemented through a dedicated timer, but
9636 ++ * just checked on request arrivals and completions, as well as on
9637 ++ * idle timer expirations.
9638 ++ */
9639 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
9640 ++{
9641 ++ if (bfq_bfqq_budget_new(bfqq))
9642 ++ return 0;
9643 ++
9644 ++ if (time_before(jiffies, bfqq->budget_timeout))
9645 ++ return 0;
9646 ++
9647 ++ return 1;
9648 ++}
9649 ++
9650 ++/*
9651 ++ * If we expire a queue that is waiting for the arrival of a new
9652 ++ * request, we may prevent the fictitious timestamp backshifting that
9653 ++ * allows the guarantees of the queue to be preserved (see [1] for
9654 ++ * this tricky aspect). Hence we return true only if this condition
9655 ++ * does not hold, or if the queue is slow enough to deserve only to be
9656 ++ * kicked off for preserving a high throughput.
9657 ++ */
9658 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
9659 ++{
9660 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
9661 ++ "may_budget_timeout: wr %d left %d timeout %d",
9662 ++ bfq_bfqq_wait_request(bfqq),
9663 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
9664 ++ bfq_bfqq_budget_timeout(bfqq));
9665 ++
9666 ++ return (!bfq_bfqq_wait_request(bfqq) ||
9667 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
9668 ++ &&
9669 ++ bfq_bfqq_budget_timeout(bfqq);
9670 ++}
9671 ++
9672 ++/*
9673 ++ * If the active queue is empty, but it is sync and either of the following
9674 ++ * conditions holds, then: 1) the queue must remain active and cannot be
9675 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
9676 ++ * of a new request for the queue. The conditions are:
9677 ++ * - the device is rotational and not performing NCQ, and the queue has its
9678 ++ * idle window set (in this case, waiting for a new request for the queue
9679 ++ * is likely to boost the disk throughput);
9680 ++ * - the queue is weight-raised (waiting for the request is necessary for
9681 ++ * providing the queue with fairness and latency guarantees).
9682 ++ */
9683 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
9684 ++ int budg_timeout)
9685 ++{
9686 ++ struct bfq_data *bfqd = bfqq->bfqd;
9687 ++
9688 ++ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
9689 ++ bfqd->bfq_slice_idle != 0 &&
9690 ++ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
9691 ++ !blk_queue_nonrot(bfqd->queue))
9692 ++ || bfqq->raising_coeff > 1) &&
9693 ++ (bfqd->rq_in_driver == 0 ||
9694 ++ budg_timeout ||
9695 ++ bfqq->raising_coeff > 1) &&
9696 ++ !bfq_close_cooperator(bfqd, bfqq) &&
9697 ++ (!bfq_bfqq_coop(bfqq) ||
9698 ++ !bfq_bfqq_some_coop_idle(bfqq)) &&
9699 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
9700 ++}
9701 ++
9702 ++/*
9703 ++ * Select a queue for service. If we have a current active queue,
9704 ++ * check whether to continue servicing it, or retrieve and set a new one.
9705 ++ */
9706 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
9707 ++{
9708 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
9709 ++ struct request *next_rq;
9710 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9711 ++ int budg_timeout;
9712 ++
9713 ++ bfqq = bfqd->active_queue;
9714 ++ if (bfqq == NULL)
9715 ++ goto new_queue;
9716 ++
9717 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
9718 ++
9719 ++ /*
9720 ++ * If another queue has a request waiting within our mean seek
9721 ++ * distance, let it run. The expire code will check for close
9722 ++ * cooperators and put the close queue at the front of the
9723 ++ * service tree. If possible, merge the expiring queue with the
9724 ++ * new bfqq.
9725 ++ */
9726 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
9727 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
9728 ++ bfq_setup_merge(bfqq, new_bfqq);
9729 ++
9730 ++ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
9731 ++ if (budg_timeout &&
9732 ++ !bfq_bfqq_must_idle(bfqq, budg_timeout))
9733 ++ goto expire;
9734 ++
9735 ++ next_rq = bfqq->next_rq;
9736 ++ /*
9737 ++ * If bfqq has requests queued and it has enough budget left to
9738 ++ * serve them, keep the queue, otherwise expire it.
9739 ++ */
9740 ++ if (next_rq != NULL) {
9741 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
9742 ++ bfq_bfqq_budget_left(bfqq)) {
9743 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
9744 ++ goto expire;
9745 ++ } else {
9746 ++ /*
9747 ++ * The idle timer may be pending because we may not
9748 ++ * disable disk idling even when a new request arrives
9749 ++ */
9750 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
9751 ++ /*
9752 ++ * If we get here: 1) at least a new request
9753 ++ * has arrived but we have not disabled the
9754 ++ * timer because the request was too small,
9755 ++ * 2) then the block layer has unplugged the
9756 ++ * device, causing the dispatch to be invoked.
9757 ++ *
9758 ++ * Since the device is unplugged, now the
9759 ++ * requests are probably large enough to
9760 ++ * provide a reasonable throughput.
9761 ++ * So we disable idling.
9762 ++ */
9763 ++ bfq_clear_bfqq_wait_request(bfqq);
9764 ++ del_timer(&bfqd->idle_slice_timer);
9765 ++ }
9766 ++ if (new_bfqq == NULL)
9767 ++ goto keep_queue;
9768 ++ else
9769 ++ goto expire;
9770 ++ }
9771 ++ }
9772 ++
9773 ++ /*
9774 ++ * No requests pending. If there is no cooperator, and the active
9775 ++ * queue still has requests in flight or is idling for a new request,
9776 ++ * then keep it.
9777 ++ */
9778 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
9779 ++ (bfqq->dispatched != 0 &&
9780 ++ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
9781 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
9782 ++ bfqq = NULL;
9783 ++ goto keep_queue;
9784 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
9785 ++ /*
9786 ++		 * The queue is expiring because there is a close cooperator;
9787 ++		 * cancel the timer.
9788 ++ */
9789 ++ bfq_clear_bfqq_wait_request(bfqq);
9790 ++ del_timer(&bfqd->idle_slice_timer);
9791 ++ }
9792 ++
9793 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
9794 ++expire:
9795 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
9796 ++new_queue:
9797 ++ bfqq = bfq_set_active_queue(bfqd, new_bfqq);
9798 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
9799 ++ bfqq != NULL ? bfqq->pid : 0);
9800 ++keep_queue:
9801 ++ return bfqq;
9802 ++}
9803 ++
9804 ++static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9805 ++{
9806 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
9807 ++ struct bfq_entity *entity = &bfqq->entity;
9808 ++
9809 ++ bfq_log_bfqq(bfqd, bfqq,
9810 ++ "raising period dur %u/%u msec, "
9811 ++ "old raising coeff %u, w %d(%d)",
9812 ++ jiffies_to_msecs(jiffies -
9813 ++ bfqq->last_rais_start_finish),
9814 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
9815 ++ bfqq->raising_coeff,
9816 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
9817 ++
9818 ++ BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
9819 ++ entity->orig_weight * bfqq->raising_coeff);
9820 ++		if (entity->ioprio_changed)
9821 ++ bfq_log_bfqq(bfqd, bfqq,
9822 ++ "WARN: pending prio change");
9823 ++ /*
9824 ++ * If too much time has elapsed from the beginning
9825 ++ * of this weight-raising period and process is not soft
9826 ++		 * of this weight-raising period and the process is not soft
9827 ++ */
9828 ++ if (jiffies - bfqq->last_rais_start_finish >
9829 ++ bfqq->raising_cur_max_time) {
9830 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
9831 ++ bfqq->soft_rt_next_start < jiffies;
9832 ++
9833 ++ bfqq->last_rais_start_finish = jiffies;
9834 ++ if (soft_rt)
9835 ++ bfqq->raising_cur_max_time =
9836 ++ bfqd->bfq_raising_rt_max_time;
9837 ++ else {
9838 ++ bfq_log_bfqq(bfqd, bfqq,
9839 ++					"wrais ending at %llu msec, "
9840 ++ "rais_max_time %u",
9841 ++ bfqq->last_rais_start_finish,
9842 ++ jiffies_to_msecs(bfqq->
9843 ++ raising_cur_max_time));
9844 ++ bfq_bfqq_end_raising(bfqq);
9845 ++ __bfq_entity_update_weight_prio(
9846 ++ bfq_entity_service_tree(entity),
9847 ++ entity);
9848 ++ }
9849 ++ }
9850 ++ }
9851 ++}
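++
++/*
++ * For reference, with the defaults set in bfq_init_queue() below
++ * (bfq_raising_coeff == 20), a weight-raised queue is scheduled with
++ * twenty times its original weight; once raising_cur_max_time has elapsed,
++ * the check above calls bfq_bfqq_end_raising(), unless the queue qualifies
++ * as soft real-time, and the weight falls back to entity->orig_weight.
++ */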
9852 ++
9853 ++/*
9854 ++ * Dispatch one request from bfqq, moving it to the request queue
9855 ++ * dispatch list.
9856 ++ */
9857 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
9858 ++ struct bfq_queue *bfqq)
9859 ++{
9860 ++ int dispatched = 0;
9861 ++ struct request *rq;
9862 ++ unsigned long service_to_charge;
9863 ++
9864 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
9865 ++
9866 ++ /* Follow expired path, else get first next available. */
9867 ++ rq = bfq_check_fifo(bfqq);
9868 ++ if (rq == NULL)
9869 ++ rq = bfqq->next_rq;
9870 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
9871 ++
9872 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
9873 ++ /*
9874 ++ * This may happen if the next rq is chosen
9875 ++ * in fifo order instead of sector order.
9876 ++ * The budget is properly dimensioned
9877 ++ * to be always sufficient to serve the next request
9878 ++ * only if it is chosen in sector order. The reason is
9879 ++		 * that it would be quite inefficient and of little use
9880 ++ * to always make sure that the budget is large enough
9881 ++ * to serve even the possible next rq in fifo order.
9882 ++ * In fact, requests are seldom served in fifo order.
9883 ++ *
9884 ++ * Expire the queue for budget exhaustion, and
9885 ++ * make sure that the next act_budget is enough
9886 ++ * to serve the next request, even if it comes
9887 ++ * from the fifo expired path.
9888 ++ */
9889 ++ bfqq->next_rq = rq;
9890 ++ /*
9891 ++		 * Since this dispatch has failed, make sure that
9892 ++ * a new one will be performed
9893 ++ */
9894 ++ if (!bfqd->rq_in_driver)
9895 ++ bfq_schedule_dispatch(bfqd);
9896 ++ goto expire;
9897 ++ }
9898 ++
9899 ++ /* Finally, insert request into driver dispatch list. */
9900 ++ bfq_bfqq_served(bfqq, service_to_charge);
9901 ++ bfq_dispatch_insert(bfqd->queue, rq);
9902 ++
9903 ++ update_raising_data(bfqd, bfqq);
9904 ++
9905 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
9906 ++ "budg left %lu",
9907 ++ blk_rq_sectors(rq),
9908 ++ (long long unsigned)blk_rq_pos(rq),
9909 ++ bfq_bfqq_budget_left(bfqq));
9910 ++
9911 ++ dispatched++;
9912 ++
9913 ++ if (bfqd->active_bic == NULL) {
9914 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9915 ++ bfqd->active_bic = RQ_BIC(rq);
9916 ++ }
9917 ++
9918 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9919 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9920 ++ bfq_class_idle(bfqq)))
9921 ++ goto expire;
9922 ++
9923 ++ return dispatched;
9924 ++
9925 ++expire:
9926 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9927 ++ return dispatched;
9928 ++}
9929 ++
9930 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9931 ++{
9932 ++ int dispatched = 0;
9933 ++
9934 ++ while (bfqq->next_rq != NULL) {
9935 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9936 ++ dispatched++;
9937 ++ }
9938 ++
9939 ++ BUG_ON(!list_empty(&bfqq->fifo));
9940 ++ return dispatched;
9941 ++}
9942 ++
9943 ++/*
9944 ++ * Drain our current requests. Used for barriers and when switching
9945 ++ * io schedulers on-the-fly.
9946 ++ */
9947 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
9948 ++{
9949 ++ struct bfq_queue *bfqq, *n;
9950 ++ struct bfq_service_tree *st;
9951 ++ int dispatched = 0;
9952 ++
9953 ++ bfqq = bfqd->active_queue;
9954 ++ if (bfqq != NULL)
9955 ++ __bfq_bfqq_expire(bfqd, bfqq);
9956 ++
9957 ++ /*
9958 ++ * Loop through classes, and be careful to leave the scheduler
9959 ++ * in a consistent state, as feedback mechanisms and vtime
9960 ++ * updates cannot be disabled during the process.
9961 ++ */
9962 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9963 ++ st = bfq_entity_service_tree(&bfqq->entity);
9964 ++
9965 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9966 ++ bfqq->max_budget = bfq_max_budget(bfqd);
9967 ++
9968 ++ bfq_forget_idle(st);
9969 ++ }
9970 ++
9971 ++ BUG_ON(bfqd->busy_queues != 0);
9972 ++
9973 ++ return dispatched;
9974 ++}
9975 ++
9976 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
9977 ++{
9978 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9979 ++ struct bfq_queue *bfqq;
9980 ++ int max_dispatch;
9981 ++
9982 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9983 ++ if (bfqd->busy_queues == 0)
9984 ++ return 0;
9985 ++
9986 ++ if (unlikely(force))
9987 ++ return bfq_forced_dispatch(bfqd);
9988 ++
9989 ++	if ((bfqq = bfq_select_queue(bfqd)) == NULL)
9990 ++ return 0;
9991 ++
9992 ++ max_dispatch = bfqd->bfq_quantum;
9993 ++ if (bfq_class_idle(bfqq))
9994 ++ max_dispatch = 1;
9995 ++
9996 ++ if (!bfq_bfqq_sync(bfqq))
9997 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
9998 ++
9999 ++ if (bfqq->dispatched >= max_dispatch) {
10000 ++ if (bfqd->busy_queues > 1)
10001 ++ return 0;
10002 ++ if (bfqq->dispatched >= 4 * max_dispatch)
10003 ++ return 0;
10004 ++ }
10005 ++
10006 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
10007 ++ return 0;
10008 ++
10009 ++ bfq_clear_bfqq_wait_request(bfqq);
10010 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10011 ++
10012 ++	if (!bfq_dispatch_request(bfqd, bfqq))
10013 ++ return 0;
10014 ++
10015 ++	bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d "
10016 ++ "(max_disp %d)", bfqq->pid, max_dispatch);
10017 ++
10018 ++ return 1;
10019 ++}
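++
++/*
++ * Dispatch cap illustration (assuming bfq_quantum keeps its usual default
++ * of 4, defined elsewhere in this patch): a sync queue is normally capped
++ * at 4 requests in flight, but when it is the only busy queue the check
++ * above lets it reach 4 * max_dispatch = 16 before dispatch is throttled.
++ */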
10020 ++
10021 ++/*
10022 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
10023 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
10024 ++ *
10025 ++ * Queue lock must be held here.
10026 ++ */
10027 ++static void bfq_put_queue(struct bfq_queue *bfqq)
10028 ++{
10029 ++ struct bfq_data *bfqd = bfqq->bfqd;
10030 ++
10031 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
10032 ++
10033 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
10034 ++ atomic_read(&bfqq->ref));
10035 ++ if (!atomic_dec_and_test(&bfqq->ref))
10036 ++ return;
10037 ++
10038 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
10039 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
10040 ++ BUG_ON(bfqq->entity.tree != NULL);
10041 ++ BUG_ON(bfq_bfqq_busy(bfqq));
10042 ++ BUG_ON(bfqd->active_queue == bfqq);
10043 ++
10044 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
10045 ++
10046 ++ kmem_cache_free(bfq_pool, bfqq);
10047 ++}
10048 ++
10049 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
10050 ++{
10051 ++ struct bfq_queue *__bfqq, *next;
10052 ++
10053 ++ /*
10054 ++ * If this queue was scheduled to merge with another queue, be
10055 ++ * sure to drop the reference taken on that queue (and others in
10056 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
10057 ++ */
10058 ++ __bfqq = bfqq->new_bfqq;
10059 ++ while (__bfqq) {
10060 ++ if (__bfqq == bfqq) {
10061 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
10062 ++ break;
10063 ++ }
10064 ++ next = __bfqq->new_bfqq;
10065 ++ bfq_put_queue(__bfqq);
10066 ++ __bfqq = next;
10067 ++ }
10068 ++}
10069 ++
10070 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
10071 ++{
10072 ++ if (bfqq == bfqd->active_queue) {
10073 ++ __bfq_bfqq_expire(bfqd, bfqq);
10074 ++ bfq_schedule_dispatch(bfqd);
10075 ++ }
10076 ++
10077 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
10078 ++ atomic_read(&bfqq->ref));
10079 ++
10080 ++ bfq_put_cooperator(bfqq);
10081 ++
10082 ++ bfq_put_queue(bfqq);
10083 ++}
10084 ++
10085 ++static void bfq_init_icq(struct io_cq *icq)
10086 ++{
10087 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
10088 ++
10089 ++ bic->ttime.last_end_request = jiffies;
10090 ++}
10091 ++
10092 ++static void bfq_exit_icq(struct io_cq *icq)
10093 ++{
10094 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
10095 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
10096 ++
10097 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
10098 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
10099 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
10100 ++ }
10101 ++
10102 ++ if (bic->bfqq[BLK_RW_SYNC]) {
10103 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
10104 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
10105 ++ }
10106 ++}
10107 ++
10108 ++/*
10109 ++ * Update the entity prio values; note that the new values will not
10110 ++ * be used until the next (re)activation.
10111 ++ */
10112 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
10113 ++{
10114 ++ struct task_struct *tsk = current;
10115 ++ int ioprio_class;
10116 ++
10117 ++ if (!bfq_bfqq_prio_changed(bfqq))
10118 ++ return;
10119 ++
10120 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
10121 ++ switch (ioprio_class) {
10122 ++ default:
10123 ++ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
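++		/* fall through */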
10124 ++ case IOPRIO_CLASS_NONE:
10125 ++ /*
10126 ++ * No prio set, inherit CPU scheduling settings.
10127 ++ */
10128 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
10129 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
10130 ++ break;
10131 ++ case IOPRIO_CLASS_RT:
10132 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
10133 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
10134 ++ break;
10135 ++ case IOPRIO_CLASS_BE:
10136 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
10137 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
10138 ++ break;
10139 ++ case IOPRIO_CLASS_IDLE:
10140 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
10141 ++ bfqq->entity.new_ioprio = 7;
10142 ++ bfq_clear_bfqq_idle_window(bfqq);
10143 ++ break;
10144 ++ }
10145 ++
10146 ++ bfqq->entity.ioprio_changed = 1;
10147 ++
10148 ++ /*
10149 ++ * Keep track of original prio settings in case we have to temporarily
10150 ++ * elevate the priority of this queue.
10151 ++ */
10152 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
10153 ++ bfq_clear_bfqq_prio_changed(bfqq);
10154 ++}
10155 ++
10156 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
10157 ++{
10158 ++ struct bfq_data *bfqd;
10159 ++ struct bfq_queue *bfqq, *new_bfqq;
10160 ++ struct bfq_group *bfqg;
10161 ++ unsigned long uninitialized_var(flags);
10162 ++ int ioprio = bic->icq.ioc->ioprio;
10163 ++
10164 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
10165 ++ /*
10166 ++	 * This condition may trigger on a newly created bic; be sure to drop the
10167 ++ * lock before returning.
10168 ++ */
10169 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
10170 ++ goto out;
10171 ++
10172 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
10173 ++ if (bfqq != NULL) {
10174 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
10175 ++ sched_data);
10176 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
10177 ++ GFP_ATOMIC);
10178 ++ if (new_bfqq != NULL) {
10179 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
10180 ++ bfq_log_bfqq(bfqd, bfqq,
10181 ++ "changed_ioprio: bfqq %p %d",
10182 ++ bfqq, atomic_read(&bfqq->ref));
10183 ++ bfq_put_queue(bfqq);
10184 ++ }
10185 ++ }
10186 ++
10187 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
10188 ++ if (bfqq != NULL)
10189 ++ bfq_mark_bfqq_prio_changed(bfqq);
10190 ++
10191 ++ bic->ioprio = ioprio;
10192 ++
10193 ++out:
10194 ++ bfq_put_bfqd_unlock(bfqd, &flags);
10195 ++}
10196 ++
10197 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
10198 ++ pid_t pid, int is_sync)
10199 ++{
10200 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
10201 ++ INIT_LIST_HEAD(&bfqq->fifo);
10202 ++
10203 ++ atomic_set(&bfqq->ref, 0);
10204 ++ bfqq->bfqd = bfqd;
10205 ++
10206 ++ bfq_mark_bfqq_prio_changed(bfqq);
10207 ++
10208 ++ if (is_sync) {
10209 ++ if (!bfq_class_idle(bfqq))
10210 ++ bfq_mark_bfqq_idle_window(bfqq);
10211 ++ bfq_mark_bfqq_sync(bfqq);
10212 ++ }
10213 ++
10214 ++	/* Tentative initial value to trade off between throughput and latency */
10215 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
10216 ++ bfqq->pid = pid;
10217 ++
10218 ++ bfqq->raising_coeff = 1;
10219 ++ bfqq->last_rais_start_finish = 0;
10220 ++ bfqq->soft_rt_next_start = -1;
10221 ++}
10222 ++
10223 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
10224 ++ struct bfq_group *bfqg,
10225 ++ int is_sync,
10226 ++ struct bfq_io_cq *bic,
10227 ++ gfp_t gfp_mask)
10228 ++{
10229 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
10230 ++
10231 ++retry:
10232 ++ /* bic always exists here */
10233 ++ bfqq = bic_to_bfqq(bic, is_sync);
10234 ++
10235 ++ /*
10236 ++	 * Always try a new alloc if we fell back to the OOM bfqq
10237 ++ * originally, since it should just be a temporary situation.
10238 ++ */
10239 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
10240 ++ bfqq = NULL;
10241 ++ if (new_bfqq != NULL) {
10242 ++ bfqq = new_bfqq;
10243 ++ new_bfqq = NULL;
10244 ++ } else if (gfp_mask & __GFP_WAIT) {
10245 ++ spin_unlock_irq(bfqd->queue->queue_lock);
10246 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
10247 ++ gfp_mask | __GFP_ZERO,
10248 ++ bfqd->queue->node);
10249 ++ spin_lock_irq(bfqd->queue->queue_lock);
10250 ++ if (new_bfqq != NULL)
10251 ++ goto retry;
10252 ++ } else {
10253 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
10254 ++ gfp_mask | __GFP_ZERO,
10255 ++ bfqd->queue->node);
10256 ++ }
10257 ++
10258 ++ if (bfqq != NULL) {
10259 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
10260 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
10261 ++ } else {
10262 ++ bfqq = &bfqd->oom_bfqq;
10263 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
10264 ++ }
10265 ++
10266 ++ bfq_init_prio_data(bfqq, bic);
10267 ++ bfq_init_entity(&bfqq->entity, bfqg);
10268 ++ }
10269 ++
10270 ++ if (new_bfqq != NULL)
10271 ++ kmem_cache_free(bfq_pool, new_bfqq);
10272 ++
10273 ++ return bfqq;
10274 ++}
10275 ++
10276 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
10277 ++ struct bfq_group *bfqg,
10278 ++ int ioprio_class, int ioprio)
10279 ++{
10280 ++ switch (ioprio_class) {
10281 ++ case IOPRIO_CLASS_RT:
10282 ++ return &bfqg->async_bfqq[0][ioprio];
10283 ++ case IOPRIO_CLASS_NONE:
10284 ++ ioprio = IOPRIO_NORM;
10285 ++ /* fall through */
10286 ++ case IOPRIO_CLASS_BE:
10287 ++ return &bfqg->async_bfqq[1][ioprio];
10288 ++ case IOPRIO_CLASS_IDLE:
10289 ++ return &bfqg->async_idle_bfqq;
10290 ++ default:
10291 ++ BUG();
10292 ++ }
10293 ++}
10294 ++
10295 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
10296 ++ struct bfq_group *bfqg, int is_sync,
10297 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
10298 ++{
10299 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
10300 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
10301 ++ struct bfq_queue **async_bfqq = NULL;
10302 ++ struct bfq_queue *bfqq = NULL;
10303 ++
10304 ++ if (!is_sync) {
10305 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
10306 ++ ioprio);
10307 ++ bfqq = *async_bfqq;
10308 ++ }
10309 ++
10310 ++ if (bfqq == NULL)
10311 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
10312 ++
10313 ++ /*
10314 ++	 * Pin the queue now that it's allocated; scheduler exit will prune it.
10315 ++ */
10316 ++ if (!is_sync && *async_bfqq == NULL) {
10317 ++ atomic_inc(&bfqq->ref);
10318 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
10319 ++ bfqq, atomic_read(&bfqq->ref));
10320 ++ *async_bfqq = bfqq;
10321 ++ }
10322 ++
10323 ++ atomic_inc(&bfqq->ref);
10324 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
10325 ++ atomic_read(&bfqq->ref));
10326 ++ return bfqq;
10327 ++}
10328 ++
10329 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
10330 ++ struct bfq_io_cq *bic)
10331 ++{
10332 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
10333 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
10334 ++
10335 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
10336 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
10337 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
10338 ++}
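++
++/*
++ * The think-time statistics above form a 1/8-weight exponential moving
++ * average kept in fixed point: ttime_samples converges to 256 and, for a
++ * steady per-request think time t, ttime_total converges to 256 * t, so
++ * ttime_mean converges to t. For example, with a hypothetical steady
++ * t == 4 jiffies: total -> 1024, samples -> 256, mean = (1024 + 128) / 256 = 4.
++ */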
10339 ++
10340 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
10341 ++ struct bfq_queue *bfqq,
10342 ++ struct request *rq)
10343 ++{
10344 ++ sector_t sdist;
10345 ++ u64 total;
10346 ++
10347 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
10348 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
10349 ++ else
10350 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
10351 ++
10352 ++ /*
10353 ++ * Don't allow the seek distance to get too large from the
10354 ++ * odd fragment, pagein, etc.
10355 ++ */
10356 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
10357 ++ sdist = 0;
10358 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
10359 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
10360 ++ else
10361 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
10362 ++
10363 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
10364 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
10365 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
10366 ++ do_div(total, bfqq->seek_samples);
10367 ++ if (bfq_bfqq_coop(bfqq)) {
10368 ++ /*
10369 ++ * If the mean seektime increases for a (non-seeky) shared
10370 ++ * queue, some cooperator is likely to be idling too much.
10371 ++		 * Conversely, if it decreases, some cooperator has probably
10372 ++		 * woken up.
10374 ++ */
10375 ++ if ((sector_t)total < bfqq->seek_mean)
10376 ++			bfq_mark_bfqq_some_coop_idle(bfqq);
10377 ++		else if ((sector_t)total > bfqq->seek_mean)
10378 ++			bfq_clear_bfqq_some_coop_idle(bfqq);
10379 ++ }
10380 ++ bfqq->seek_mean = (sector_t)total;
10381 ++
10382 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
10383 ++ (u64)bfqq->seek_mean);
10384 ++}
10385 ++
10386 ++/*
10387 ++ * Disable idle window if the process thinks too long or seeks so much that
10388 ++ * it doesn't matter.
10389 ++ */
10390 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
10391 ++ struct bfq_queue *bfqq,
10392 ++ struct bfq_io_cq *bic)
10393 ++{
10394 ++ int enable_idle;
10395 ++
10396 ++ /* Don't idle for async or idle io prio class. */
10397 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
10398 ++ return;
10399 ++
10400 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
10401 ++
10402 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
10403 ++ bfqd->bfq_slice_idle == 0 ||
10404 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
10405 ++ bfqq->raising_coeff == 1))
10406 ++ enable_idle = 0;
10407 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
10408 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
10409 ++ bfqq->raising_coeff == 1)
10410 ++ enable_idle = 0;
10411 ++ else
10412 ++ enable_idle = 1;
10413 ++ }
10414 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
10415 ++ enable_idle);
10416 ++
10417 ++ if (enable_idle)
10418 ++ bfq_mark_bfqq_idle_window(bfqq);
10419 ++ else
10420 ++ bfq_clear_bfqq_idle_window(bfqq);
10421 ++}
10422 ++
10423 ++/*
10424 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
10425 ++ * something we should do about it.
10426 ++ */
10427 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
10428 ++ struct request *rq)
10429 ++{
10430 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
10431 ++
10432 ++ if (rq->cmd_flags & REQ_META)
10433 ++ bfqq->meta_pending++;
10434 ++
10435 ++ bfq_update_io_thinktime(bfqd, bic);
10436 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
10437 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
10438 ++ !BFQQ_SEEKY(bfqq))
10439 ++ bfq_update_idle_window(bfqd, bfqq, bic);
10440 ++
10441 ++ bfq_log_bfqq(bfqd, bfqq,
10442 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
10443 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
10444 ++ (long long unsigned)bfqq->seek_mean);
10445 ++
10446 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
10447 ++
10448 ++ if (bfqq == bfqd->active_queue) {
10449 ++ /*
10450 ++ * If there is just this request queued and the request
10451 ++ * is small, just exit.
10452 ++ * In this way, if the disk is being idled to wait for a new
10453 ++ * request from the active queue, we avoid unplugging the
10454 ++ * device now.
10455 ++ *
10456 ++		 * By doing so, we avoid committing the disk to serve
10457 ++		 * just a small request. Instead, we wait for
10458 ++ * the block layer to decide when to unplug the device:
10459 ++ * hopefully, new requests will be merged to this
10460 ++ * one quickly, then the device will be unplugged
10461 ++ * and larger requests will be dispatched.
10462 ++ */
10463 ++ if (bfqq->queued[rq_is_sync(rq)] == 1 &&
10464 ++ blk_rq_sectors(rq) < 32) {
10465 ++ return;
10466 ++ }
10467 ++ if (bfq_bfqq_wait_request(bfqq)) {
10468 ++ /*
10469 ++ * If we are waiting for a request for this queue, let
10470 ++ * it rip immediately and flag that we must not expire
10471 ++ * this queue just now.
10472 ++ */
10473 ++ bfq_clear_bfqq_wait_request(bfqq);
10474 ++ del_timer(&bfqd->idle_slice_timer);
10475 ++ /*
10476 ++ * Here we can safely expire the queue, in
10477 ++ * case of budget timeout, without wasting
10478 ++ * guarantees
10479 ++ */
10480 ++ if (bfq_bfqq_budget_timeout(bfqq))
10481 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
10482 ++ BFQ_BFQQ_BUDGET_TIMEOUT);
10483 ++ __blk_run_queue(bfqd->queue);
10484 ++ }
10485 ++ }
10486 ++}
10487 ++
10488 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
10489 ++{
10490 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
10491 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
10492 ++
10493 ++ assert_spin_locked(bfqd->queue->queue_lock);
10494 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
10495 ++
10496 ++ bfq_add_rq_rb(rq);
10497 ++
10498 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
10499 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
10500 ++
10501 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
10502 ++}
10503 ++
10504 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
10505 ++{
10506 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
10507 ++ bfqd->rq_in_driver);
10508 ++
10509 ++ if (bfqd->hw_tag == 1)
10510 ++ return;
10511 ++
10512 ++ /*
10513 ++ * This sample is valid if the number of outstanding requests
10514 ++ * is large enough to allow a queueing behavior. Note that the
10515 ++ * sum is not exact, as it's not taking into account deactivated
10516 ++ * requests.
10517 ++ */
10518 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
10519 ++ return;
10520 ++
10521 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
10522 ++ return;
10523 ++
10524 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
10525 ++ bfqd->max_rq_in_driver = 0;
10526 ++ bfqd->hw_tag_samples = 0;
10527 ++}
10528 ++
10529 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
10530 ++{
10531 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
10532 ++ struct bfq_data *bfqd = bfqq->bfqd;
10533 ++ const int sync = rq_is_sync(rq);
10534 ++
10535 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
10536 ++ blk_rq_sectors(rq), sync);
10537 ++
10538 ++ bfq_update_hw_tag(bfqd);
10539 ++
10540 ++ WARN_ON(!bfqd->rq_in_driver);
10541 ++ WARN_ON(!bfqq->dispatched);
10542 ++ bfqd->rq_in_driver--;
10543 ++ bfqq->dispatched--;
10544 ++
10545 ++ if (bfq_bfqq_sync(bfqq))
10546 ++ bfqd->sync_flight--;
10547 ++
10548 ++ if (sync)
10549 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
10550 ++
10551 ++ /*
10552 ++ * If this is the active queue, check if it needs to be expired,
10553 ++ * or if we want to idle in case it has no pending requests.
10554 ++ */
10555 ++ if (bfqd->active_queue == bfqq) {
10556 ++ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
10557 ++ if (bfq_bfqq_budget_new(bfqq))
10558 ++ bfq_set_budget_timeout(bfqd);
10559 ++
10560 ++ /* Idling is disabled also for cooperation issues:
10561 ++ * 1) there is a close cooperator for the queue, or
10562 ++ * 2) the queue is shared and some cooperator is likely
10563 ++ * to be idle (in this case, by not arming the idle timer,
10564 ++ * we try to slow down the queue, to prevent the zones
10565 ++		 * of the disk accessed by the active cooperators from becoming
10566 ++ * too distant from the zone that will be accessed by the
10567 ++ * currently idle cooperators)
10568 ++ */
10569 ++ if (bfq_bfqq_must_idle(bfqq, budg_timeout))
10570 ++ bfq_arm_slice_timer(bfqd);
10571 ++ else if (budg_timeout)
10572 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
10573 ++ }
10574 ++
10575 ++ if (!bfqd->rq_in_driver)
10576 ++ bfq_schedule_dispatch(bfqd);
10577 ++}
10578 ++
10579 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
10580 ++{
10581 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
10582 ++ bfq_clear_bfqq_must_alloc(bfqq);
10583 ++ return ELV_MQUEUE_MUST;
10584 ++ }
10585 ++
10586 ++ return ELV_MQUEUE_MAY;
10587 ++}
10588 ++
10589 ++static int bfq_may_queue(struct request_queue *q, int rw)
10590 ++{
10591 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
10592 ++ struct task_struct *tsk = current;
10593 ++ struct bfq_io_cq *bic;
10594 ++ struct bfq_queue *bfqq;
10595 ++
10596 ++ /*
10597 ++ * Don't force setup of a queue from here, as a call to may_queue
10598 ++ * does not necessarily imply that a request actually will be queued.
10599 ++ * So just lookup a possibly existing queue, or return 'may queue'
10600 ++ * if that fails.
10601 ++ */
10602 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
10603 ++ if (bic == NULL)
10604 ++ return ELV_MQUEUE_MAY;
10605 ++
10606 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
10607 ++ if (bfqq != NULL) {
10608 ++ bfq_init_prio_data(bfqq, bic);
10609 ++
10610 ++ return __bfq_may_queue(bfqq);
10611 ++ }
10612 ++
10613 ++ return ELV_MQUEUE_MAY;
10614 ++}
10615 ++
10616 ++/*
10617 ++ * Queue lock held here.
10618 ++ */
10619 ++static void bfq_put_request(struct request *rq)
10620 ++{
10621 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
10622 ++
10623 ++ if (bfqq != NULL) {
10624 ++ const int rw = rq_data_dir(rq);
10625 ++
10626 ++ BUG_ON(!bfqq->allocated[rw]);
10627 ++ bfqq->allocated[rw]--;
10628 ++
10629 ++ rq->elv.priv[0] = NULL;
10630 ++ rq->elv.priv[1] = NULL;
10631 ++
10632 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
10633 ++ bfqq, atomic_read(&bfqq->ref));
10634 ++ bfq_put_queue(bfqq);
10635 ++ }
10636 ++}
10637 ++
10638 ++static struct bfq_queue *
10639 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
10640 ++ struct bfq_queue *bfqq)
10641 ++{
10642 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
10643 ++ (long unsigned)bfqq->new_bfqq->pid);
10644 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
10645 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
10646 ++ bfq_put_queue(bfqq);
10647 ++ return bic_to_bfqq(bic, 1);
10648 ++}
10649 ++
10650 ++/*
10651 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
10652 ++ * was the last process referring to said bfqq.
10653 ++ */
10654 ++static struct bfq_queue *
10655 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
10656 ++{
10657 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
10658 ++ if (bfqq_process_refs(bfqq) == 1) {
10659 ++ bfqq->pid = current->pid;
10660 ++ bfq_clear_bfqq_some_coop_idle(bfqq);
10661 ++ bfq_clear_bfqq_coop(bfqq);
10662 ++ bfq_clear_bfqq_split_coop(bfqq);
10663 ++ return bfqq;
10664 ++ }
10665 ++
10666 ++ bic_set_bfqq(bic, NULL, 1);
10667 ++
10668 ++ bfq_put_cooperator(bfqq);
10669 ++
10670 ++ bfq_put_queue(bfqq);
10671 ++ return NULL;
10672 ++}
10673 ++
10674 ++/*
10675 ++ * Allocate bfq data structures associated with this request.
10676 ++ */
10677 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
10678 ++ struct bio *bio, gfp_t gfp_mask)
10679 ++{
10680 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
10681 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
10682 ++ const int rw = rq_data_dir(rq);
10683 ++ const int is_sync = rq_is_sync(rq);
10684 ++ struct bfq_queue *bfqq;
10685 ++ struct bfq_group *bfqg;
10686 ++ unsigned long flags;
10687 ++
10688 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
10689 ++
10690 ++ bfq_changed_ioprio(bic);
10691 ++
10692 ++ spin_lock_irqsave(q->queue_lock, flags);
10693 ++
10694 ++ if (bic == NULL)
10695 ++ goto queue_fail;
10696 ++
10697 ++ bfqg = bfq_bic_update_cgroup(bic);
10698 ++
10699 ++new_queue:
10700 ++ bfqq = bic_to_bfqq(bic, is_sync);
10701 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
10702 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
10703 ++ bic_set_bfqq(bic, bfqq, is_sync);
10704 ++ } else {
10705 ++ /*
10706 ++ * If the queue was seeky for too long, break it apart.
10707 ++ */
10708 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
10709 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
10710 ++ bfqq = bfq_split_bfqq(bic, bfqq);
10711 ++ if (!bfqq)
10712 ++ goto new_queue;
10713 ++ }
10714 ++
10715 ++ /*
10716 ++ * Check to see if this queue is scheduled to merge with
10717 ++ * another closely cooperating queue. The merging of queues
10718 ++ * happens here as it must be done in process context.
10719 ++ * The reference on new_bfqq was taken in merge_bfqqs.
10720 ++ */
10721 ++ if (bfqq->new_bfqq != NULL)
10722 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
10723 ++ }
10724 ++
10725 ++ bfqq->allocated[rw]++;
10726 ++ atomic_inc(&bfqq->ref);
10727 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
10728 ++ atomic_read(&bfqq->ref));
10729 ++
10730 ++ rq->elv.priv[0] = bic;
10731 ++ rq->elv.priv[1] = bfqq;
10732 ++
10733 ++ spin_unlock_irqrestore(q->queue_lock, flags);
10734 ++
10735 ++ return 0;
10736 ++
10737 ++queue_fail:
10738 ++ bfq_schedule_dispatch(bfqd);
10739 ++ spin_unlock_irqrestore(q->queue_lock, flags);
10740 ++
10741 ++ return 1;
10742 ++}
10743 ++
10744 ++static void bfq_kick_queue(struct work_struct *work)
10745 ++{
10746 ++ struct bfq_data *bfqd =
10747 ++ container_of(work, struct bfq_data, unplug_work);
10748 ++ struct request_queue *q = bfqd->queue;
10749 ++
10750 ++ spin_lock_irq(q->queue_lock);
10751 ++ __blk_run_queue(q);
10752 ++ spin_unlock_irq(q->queue_lock);
10753 ++}
10754 ++
10755 ++/*
10756 ++ * Handler of the expiration of the timer running if the active_queue
10757 ++ * is idling inside its time slice.
10758 ++ */
10759 ++static void bfq_idle_slice_timer(unsigned long data)
10760 ++{
10761 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
10762 ++ struct bfq_queue *bfqq;
10763 ++ unsigned long flags;
10764 ++ enum bfqq_expiration reason;
10765 ++
10766 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
10767 ++
10768 ++ bfqq = bfqd->active_queue;
10769 ++ /*
10770 ++ * Theoretical race here: active_queue can be NULL or different
10771 ++ * from the queue that was idling if the timer handler spins on
10772 ++ * the queue_lock and a new request arrives for the current
10773 ++ * queue and there is a full dispatch cycle that changes the
10774 ++ * active_queue. This can hardly happen, but in the worst case
10775 ++ * we just expire a queue too early.
10776 ++ */
10777 ++ if (bfqq != NULL) {
10778 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
10779 ++ if (bfq_bfqq_budget_timeout(bfqq))
10780 ++ /*
10781 ++ * Also here the queue can be safely expired
10782 ++ * for budget timeout without wasting
10783 ++ * guarantees
10784 ++ */
10785 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
10786 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
10787 ++ /*
10788 ++ * The queue may not be empty upon timer expiration,
10789 ++ * because we may not disable the timer when the first
10790 ++ * request of the active queue arrives during
10791 ++ * disk idling
10792 ++ */
10793 ++ reason = BFQ_BFQQ_TOO_IDLE;
10794 ++ else
10795 ++ goto schedule_dispatch;
10796 ++
10797 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
10798 ++ }
10799 ++
10800 ++schedule_dispatch:
10801 ++ bfq_schedule_dispatch(bfqd);
10802 ++
10803 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
10804 ++}
10805 ++
10806 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
10807 ++{
10808 ++ del_timer_sync(&bfqd->idle_slice_timer);
10809 ++ cancel_work_sync(&bfqd->unplug_work);
10810 ++}
10811 ++
10812 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
10813 ++ struct bfq_queue **bfqq_ptr)
10814 ++{
10815 ++ struct bfq_group *root_group = bfqd->root_group;
10816 ++ struct bfq_queue *bfqq = *bfqq_ptr;
10817 ++
10818 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
10819 ++ if (bfqq != NULL) {
10820 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
10821 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
10822 ++ bfqq, atomic_read(&bfqq->ref));
10823 ++ bfq_put_queue(bfqq);
10824 ++ *bfqq_ptr = NULL;
10825 ++ }
10826 ++}
10827 ++
10828 ++/*
10829 ++ * Release all the bfqg references to its async queues. If we are
10830 ++ * deallocating the group, these queues may still contain requests, so
10831 ++ * we reparent them to the root cgroup (i.e., the only one that will
10832 ++ * exist for sure until all the requests on a device are gone).
10833 ++ */
10834 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
10835 ++{
10836 ++ int i, j;
10837 ++
10838 ++ for (i = 0; i < 2; i++)
10839 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
10840 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
10841 ++
10842 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
10843 ++}
10844 ++
10845 ++static void bfq_exit_queue(struct elevator_queue *e)
10846 ++{
10847 ++ struct bfq_data *bfqd = e->elevator_data;
10848 ++ struct request_queue *q = bfqd->queue;
10849 ++ struct bfq_queue *bfqq, *n;
10850 ++
10851 ++ bfq_shutdown_timer_wq(bfqd);
10852 ++
10853 ++ spin_lock_irq(q->queue_lock);
10854 ++
10855 ++ BUG_ON(bfqd->active_queue != NULL);
10856 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
10857 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
10858 ++
10859 ++ bfq_disconnect_groups(bfqd);
10860 ++ spin_unlock_irq(q->queue_lock);
10861 ++
10862 ++ bfq_shutdown_timer_wq(bfqd);
10863 ++
10864 ++ synchronize_rcu();
10865 ++
10866 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10867 ++
10868 ++ bfq_free_root_group(bfqd);
10869 ++ kfree(bfqd);
10870 ++}
10871 ++
10872 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
10873 ++{
10874 ++ struct bfq_group *bfqg;
10875 ++ struct bfq_data *bfqd;
10876 ++ struct elevator_queue *eq;
10877 ++
10878 ++ eq = elevator_alloc(q, e);
10879 ++ if (eq == NULL)
10880 ++ return -ENOMEM;
10881 ++
10882 ++ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
10883 ++ if (bfqd == NULL) {
10884 ++ kobject_put(&eq->kobj);
10885 ++ return -ENOMEM;
10886 ++ }
10887 ++ eq->elevator_data = bfqd;
10888 ++
10889 ++ /*
10890 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
10891 ++ * Grab a permanent reference to it, so that the normal code flow
10892 ++ * will not attempt to free it.
10893 ++ */
10894 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
10895 ++ atomic_inc(&bfqd->oom_bfqq.ref);
10896 ++
10897 ++ bfqd->queue = q;
10898 ++
10899 ++ spin_lock_irq(q->queue_lock);
10900 ++ q->elevator = eq;
10901 ++ spin_unlock_irq(q->queue_lock);
10902 ++
10903 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
10904 ++ if (bfqg == NULL) {
10905 ++ kfree(bfqd);
10906 ++ kobject_put(&eq->kobj);
10907 ++ return -ENOMEM;
10908 ++ }
10909 ++
10910 ++ bfqd->root_group = bfqg;
10911 ++
10912 ++ init_timer(&bfqd->idle_slice_timer);
10913 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10914 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10915 ++
10916 ++ bfqd->rq_pos_tree = RB_ROOT;
10917 ++
10918 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10919 ++
10920 ++ INIT_LIST_HEAD(&bfqd->active_list);
10921 ++ INIT_LIST_HEAD(&bfqd->idle_list);
10922 ++
10923 ++ bfqd->hw_tag = -1;
10924 ++
10925 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
10926 ++
10927 ++ bfqd->bfq_quantum = bfq_quantum;
10928 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10929 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10930 ++ bfqd->bfq_back_max = bfq_back_max;
10931 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
10932 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
10933 ++ bfqd->bfq_class_idle_last_service = 0;
10934 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10935 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10936 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10937 ++
10938 ++ bfqd->low_latency = true;
10939 ++
10940 ++ bfqd->bfq_raising_coeff = 20;
10941 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10942 ++ bfqd->bfq_raising_max_time = 0;
10943 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10944 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10945 ++ bfqd->bfq_raising_max_softrt_rate = 7000;
10946 ++
10947 ++ /* Initially estimate the device's peak rate as the reference rate */
10948 ++ if (blk_queue_nonrot(bfqd->queue)) {
10949 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
10950 ++ bfqd->peak_rate = R_nonrot;
10951 ++ } else {
10952 ++ bfqd->RT_prod = R_rot * T_rot;
10953 ++ bfqd->peak_rate = R_rot;
10954 ++ }
10955 ++
10956 ++ return 0;
10957 ++}
10958 ++
10959 ++static void bfq_slab_kill(void)
10960 ++{
10961 ++ if (bfq_pool != NULL)
10962 ++ kmem_cache_destroy(bfq_pool);
10963 ++}
10964 ++
10965 ++static int __init bfq_slab_setup(void)
10966 ++{
10967 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10968 ++ if (bfq_pool == NULL)
10969 ++ return -ENOMEM;
10970 ++ return 0;
10971 ++}
10972 ++
10973 ++static ssize_t bfq_var_show(unsigned int var, char *page)
10974 ++{
10975 ++ return sprintf(page, "%d\n", var);
10976 ++}
10977 ++
10978 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10979 ++{
10980 ++ unsigned long new_val;
10981 ++ int ret = strict_strtoul(page, 10, &new_val);
10982 ++
10983 ++ if (ret == 0)
10984 ++ *var = new_val;
10985 ++
10986 ++ return count;
10987 ++}
10988 ++
10989 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10990 ++{
10991 ++ struct bfq_data *bfqd = e->elevator_data;
10992 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10993 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10994 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10995 ++}
10996 ++
10997 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10998 ++{
10999 ++ struct bfq_queue *bfqq;
11000 ++ struct bfq_data *bfqd = e->elevator_data;
11001 ++ ssize_t num_char = 0;
11002 ++
11003 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
11004 ++ bfqd->queued);
11005 ++
11006 ++ spin_lock_irq(bfqd->queue->queue_lock);
11007 ++
11008 ++ num_char += sprintf(page + num_char, "Active:\n");
11009 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
11010 ++ num_char += sprintf(page + num_char,
11011 ++ "pid%d: weight %hu, nr_queued %d %d,"
11012 ++ " dur %d/%u\n",
11013 ++ bfqq->pid,
11014 ++ bfqq->entity.weight,
11015 ++ bfqq->queued[0],
11016 ++ bfqq->queued[1],
11017 ++ jiffies_to_msecs(jiffies -
11018 ++ bfqq->last_rais_start_finish),
11019 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
11020 ++ }
11021 ++
11022 ++ num_char += sprintf(page + num_char, "Idle:\n");
11023 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
11024 ++ num_char += sprintf(page + num_char,
11025 ++ "pid%d: weight %hu, dur %d/%u\n",
11026 ++ bfqq->pid,
11027 ++ bfqq->entity.weight,
11028 ++ jiffies_to_msecs(jiffies -
11029 ++ bfqq->last_rais_start_finish),
11030 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
11031 ++ }
11032 ++
11033 ++ spin_unlock_irq(bfqd->queue->queue_lock);
11034 ++
11035 ++ return num_char;
11036 ++}
11037 ++
11038 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
11039 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
11040 ++{ \
11041 ++ struct bfq_data *bfqd = e->elevator_data; \
11042 ++ unsigned int __data = __VAR; \
11043 ++ if (__CONV) \
11044 ++ __data = jiffies_to_msecs(__data); \
11045 ++ return bfq_var_show(__data, (page)); \
11046 ++}
11047 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
11048 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
11049 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
11050 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
11051 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
11052 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
11053 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
11054 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
11055 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
11056 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
11057 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
11058 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
11059 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
11060 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
11061 ++ 1);
11062 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
11063 ++ bfqd->bfq_raising_min_inter_arr_async,
11064 ++ 1);
11065 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
11066 ++ bfqd->bfq_raising_max_softrt_rate, 0);
11067 ++#undef SHOW_FUNCTION
11068 ++
11069 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
11070 ++static ssize_t \
11071 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
11072 ++{ \
11073 ++ struct bfq_data *bfqd = e->elevator_data; \
11074 ++ unsigned long uninitialized_var(__data); \
11075 ++ int ret = bfq_var_store(&__data, (page), count); \
11076 ++ if (__data < (MIN)) \
11077 ++ __data = (MIN); \
11078 ++ else if (__data > (MAX)) \
11079 ++ __data = (MAX); \
11080 ++ if (__CONV) \
11081 ++ *(__PTR) = msecs_to_jiffies(__data); \
11082 ++ else \
11083 ++ *(__PTR) = __data; \
11084 ++ return ret; \
11085 ++}
11086 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
11087 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
11088 ++ INT_MAX, 1);
11089 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
11090 ++ INT_MAX, 1);
11091 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
11092 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
11093 ++ INT_MAX, 0);
11094 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
11095 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
11096 ++ 1, INT_MAX, 0);
11097 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
11098 ++ INT_MAX, 1);
11099 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
11100 ++ INT_MAX, 0);
11101 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
11102 ++ INT_MAX, 1);
11103 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
11104 ++ INT_MAX, 1);
11105 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
11106 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
11107 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
11108 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
11109 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
11110 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
11111 ++#undef STORE_FUNCTION
11112 ++
11113 ++/* do nothing for the moment */
11114 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
11115 ++ const char *page, size_t count)
11116 ++{
11117 ++ return count;
11118 ++}
11119 ++
11120 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
11121 ++{
11122 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
11123 ++
11124 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
11125 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
11126 ++ else
11127 ++ return bfq_default_max_budget;
11128 ++}
11129 ++
11130 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
11131 ++ const char *page, size_t count)
11132 ++{
11133 ++ struct bfq_data *bfqd = e->elevator_data;
11134 ++ unsigned long uninitialized_var(__data);
11135 ++ int ret = bfq_var_store(&__data, (page), count);
11136 ++
11137 ++ if (__data == 0)
11138 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
11139 ++ else {
11140 ++ if (__data > INT_MAX)
11141 ++ __data = INT_MAX;
11142 ++ bfqd->bfq_max_budget = __data;
11143 ++ }
11144 ++
11145 ++ bfqd->bfq_user_max_budget = __data;
11146 ++
11147 ++ return ret;
11148 ++}
11149 ++
11150 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
11151 ++ const char *page, size_t count)
11152 ++{
11153 ++ struct bfq_data *bfqd = e->elevator_data;
11154 ++ unsigned long uninitialized_var(__data);
11155 ++ int ret = bfq_var_store(&__data, (page), count);
11156 ++
11157 ++ if (__data < 1)
11158 ++ __data = 1;
11159 ++ else if (__data > INT_MAX)
11160 ++ __data = INT_MAX;
11161 ++
11162 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
11163 ++ if (bfqd->bfq_user_max_budget == 0)
11164 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
11165 ++
11166 ++ return ret;
11167 ++}
11168 ++
11169 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
11170 ++ const char *page, size_t count)
11171 ++{
11172 ++ struct bfq_data *bfqd = e->elevator_data;
11173 ++ unsigned long uninitialized_var(__data);
11174 ++ int ret = bfq_var_store(&__data, (page), count);
11175 ++
11176 ++ if (__data > 1)
11177 ++ __data = 1;
11178 ++ if (__data == 0 && bfqd->low_latency != 0)
11179 ++ bfq_end_raising(bfqd);
11180 ++ bfqd->low_latency = __data;
11181 ++
11182 ++ return ret;
11183 ++}
11184 ++
11185 ++#define BFQ_ATTR(name) \
11186 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
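++/*
++ * BFQ_ATTR(slice_idle), for instance, expands to an __ATTR() entry exposing
++ * a "slice_idle" file with 0644 permissions, backed by handlers named
++ * bfq_slice_idle_show() and bfq_slice_idle_store() (the latter generated by
++ * STORE_FUNCTION() above).
++ */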
11187 ++
11188 ++static struct elv_fs_entry bfq_attrs[] = {
11189 ++ BFQ_ATTR(quantum),
11190 ++ BFQ_ATTR(fifo_expire_sync),
11191 ++ BFQ_ATTR(fifo_expire_async),
11192 ++ BFQ_ATTR(back_seek_max),
11193 ++ BFQ_ATTR(back_seek_penalty),
11194 ++ BFQ_ATTR(slice_idle),
11195 ++ BFQ_ATTR(max_budget),
11196 ++ BFQ_ATTR(max_budget_async_rq),
11197 ++ BFQ_ATTR(timeout_sync),
11198 ++ BFQ_ATTR(timeout_async),
11199 ++ BFQ_ATTR(low_latency),
11200 ++ BFQ_ATTR(raising_coeff),
11201 ++ BFQ_ATTR(raising_max_time),
11202 ++ BFQ_ATTR(raising_rt_max_time),
11203 ++ BFQ_ATTR(raising_min_idle_time),
11204 ++ BFQ_ATTR(raising_min_inter_arr_async),
11205 ++ BFQ_ATTR(raising_max_softrt_rate),
11206 ++ BFQ_ATTR(weights),
11207 ++ __ATTR_NULL
11208 ++};
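++/*
++ * While bfq is the active elevator these attributes appear under
++ * /sys/block/<device>/queue/iosched/; writing 0 to the low_latency file,
++ * for example, reaches bfq_low_latency_store(), which calls
++ * bfq_end_raising() to stop any weight raising in progress.
++ */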
11209 ++
11210 ++static struct elevator_type iosched_bfq = {
11211 ++ .ops = {
11212 ++ .elevator_merge_fn = bfq_merge,
11213 ++ .elevator_merged_fn = bfq_merged_request,
11214 ++ .elevator_merge_req_fn = bfq_merged_requests,
11215 ++ .elevator_allow_merge_fn = bfq_allow_merge,
11216 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
11217 ++ .elevator_add_req_fn = bfq_insert_request,
11218 ++ .elevator_activate_req_fn = bfq_activate_request,
11219 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
11220 ++ .elevator_completed_req_fn = bfq_completed_request,
11221 ++ .elevator_former_req_fn = elv_rb_former_request,
11222 ++ .elevator_latter_req_fn = elv_rb_latter_request,
11223 ++ .elevator_init_icq_fn = bfq_init_icq,
11224 ++ .elevator_exit_icq_fn = bfq_exit_icq,
11225 ++ .elevator_set_req_fn = bfq_set_request,
11226 ++ .elevator_put_req_fn = bfq_put_request,
11227 ++ .elevator_may_queue_fn = bfq_may_queue,
11228 ++ .elevator_init_fn = bfq_init_queue,
11229 ++ .elevator_exit_fn = bfq_exit_queue,
11230 ++ },
11231 ++ .icq_size = sizeof(struct bfq_io_cq),
11232 ++ .icq_align = __alignof__(struct bfq_io_cq),
11233 ++ .elevator_attrs = bfq_attrs,
11234 ++ .elevator_name = "bfq",
11235 ++ .elevator_owner = THIS_MODULE,
11236 ++};
11237 ++
11238 ++static int __init bfq_init(void)
11239 ++{
11240 ++ /*
11241 ++ * Can be 0 on HZ < 1000 setups.
11242 ++ */
11243 ++ if (bfq_slice_idle == 0)
11244 ++ bfq_slice_idle = 1;
11245 ++
11246 ++ if (bfq_timeout_async == 0)
11247 ++ bfq_timeout_async = 1;
11248 ++
11249 ++ if (bfq_slab_setup())
11250 ++ return -ENOMEM;
11251 ++
11252 ++ elv_register(&iosched_bfq);
11253 ++
11254 ++ return 0;
11255 ++}
11256 ++
11257 ++static void __exit bfq_exit(void)
11258 ++{
11259 ++ elv_unregister(&iosched_bfq);
11260 ++ bfq_slab_kill();
11261 ++}
11262 ++
11263 ++module_init(bfq_init);
11264 ++module_exit(bfq_exit);
11265 ++
11266 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
11267 ++MODULE_LICENSE("GPL");
11268 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
11269 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
11270 +new file mode 100644
11271 +index 0000000..03f8061
11272 +--- /dev/null
11273 ++++ b/block/bfq-sched.c
11274 +@@ -0,0 +1,1072 @@
11275 ++/*
11276 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
11277 ++ *
11278 ++ * Based on ideas and code from CFQ:
11279 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11280 ++ *
11281 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11282 ++ * Paolo Valente <paolo.valente@×××××××.it>
11283 ++ *
11284 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11285 ++ */
11286 ++
11287 ++#ifdef CONFIG_CGROUP_BFQIO
11288 ++#define for_each_entity(entity) \
11289 ++ for (; entity != NULL; entity = entity->parent)
11290 ++
11291 ++#define for_each_entity_safe(entity, parent) \
11292 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
11293 ++
11294 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11295 ++ int extract,
11296 ++ struct bfq_data *bfqd);
11297 ++
11298 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
11299 ++{
11300 ++ struct bfq_entity *bfqg_entity;
11301 ++ struct bfq_group *bfqg;
11302 ++ struct bfq_sched_data *group_sd;
11303 ++
11304 ++ BUG_ON(next_active == NULL);
11305 ++
11306 ++ group_sd = next_active->sched_data;
11307 ++
11308 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
11309 ++ /*
11310 ++ * bfq_group's my_entity field is not NULL only if the group
11311 ++ * is not the root group. We must not touch the root entity
11312 ++ * as it must never become an active entity.
11313 ++ */
11314 ++ bfqg_entity = bfqg->my_entity;
11315 ++ if (bfqg_entity != NULL)
11316 ++ bfqg_entity->budget = next_active->budget;
11317 ++}
11318 ++
11319 ++static int bfq_update_next_active(struct bfq_sched_data *sd)
11320 ++{
11321 ++ struct bfq_entity *next_active;
11322 ++
11323 ++ if (sd->active_entity != NULL)
11324 ++ /* will update/requeue at the end of service */
11325 ++ return 0;
11326 ++
11327 ++ /*
11328 ++ * NOTE: this can be improved in many ways, such as returning
11329 ++ * 1 (and thus propagating upwards the update) only when the
11330 ++ * budget changes, or caching the bfqq that will be scheduled
11331 ++ * next from this subtree. For now we worry more about
11332 ++ * correctness than about performance...
11333 ++ */
11334 ++ next_active = bfq_lookup_next_entity(sd, 0, NULL);
11335 ++ sd->next_active = next_active;
11336 ++
11337 ++ if (next_active != NULL)
11338 ++ bfq_update_budget(next_active);
11339 ++
11340 ++ return 1;
11341 ++}
11342 ++
11343 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
11344 ++ struct bfq_entity *entity)
11345 ++{
11346 ++ BUG_ON(sd->next_active != entity);
11347 ++}
11348 ++#else
11349 ++#define for_each_entity(entity) \
11350 ++ for (; entity != NULL; entity = NULL)
11351 ++
11352 ++#define for_each_entity_safe(entity, parent) \
11353 ++ for (parent = NULL; entity != NULL; entity = parent)
11354 ++
11355 ++static inline int bfq_update_next_active(struct bfq_sched_data *sd)
11356 ++{
11357 ++ return 0;
11358 ++}
11359 ++
11360 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
11361 ++ struct bfq_entity *entity)
11362 ++{
11363 ++}
11364 ++
11365 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
11366 ++{
11367 ++}
11368 ++#endif
11369 ++
11370 ++/*
11371 ++ * Shift for timestamp calculations. This actually limits the maximum
11372 ++ * service allowed in one timestamp delta (small shift values increase it),
11373 ++ * the maximum total weight that can be used for the queues in the system
11374 ++ * (big shift values increase it), and the period of virtual time wraparounds.
11375 ++ */
11376 ++#define WFQ_SERVICE_SHIFT 22
11377 ++
11378 ++/**
11379 ++ * bfq_gt - compare two timestamps.
11380 ++ * @a: first ts.
11381 ++ * @b: second ts.
11382 ++ *
11383 ++ * Return @a > @b, dealing with wrapping correctly.
11384 ++ */
11385 ++static inline int bfq_gt(u64 a, u64 b)
11386 ++{
11387 ++ return (s64)(a - b) > 0;
11388 ++}
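++/*
++ * Example: right after a 64-bit wraparound, a timestamp a = 2 still compares
++ * as later than b = (u64)-4, since a - b wraps to 6 and (s64)6 > 0, whereas
++ * a plain a > b comparison would wrongly report a as older.
++ */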
11389 ++
11390 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
11391 ++{
11392 ++ struct bfq_queue *bfqq = NULL;
11393 ++
11394 ++ BUG_ON(entity == NULL);
11395 ++
11396 ++ if (entity->my_sched_data == NULL)
11397 ++ bfqq = container_of(entity, struct bfq_queue, entity);
11398 ++
11399 ++ return bfqq;
11400 ++}
11401 ++
11402 ++
11403 ++/**
11404 ++ * bfq_delta - map service into the virtual time domain.
11405 ++ * @service: amount of service.
11406 ++ * @weight: scale factor (weight of an entity or weight sum).
11407 ++ */
11408 ++static inline u64 bfq_delta(unsigned long service,
11409 ++ unsigned long weight)
11410 ++{
11411 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
11412 ++
11413 ++ do_div(d, weight);
11414 ++ return d;
11415 ++}
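++/*
++ * Numeric example: with WFQ_SERVICE_SHIFT == 22, bfq_delta(100, 4) is
++ * (100 << 22) / 4 = 104857600, while doubling the weight to 8 halves the
++ * result to 52428800: for equal service, an entity with a larger weight
++ * accumulates a smaller virtual-time increment.
++ */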
11416 ++
11417 ++/**
11418 ++ * bfq_calc_finish - assign the finish time to an entity.
11419 ++ * @entity: the entity to act upon.
11420 ++ * @service: the service to be charged to the entity.
11421 ++ */
11422 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
11423 ++ unsigned long service)
11424 ++{
11425 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11426 ++
11427 ++ BUG_ON(entity->weight == 0);
11428 ++
11429 ++ entity->finish = entity->start +
11430 ++ bfq_delta(service, entity->weight);
11431 ++
11432 ++ if (bfqq != NULL) {
11433 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
11434 ++ "calc_finish: serv %lu, w %d",
11435 ++ service, entity->weight);
11436 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
11437 ++ "calc_finish: start %llu, finish %llu, delta %llu",
11438 ++ entity->start, entity->finish,
11439 ++ bfq_delta(service, entity->weight));
11440 ++ }
11441 ++}
11442 ++
11443 ++/**
11444 ++ * bfq_entity_of - get an entity from a node.
11445 ++ * @node: the node field of the entity.
11446 ++ *
11447 ++ * Convert a node pointer to the relative entity. This is used only
11448 ++ * to simplify the logic of some functions and not as the generic
11449 ++ * conversion mechanism because, e.g., in the tree walking functions,
11450 ++ * the check for a %NULL value would be redundant.
11451 ++ */
11452 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
11453 ++{
11454 ++ struct bfq_entity *entity = NULL;
11455 ++
11456 ++ if (node != NULL)
11457 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
11458 ++
11459 ++ return entity;
11460 ++}
11461 ++
11462 ++/**
11463 ++ * bfq_extract - remove an entity from a tree.
11464 ++ * @root: the tree root.
11465 ++ * @entity: the entity to remove.
11466 ++ */
11467 ++static inline void bfq_extract(struct rb_root *root,
11468 ++ struct bfq_entity *entity)
11469 ++{
11470 ++ BUG_ON(entity->tree != root);
11471 ++
11472 ++ entity->tree = NULL;
11473 ++ rb_erase(&entity->rb_node, root);
11474 ++}
11475 ++
11476 ++/**
11477 ++ * bfq_idle_extract - extract an entity from the idle tree.
11478 ++ * @st: the service tree of the owning @entity.
11479 ++ * @entity: the entity being removed.
11480 ++ */
11481 ++static void bfq_idle_extract(struct bfq_service_tree *st,
11482 ++ struct bfq_entity *entity)
11483 ++{
11484 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11485 ++ struct rb_node *next;
11486 ++
11487 ++ BUG_ON(entity->tree != &st->idle);
11488 ++
11489 ++ if (entity == st->first_idle) {
11490 ++ next = rb_next(&entity->rb_node);
11491 ++ st->first_idle = bfq_entity_of(next);
11492 ++ }
11493 ++
11494 ++ if (entity == st->last_idle) {
11495 ++ next = rb_prev(&entity->rb_node);
11496 ++ st->last_idle = bfq_entity_of(next);
11497 ++ }
11498 ++
11499 ++ bfq_extract(&st->idle, entity);
11500 ++
11501 ++ if (bfqq != NULL)
11502 ++ list_del(&bfqq->bfqq_list);
11503 ++}
11504 ++
11505 ++/**
11506 ++ * bfq_insert - generic tree insertion.
11507 ++ * @root: tree root.
11508 ++ * @entity: entity to insert.
11509 ++ *
11510 ++ * This is used for the idle and the active tree, since they are both
11511 ++ * ordered by finish time.
11512 ++ */
11513 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
11514 ++{
11515 ++ struct bfq_entity *entry;
11516 ++ struct rb_node **node = &root->rb_node;
11517 ++ struct rb_node *parent = NULL;
11518 ++
11519 ++ BUG_ON(entity->tree != NULL);
11520 ++
11521 ++ while (*node != NULL) {
11522 ++ parent = *node;
11523 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
11524 ++
11525 ++ if (bfq_gt(entry->finish, entity->finish))
11526 ++ node = &parent->rb_left;
11527 ++ else
11528 ++ node = &parent->rb_right;
11529 ++ }
11530 ++
11531 ++ rb_link_node(&entity->rb_node, parent, node);
11532 ++ rb_insert_color(&entity->rb_node, root);
11533 ++
11534 ++ entity->tree = root;
11535 ++}
11536 ++
11537 ++/**
11538 ++ * bfq_update_min - update the min_start field of an entity.
11539 ++ * @entity: the entity to update.
11540 ++ * @node: one of its children.
11541 ++ *
11542 ++ * This function is called when @entity may store an invalid value for
11543 ++ * min_start due to updates to the active tree. The function assumes
11544 ++ * that the subtree rooted at @node (which may be its left or its right
11545 ++ * child) has a valid min_start value.
11546 ++ */
11547 ++static inline void bfq_update_min(struct bfq_entity *entity,
11548 ++ struct rb_node *node)
11549 ++{
11550 ++ struct bfq_entity *child;
11551 ++
11552 ++ if (node != NULL) {
11553 ++ child = rb_entry(node, struct bfq_entity, rb_node);
11554 ++ if (bfq_gt(entity->min_start, child->min_start))
11555 ++ entity->min_start = child->min_start;
11556 ++ }
11557 ++}
11558 ++
11559 ++/**
11560 ++ * bfq_update_active_node - recalculate min_start.
11561 ++ * @node: the node to update.
11562 ++ *
11563 ++ * @node may have changed position or one of its children may have moved,
11564 ++ * this function updates its min_start value. The left and right subtrees
11565 ++ * are assumed to hold a correct min_start value.
11566 ++ */
11567 ++static inline void bfq_update_active_node(struct rb_node *node)
11568 ++{
11569 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
11570 ++
11571 ++ entity->min_start = entity->start;
11572 ++ bfq_update_min(entity, node->rb_right);
11573 ++ bfq_update_min(entity, node->rb_left);
11574 ++}
11575 ++
11576 ++/**
11577 ++ * bfq_update_active_tree - update min_start for the whole active tree.
11578 ++ * @node: the starting node.
11579 ++ *
11580 ++ * @node must be the deepest modified node after an update. This function
11581 ++ * updates its min_start using the values held by its children, assuming
11582 ++ * that they did not change, and then updates all the nodes that may have
11583 ++ * changed in the path to the root. The only nodes that may have changed
11584 ++ * are the ones in the path or their siblings.
11585 ++ */
11586 ++static void bfq_update_active_tree(struct rb_node *node)
11587 ++{
11588 ++ struct rb_node *parent;
11589 ++
11590 ++up:
11591 ++ bfq_update_active_node(node);
11592 ++
11593 ++ parent = rb_parent(node);
11594 ++ if (parent == NULL)
11595 ++ return;
11596 ++
11597 ++ if (node == parent->rb_left && parent->rb_right != NULL)
11598 ++ bfq_update_active_node(parent->rb_right);
11599 ++ else if (parent->rb_left != NULL)
11600 ++ bfq_update_active_node(parent->rb_left);
11601 ++
11602 ++ node = parent;
11603 ++ goto up;
11604 ++}
11605 ++
11606 ++/**
11607 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
11608 ++ * @st: the service tree of the entity.
11609 ++ * @entity: the entity being inserted.
11610 ++ *
11611 ++ * The active tree is ordered by finish time, but an extra key is kept
11612 ++ * for each node, containing the minimum value for the start times of
11613 ++ * its children (and the node itself), so it's possible to search for
11614 ++ * the eligible node with the lowest finish time in logarithmic time.
11615 ++ */
11616 ++static void bfq_active_insert(struct bfq_service_tree *st,
11617 ++ struct bfq_entity *entity)
11618 ++{
11619 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11620 ++ struct rb_node *node = &entity->rb_node;
11621 ++
11622 ++ bfq_insert(&st->active, entity);
11623 ++
11624 ++ if (node->rb_left != NULL)
11625 ++ node = node->rb_left;
11626 ++ else if (node->rb_right != NULL)
11627 ++ node = node->rb_right;
11628 ++
11629 ++ bfq_update_active_tree(node);
11630 ++
11631 ++ if (bfqq != NULL)
11632 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
11633 ++}
11634 ++
11635 ++/**
11636 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
11637 ++ * @ioprio: the ioprio value to convert.
11638 ++ */
11639 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
11640 ++{
11641 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
11642 ++ return IOPRIO_BE_NR - ioprio;
11643 ++}
11644 ++
11645 ++/**
11646 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
11647 ++ * @weight: the weight value to convert.
11648 ++ *
11649 ++ * To preserve as much as possible the old only-ioprio user interface,
11650 ++ * 0 is used as an escape ioprio value for weights (numerically) equal to
11651 ++ * or larger than IOPRIO_BE_NR.
11652 ++ */
11653 ++static unsigned short bfq_weight_to_ioprio(int weight)
11654 ++{
11655 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
11656 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
11657 ++}
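++/*
++ * With IOPRIO_BE_NR == 8, the two helpers above map best-effort ioprio
++ * values 0..7 to weights 8..1 and back; any weight of 8 or more (up to
++ * BFQ_MAX_WEIGHT, e.g. one set through the cgroup interface) maps back to
++ * the escape ioprio 0.
++ */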
11658 ++
11659 ++static inline void bfq_get_entity(struct bfq_entity *entity)
11660 ++{
11661 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11662 ++ struct bfq_sched_data *sd;
11663 ++
11664 ++ if (bfqq != NULL) {
11665 ++ sd = entity->sched_data;
11666 ++ atomic_inc(&bfqq->ref);
11667 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
11668 ++ bfqq, atomic_read(&bfqq->ref));
11669 ++ }
11670 ++}
11671 ++
11672 ++/**
11673 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
11674 ++ * @node: the node being removed.
11675 ++ *
11676 ++ * Do the first step of an extraction in an rb tree, looking for the
11677 ++ * node that will replace @node, and returning the deepest node that
11678 ++ * the following modifications to the tree can touch. If @node is the
11679 ++ * last node in the tree return %NULL.
11680 ++ */
11681 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
11682 ++{
11683 ++ struct rb_node *deepest;
11684 ++
11685 ++ if (node->rb_right == NULL && node->rb_left == NULL)
11686 ++ deepest = rb_parent(node);
11687 ++ else if (node->rb_right == NULL)
11688 ++ deepest = node->rb_left;
11689 ++ else if (node->rb_left == NULL)
11690 ++ deepest = node->rb_right;
11691 ++ else {
11692 ++ deepest = rb_next(node);
11693 ++ if (deepest->rb_right != NULL)
11694 ++ deepest = deepest->rb_right;
11695 ++ else if (rb_parent(deepest) != node)
11696 ++ deepest = rb_parent(deepest);
11697 ++ }
11698 ++
11699 ++ return deepest;
11700 ++}
11701 ++
11702 ++/**
11703 ++ * bfq_active_extract - remove an entity from the active tree.
11704 ++ * @st: the service_tree containing the tree.
11705 ++ * @entity: the entity being removed.
11706 ++ */
11707 ++static void bfq_active_extract(struct bfq_service_tree *st,
11708 ++ struct bfq_entity *entity)
11709 ++{
11710 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11711 ++ struct rb_node *node;
11712 ++
11713 ++ node = bfq_find_deepest(&entity->rb_node);
11714 ++ bfq_extract(&st->active, entity);
11715 ++
11716 ++ if (node != NULL)
11717 ++ bfq_update_active_tree(node);
11718 ++
11719 ++ if (bfqq != NULL)
11720 ++ list_del(&bfqq->bfqq_list);
11721 ++}
11722 ++
11723 ++/**
11724 ++ * bfq_idle_insert - insert an entity into the idle tree.
11725 ++ * @st: the service tree containing the tree.
11726 ++ * @entity: the entity to insert.
11727 ++ */
11728 ++static void bfq_idle_insert(struct bfq_service_tree *st,
11729 ++ struct bfq_entity *entity)
11730 ++{
11731 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11732 ++ struct bfq_entity *first_idle = st->first_idle;
11733 ++ struct bfq_entity *last_idle = st->last_idle;
11734 ++
11735 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
11736 ++ st->first_idle = entity;
11737 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
11738 ++ st->last_idle = entity;
11739 ++
11740 ++ bfq_insert(&st->idle, entity);
11741 ++
11742 ++ if (bfqq != NULL)
11743 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
11744 ++}
11745 ++
11746 ++/**
11747 ++ * bfq_forget_entity - remove an entity from the wfq trees.
11748 ++ * @st: the service tree.
11749 ++ * @entity: the entity being removed.
11750 ++ *
11751 ++ * Update the device status and forget everything about @entity, putting
11752 ++ * the device reference to it, if it is a queue. Entities belonging to
11753 ++ * groups are not refcounted.
11754 ++ */
11755 ++static void bfq_forget_entity(struct bfq_service_tree *st,
11756 ++ struct bfq_entity *entity)
11757 ++{
11758 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11759 ++ struct bfq_sched_data *sd;
11760 ++
11761 ++ BUG_ON(!entity->on_st);
11762 ++
11763 ++ entity->on_st = 0;
11764 ++ st->wsum -= entity->weight;
11765 ++ if (bfqq != NULL) {
11766 ++ sd = entity->sched_data;
11767 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
11768 ++ bfqq, atomic_read(&bfqq->ref));
11769 ++ bfq_put_queue(bfqq);
11770 ++ }
11771 ++}
11772 ++
11773 ++/**
11774 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
11775 ++ * @st: service tree for the entity.
11776 ++ * @entity: the entity being released.
11777 ++ */
11778 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
11779 ++ struct bfq_entity *entity)
11780 ++{
11781 ++ bfq_idle_extract(st, entity);
11782 ++ bfq_forget_entity(st, entity);
11783 ++}
11784 ++
11785 ++/**
11786 ++ * bfq_forget_idle - update the idle tree if necessary.
11787 ++ * @st: the service tree to act upon.
11788 ++ *
11789 ++ * To preserve the global O(log N) complexity we only remove one entry here;
11790 ++ * as the idle tree will not grow indefinitely this can be done safely.
11791 ++ */
11792 ++static void bfq_forget_idle(struct bfq_service_tree *st)
11793 ++{
11794 ++ struct bfq_entity *first_idle = st->first_idle;
11795 ++ struct bfq_entity *last_idle = st->last_idle;
11796 ++
11797 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
11798 ++ !bfq_gt(last_idle->finish, st->vtime)) {
11799 ++ /*
11800 ++ * Forget the whole idle tree, increasing the vtime past
11801 ++ * the last finish time of idle entities.
11802 ++ */
11803 ++ st->vtime = last_idle->finish;
11804 ++ }
11805 ++
11806 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
11807 ++ bfq_put_idle_entity(st, first_idle);
11808 ++}
11809 ++
11810 ++static struct bfq_service_tree *
11811 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
11812 ++ struct bfq_entity *entity)
11813 ++{
11814 ++ struct bfq_service_tree *new_st = old_st;
11815 ++
11816 ++ if (entity->ioprio_changed) {
11817 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11818 ++
11819 ++ BUG_ON(old_st->wsum < entity->weight);
11820 ++ old_st->wsum -= entity->weight;
11821 ++
11822 ++ if (entity->new_weight != entity->orig_weight) {
11823 ++ entity->orig_weight = entity->new_weight;
11824 ++ entity->ioprio =
11825 ++ bfq_weight_to_ioprio(entity->orig_weight);
11826 ++ } else if (entity->new_ioprio != entity->ioprio) {
11827 ++ entity->ioprio = entity->new_ioprio;
11828 ++ entity->orig_weight =
11829 ++ bfq_ioprio_to_weight(entity->ioprio);
11830 ++ } else
11831 ++ entity->new_weight = entity->orig_weight =
11832 ++ bfq_ioprio_to_weight(entity->ioprio);
11833 ++
11834 ++ entity->ioprio_class = entity->new_ioprio_class;
11835 ++ entity->ioprio_changed = 0;
11836 ++
11837 ++ /*
11838 ++ * NOTE: here we may be changing the weight too early,
11839 ++ * this will cause unfairness. The correct approach
11840 ++ * would have required additional complexity to defer
11841 ++ * weight changes to the proper time instants (i.e.,
11842 ++ * when entity->finish <= old_st->vtime).
11843 ++ */
11844 ++ new_st = bfq_entity_service_tree(entity);
11845 ++ entity->weight = entity->orig_weight *
11846 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
11847 ++ new_st->wsum += entity->weight;
11848 ++
11849 ++ if (new_st != old_st)
11850 ++ entity->start = new_st->vtime;
11851 ++ }
11852 ++
11853 ++ return new_st;
11854 ++}
11855 ++
11856 ++/**
11857 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
11858 ++ * @bfqq: the queue being served.
11859 ++ * @served: bytes to transfer.
11860 ++ *
11861 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
11862 ++ * are synchronized every time a new bfqq is selected for service. For now,
11863 ++ * we keep it to better check consistency.
11864 ++ */
11865 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
11866 ++{
11867 ++ struct bfq_entity *entity = &bfqq->entity;
11868 ++ struct bfq_service_tree *st;
11869 ++
11870 ++ for_each_entity(entity) {
11871 ++ st = bfq_entity_service_tree(entity);
11872 ++
11873 ++ entity->service += served;
11874 ++ BUG_ON(entity->service > entity->budget);
11875 ++ BUG_ON(st->wsum == 0);
11876 ++
11877 ++ st->vtime += bfq_delta(served, st->wsum);
11878 ++ bfq_forget_idle(st);
11879 ++ }
11880 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
11881 ++}
11882 ++
11883 ++/**
11884 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
11885 ++ * @bfqq: the queue that needs a service update.
11886 ++ *
11887 ++ * When it's not possible to be fair in the service domain, because
11888 ++ * a queue is not consuming its budget fast enough (the meaning of
11889 ++ * fast depends on the timeout parameter), we charge it a full
11890 ++ * budget. In this way we should obtain a sort of time-domain
11891 ++ * fairness among all the seeky/slow queues.
11892 ++ */
11893 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
11894 ++{
11895 ++ struct bfq_entity *entity = &bfqq->entity;
11896 ++
11897 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
11898 ++
11899 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
11900 ++}
11901 ++
11902 ++/**
11903 ++ * __bfq_activate_entity - activate an entity.
11904 ++ * @entity: the entity being activated.
11905 ++ *
11906 ++ * Called whenever an entity is activated, i.e., it is not active and one
11907 ++ * of its children receives a new request, or has to be reactivated due to
11908 ++ * budget exhaustion. It uses the current budget of the entity (and the
11909 ++ * service received if @entity is active) to calculate its
11910 ++ * timestamps.
11911 ++ */
11912 ++static void __bfq_activate_entity(struct bfq_entity *entity)
11913 ++{
11914 ++ struct bfq_sched_data *sd = entity->sched_data;
11915 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11916 ++
11917 ++ if (entity == sd->active_entity) {
11918 ++ BUG_ON(entity->tree != NULL);
11919 ++ /*
11920 ++ * If we are requeueing the current entity we have
11921 ++ * to take care of not charging to it service it has
11922 ++ * not received.
11923 ++ */
11924 ++ bfq_calc_finish(entity, entity->service);
11925 ++ entity->start = entity->finish;
11926 ++ sd->active_entity = NULL;
11927 ++ } else if (entity->tree == &st->active) {
11928 ++ /*
11929 ++ * Requeueing an entity due to a change of some
11930 ++ * next_active entity below it. We reuse the old
11931 ++ * start time.
11932 ++ */
11933 ++ bfq_active_extract(st, entity);
11934 ++ } else if (entity->tree == &st->idle) {
11935 ++ /*
11936 ++ * Must be on the idle tree, bfq_idle_extract() will
11937 ++ * check for that.
11938 ++ */
11939 ++ bfq_idle_extract(st, entity);
11940 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
11941 ++ st->vtime : entity->finish;
11942 ++ } else {
11943 ++ /*
11944 ++ * The finish time of the entity may be invalid, and
11945 ++ * it is in the past for sure, otherwise the queue
11946 ++ * would have been on the idle tree.
11947 ++ */
11948 ++ entity->start = st->vtime;
11949 ++ st->wsum += entity->weight;
11950 ++ bfq_get_entity(entity);
11951 ++
11952 ++ BUG_ON(entity->on_st);
11953 ++ entity->on_st = 1;
11954 ++ }
11955 ++
11956 ++ st = __bfq_entity_update_weight_prio(st, entity);
11957 ++ bfq_calc_finish(entity, entity->budget);
11958 ++ bfq_active_insert(st, entity);
11959 ++}
11960 ++
11961 ++/**
11962 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11963 ++ * @entity: the entity to activate.
11964 ++ *
11965 ++ * Activate @entity and all the entities on the path from it to the root.
11966 ++ */
11967 ++static void bfq_activate_entity(struct bfq_entity *entity)
11968 ++{
11969 ++ struct bfq_sched_data *sd;
11970 ++
11971 ++ for_each_entity(entity) {
11972 ++ __bfq_activate_entity(entity);
11973 ++
11974 ++ sd = entity->sched_data;
11975 ++ if (!bfq_update_next_active(sd))
11976 ++ /*
11977 ++ * No need to propagate the activation to the
11978 ++ * upper entities, as they will be updated when
11979 ++ * the active entity is rescheduled.
11980 ++ */
11981 ++ break;
11982 ++ }
11983 ++}
11984 ++
11985 ++/**
11986 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11987 ++ * @entity: the entity to deactivate.
11988 ++ * @requeue: if false, the entity will not be put into the idle tree.
11989 ++ *
11990 ++ * Deactivate an entity, independently from its previous state. If the
11991 ++ * entity was not on a service tree just return, otherwise if it is on
11992 ++ * any scheduler tree, extract it from that tree, and if necessary
11993 ++ * and if the caller specified @requeue, put it on the idle tree.
11994 ++ *
11995 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
11996 ++ * if the entity was under service or if it was the next_active for
11997 ++ * its sched_data; return %0 otherwise.
11998 ++ */
11999 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
12000 ++{
12001 ++ struct bfq_sched_data *sd = entity->sched_data;
12002 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
12003 ++ int was_active = entity == sd->active_entity;
12004 ++ int ret = 0;
12005 ++
12006 ++ if (!entity->on_st)
12007 ++ return 0;
12008 ++
12009 ++ BUG_ON(was_active && entity->tree != NULL);
12010 ++
12011 ++ if (was_active) {
12012 ++ bfq_calc_finish(entity, entity->service);
12013 ++ sd->active_entity = NULL;
12014 ++ } else if (entity->tree == &st->active)
12015 ++ bfq_active_extract(st, entity);
12016 ++ else if (entity->tree == &st->idle)
12017 ++ bfq_idle_extract(st, entity);
12018 ++ else if (entity->tree != NULL)
12019 ++ BUG();
12020 ++
12021 ++ if (was_active || sd->next_active == entity)
12022 ++ ret = bfq_update_next_active(sd);
12023 ++
12024 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
12025 ++ bfq_forget_entity(st, entity);
12026 ++ else
12027 ++ bfq_idle_insert(st, entity);
12028 ++
12029 ++ BUG_ON(sd->active_entity == entity);
12030 ++ BUG_ON(sd->next_active == entity);
12031 ++
12032 ++ return ret;
12033 ++}
12034 ++
12035 ++/**
12036 ++ * bfq_deactivate_entity - deactivate an entity.
12037 ++ * @entity: the entity to deactivate.
12038 ++ * @requeue: true if the entity can be put on the idle tree
12039 ++ */
12040 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
12041 ++{
12042 ++ struct bfq_sched_data *sd;
12043 ++ struct bfq_entity *parent;
12044 ++
12045 ++ for_each_entity_safe(entity, parent) {
12046 ++ sd = entity->sched_data;
12047 ++
12048 ++ if (!__bfq_deactivate_entity(entity, requeue))
12049 ++ /*
12050 ++ * The parent entity is still backlogged, and
12051 ++ * we don't need to update it as it is still
12052 ++ * under service.
12053 ++ */
12054 ++ break;
12055 ++
12056 ++ if (sd->next_active != NULL)
12057 ++ /*
12058 ++ * The parent entity is still backlogged and
12059 ++ * the budgets on the path towards the root
12060 ++ * need to be updated.
12061 ++ */
12062 ++ goto update;
12063 ++
12064 ++ /*
12065 ++ * If we reach this point, the parent is no longer backlogged and
12066 ++ * we want to propagate the dequeue upwards.
12067 ++ */
12068 ++ requeue = 1;
12069 ++ }
12070 ++
12071 ++ return;
12072 ++
12073 ++update:
12074 ++ entity = parent;
12075 ++ for_each_entity(entity) {
12076 ++ __bfq_activate_entity(entity);
12077 ++
12078 ++ sd = entity->sched_data;
12079 ++ if (!bfq_update_next_active(sd))
12080 ++ break;
12081 ++ }
12082 ++}
12083 ++
12084 ++/**
12085 ++ * bfq_update_vtime - update vtime if necessary.
12086 ++ * @st: the service tree to act upon.
12087 ++ *
12088 ++ * If necessary update the service tree vtime to have at least one
12089 ++ * eligible entity, skipping to its start time. Assumes that the
12090 ++ * active tree of the device is not empty.
12091 ++ *
12092 ++ * NOTE: this hierarchical implementation updates vtimes quite often,
12093 ++ * we may end up with reactivated tasks getting timestamps after a
12094 ++ * vtime skip done because we needed a ->first_active entity on some
12095 ++ * intermediate node.
12096 ++ */
12097 ++static void bfq_update_vtime(struct bfq_service_tree *st)
12098 ++{
12099 ++ struct bfq_entity *entry;
12100 ++ struct rb_node *node = st->active.rb_node;
12101 ++
12102 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
12103 ++ if (bfq_gt(entry->min_start, st->vtime)) {
12104 ++ st->vtime = entry->min_start;
12105 ++ bfq_forget_idle(st);
12106 ++ }
12107 ++}
12108 ++
12109 ++/**
12110 ++ * bfq_first_active_entity - find the eligible entity with the smallest finish time
12111 ++ * @st: the service tree to select from.
12112 ++ *
12113 ++ * This function searches for the first schedulable entity, starting from the
12114 ++ * root of the tree and going on the left every time on this side there is
12115 ++ * a subtree with at least one eligible (start >= vtime) entity. The path
12116 ++ * on the right is followed only if a) the left subtree contains no eligible
12117 ++ * entities and b) no eligible entity has been found yet.
12118 ++ */
12119 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
12120 ++{
12121 ++ struct bfq_entity *entry, *first = NULL;
12122 ++ struct rb_node *node = st->active.rb_node;
12123 ++
12124 ++ while (node != NULL) {
12125 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
12126 ++left:
12127 ++ if (!bfq_gt(entry->start, st->vtime))
12128 ++ first = entry;
12129 ++
12130 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
12131 ++
12132 ++ if (node->rb_left != NULL) {
12133 ++ entry = rb_entry(node->rb_left,
12134 ++ struct bfq_entity, rb_node);
12135 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
12136 ++ node = node->rb_left;
12137 ++ goto left;
12138 ++ }
12139 ++ }
12140 ++ if (first != NULL)
12141 ++ break;
12142 ++ node = node->rb_right;
12143 ++ }
12144 ++
12145 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
12146 ++ return first;
12147 ++}
12148 ++
12149 ++/**
12150 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
12151 ++ * @st: the service tree.
12152 ++ *
12153 ++ * Update the virtual time in @st and return the first eligible entity
12154 ++ * it contains.
12155 ++ */
12156 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
12157 ++ bool force)
12158 ++{
12159 ++ struct bfq_entity *entity, *new_next_active = NULL;
12160 ++
12161 ++ if (RB_EMPTY_ROOT(&st->active))
12162 ++ return NULL;
12163 ++
12164 ++ bfq_update_vtime(st);
12165 ++ entity = bfq_first_active_entity(st);
12166 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
12167 ++
12168 ++ /*
12169 ++ * If the chosen entity does not match with the sched_data's
12170 ++ * next_active and we are forcedly serving the IDLE priority
12171 ++ * class tree, bubble up budget update.
12172 ++ */
12173 ++ if (unlikely(force && entity != entity->sched_data->next_active)) {
12174 ++ new_next_active = entity;
12175 ++ for_each_entity(new_next_active)
12176 ++ bfq_update_budget(new_next_active);
12177 ++ }
12178 ++
12179 ++ return entity;
12180 ++}
12181 ++
12182 ++/**
12183 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
12184 ++ * @sd: the sched_data.
12185 ++ * @extract: if true the returned entity will be also extracted from @sd.
12186 ++ *
12187 ++ * NOTE: since we cache the next_active entity at each level of the
12188 ++ * hierarchy, the complexity of the lookup can be decreased with
12189 ++ * absolutely no effort just returning the cached next_active value;
12190 ++ * we prefer to do full lookups to test the consistency of the data
12191 ++ * structures.
12192 ++ */
12193 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
12194 ++ int extract,
12195 ++ struct bfq_data *bfqd)
12196 ++{
12197 ++ struct bfq_service_tree *st = sd->service_tree;
12198 ++ struct bfq_entity *entity;
12199 ++ int i = 0;
12200 ++
12201 ++ BUG_ON(sd->active_entity != NULL);
12202 ++
12203 ++ if (bfqd != NULL &&
12204 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
12205 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true);
12206 ++ if (entity != NULL) {
12207 ++ i = BFQ_IOPRIO_CLASSES - 1;
12208 ++ bfqd->bfq_class_idle_last_service = jiffies;
12209 ++ sd->next_active = entity;
12210 ++ }
12211 ++ }
12212 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
12213 ++ entity = __bfq_lookup_next_entity(st + i, false);
12214 ++ if (entity != NULL) {
12215 ++ if (extract) {
12216 ++ bfq_check_next_active(sd, entity);
12217 ++ bfq_active_extract(st + i, entity);
12218 ++ sd->active_entity = entity;
12219 ++ sd->next_active = NULL;
12220 ++ }
12221 ++ break;
12222 ++ }
12223 ++ }
12224 ++
12225 ++ return entity;
12226 ++}
12227 ++
12228 ++/*
12229 ++ * Get next queue for service.
12230 ++ */
12231 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
12232 ++{
12233 ++ struct bfq_entity *entity = NULL;
12234 ++ struct bfq_sched_data *sd;
12235 ++ struct bfq_queue *bfqq;
12236 ++
12237 ++ BUG_ON(bfqd->active_queue != NULL);
12238 ++
12239 ++ if (bfqd->busy_queues == 0)
12240 ++ return NULL;
12241 ++
12242 ++ sd = &bfqd->root_group->sched_data;
12243 ++ for (; sd != NULL; sd = entity->my_sched_data) {
12244 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
12245 ++ BUG_ON(entity == NULL);
12246 ++ entity->service = 0;
12247 ++ }
12248 ++
12249 ++ bfqq = bfq_entity_to_bfqq(entity);
12250 ++ BUG_ON(bfqq == NULL);
12251 ++
12252 ++ return bfqq;
12253 ++}
12254 ++
12255 ++/*
12256 ++ * Forced extraction of the given queue.
12257 ++ */
12258 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
12259 ++ struct bfq_queue *bfqq)
12260 ++{
12261 ++ struct bfq_entity *entity;
12262 ++ struct bfq_sched_data *sd;
12263 ++
12264 ++ BUG_ON(bfqd->active_queue != NULL);
12265 ++
12266 ++ entity = &bfqq->entity;
12267 ++ /*
12268 ++ * Bubble up extraction/update from the leaf to the root.
12269 ++ */
12270 ++ for_each_entity(entity) {
12271 ++ sd = entity->sched_data;
12272 ++ bfq_update_budget(entity);
12273 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
12274 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
12275 ++ sd->active_entity = entity;
12276 ++ sd->next_active = NULL;
12277 ++ entity->service = 0;
12278 ++ }
12279 ++
12280 ++ return;
12281 ++}
12282 ++
12283 ++static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
12284 ++{
12285 ++ if (bfqd->active_bic != NULL) {
12286 ++ put_io_context(bfqd->active_bic->icq.ioc);
12287 ++ bfqd->active_bic = NULL;
12288 ++ }
12289 ++
12290 ++ bfqd->active_queue = NULL;
12291 ++ del_timer(&bfqd->idle_slice_timer);
12292 ++}
12293 ++
12294 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12295 ++ int requeue)
12296 ++{
12297 ++ struct bfq_entity *entity = &bfqq->entity;
12298 ++
12299 ++ if (bfqq == bfqd->active_queue)
12300 ++ __bfq_bfqd_reset_active(bfqd);
12301 ++
12302 ++ bfq_deactivate_entity(entity, requeue);
12303 ++}
12304 ++
12305 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
12306 ++{
12307 ++ struct bfq_entity *entity = &bfqq->entity;
12308 ++
12309 ++ bfq_activate_entity(entity);
12310 ++}
12311 ++
12312 ++/*
12313 ++ * Called when the bfqq no longer has requests pending, remove it from
12314 ++ * the service tree.
12315 ++ */
12316 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12317 ++ int requeue)
12318 ++{
12319 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
12320 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
12321 ++
12322 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
12323 ++
12324 ++ bfq_clear_bfqq_busy(bfqq);
12325 ++
12326 ++ BUG_ON(bfqd->busy_queues == 0);
12327 ++ bfqd->busy_queues--;
12328 ++
12329 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
12330 ++}
12331 ++
12332 ++/*
12333 ++ * Called when an inactive queue receives a new request.
12334 ++ */
12335 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
12336 ++{
12337 ++ BUG_ON(bfq_bfqq_busy(bfqq));
12338 ++ BUG_ON(bfqq == bfqd->active_queue);
12339 ++
12340 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
12341 ++
12342 ++ bfq_activate_bfqq(bfqd, bfqq);
12343 ++
12344 ++ bfq_mark_bfqq_busy(bfqq);
12345 ++ bfqd->busy_queues++;
12346 ++}
12347 +diff --git a/block/bfq.h b/block/bfq.h
12348 +new file mode 100644
12349 +index 0000000..48ecde9
12350 +--- /dev/null
12351 ++++ b/block/bfq.h
12352 +@@ -0,0 +1,603 @@
12353 ++/*
12354 ++ * BFQ-v6r2 for 3.10.0: data structures and common functions prototypes.
12355 ++ *
12356 ++ * Based on ideas and code from CFQ:
12357 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
12358 ++ *
12359 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
12360 ++ * Paolo Valente <paolo.valente@×××××××.it>
12361 ++ *
12362 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
12363 ++ */
12364 ++
12365 ++#ifndef _BFQ_H
12366 ++#define _BFQ_H
12367 ++
12368 ++#include <linux/blktrace_api.h>
12369 ++#include <linux/hrtimer.h>
12370 ++#include <linux/ioprio.h>
12371 ++#include <linux/rbtree.h>
12372 ++
12373 ++#define BFQ_IOPRIO_CLASSES 3
12374 ++#define BFQ_CL_IDLE_TIMEOUT HZ/5
12375 ++
12376 ++#define BFQ_MIN_WEIGHT 1
12377 ++#define BFQ_MAX_WEIGHT 1000
12378 ++
12379 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
12380 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
12381 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
12382 ++
12383 ++struct bfq_entity;
12384 ++
12385 ++/**
12386 ++ * struct bfq_service_tree - per ioprio_class service tree.
12387 ++ * @active: tree for active entities (i.e., those backlogged).
12388 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
12389 ++ * @first_idle: idle entity with minimum F_i.
12390 ++ * @last_idle: idle entity with maximum F_i.
12391 ++ * @vtime: scheduler virtual time.
12392 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
12393 ++ *
12394 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
12395 ++ * ioprio_class has its own independent scheduler, and so its own
12396 ++ * bfq_service_tree. All the fields are protected by the queue lock
12397 ++ * of the containing bfqd.
12398 ++ */
12399 ++struct bfq_service_tree {
12400 ++ struct rb_root active;
12401 ++ struct rb_root idle;
12402 ++
12403 ++ struct bfq_entity *first_idle;
12404 ++ struct bfq_entity *last_idle;
12405 ++
12406 ++ u64 vtime;
12407 ++ unsigned long wsum;
12408 ++};
12409 ++
12410 ++/**
12411 ++ * struct bfq_sched_data - multi-class scheduler.
12412 ++ * @active_entity: entity under service.
12413 ++ * @next_active: head-of-the-line entity in the scheduler.
12414 ++ * @service_tree: array of service trees, one per ioprio_class.
12415 ++ *
12416 ++ * bfq_sched_data is the basic scheduler queue. It supports three
12417 ++ * ioprio_classes, and can be used either as a toplevel queue or as
12418 ++ * an intermediate queue in a hierarchical setup.
12419 ++ * @next_active points to the active entity of the sched_data service
12420 ++ * trees that will be scheduled next.
12421 ++ *
12422 ++ * The supported ioprio_classes are the same as in CFQ, in descending
12423 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
12424 ++ * Requests from higher priority queues are served before all the
12425 ++ * requests from lower priority queues; among requests of the same
12426 ++ * queue requests are served according to B-WF2Q+.
12427 ++ * All the fields are protected by the queue lock of the containing bfqd.
12428 ++ */
12429 ++struct bfq_sched_data {
12430 ++ struct bfq_entity *active_entity;
12431 ++ struct bfq_entity *next_active;
12432 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
12433 ++};
12434 ++
12435 ++/**
12436 ++ * struct bfq_entity - schedulable entity.
12437 ++ * @rb_node: service_tree member.
12438 ++ * @on_st: flag, true if the entity is on a tree (either the active or
12439 ++ * the idle one of its service_tree).
12440 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
12441 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
12442 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
12443 ++ * @min_start: minimum start time of the (active) subtree rooted at
12444 ++ * this entity; used for O(log N) lookups into active trees.
12445 ++ * @service: service received during the last round of service.
12446 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
12447 ++ * @weight: weight of the queue
12448 ++ * @parent: parent entity, for hierarchical scheduling.
12449 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
12450 ++ * associated scheduler queue, %NULL on leaf nodes.
12451 ++ * @sched_data: the scheduler queue this entity belongs to.
12452 ++ * @ioprio: the ioprio in use.
12453 ++ * @new_weight: when a weight change is requested, the new weight value.
12454 ++ * @orig_weight: original weight, used to implement weight boosting
12455 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
12456 ++ * @ioprio_class: the ioprio_class in use.
12457 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
12458 ++ * ioprio_class value.
12459 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
12460 ++ * ioprio_class change.
12461 ++ *
12462 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
12463 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
12464 ++ * entity belongs to the sched_data of the parent group in the cgroup
12465 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
12466 ++ * in @my_sched_data.
12467 ++ *
12468 ++ * Each entity stores independently its priority values; this would
12469 ++ * allow different weights on different devices, but this
12470 ++ * functionality is not yet exported to userspace. Priorities and
12471 ++ * weights are updated lazily, first storing the new values into the
12472 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
12473 ++ * there is a transition in the entity state that allows the priority
12474 ++ * update to take place the effective and the requested priority
12475 ++ * values are synchronized.
12476 ++ *
12477 ++ * Unless cgroups are used, the weight value is calculated from the
12478 ++ * ioprio to export the same interface as CFQ. When dealing with
12479 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
12480 ++ * time to consume their budget and have true sequential behavior, and
12481 ++ * when there are no external factors breaking anticipation) the
12482 ++ * relative weights at each level of the cgroups hierarchy should be
12483 ++ * guaranteed. All the fields are protected by the queue lock of the
12484 ++ * containing bfqd.
12485 ++ */
12486 ++struct bfq_entity {
12487 ++ struct rb_node rb_node;
12488 ++
12489 ++ int on_st;
12490 ++
12491 ++ u64 finish;
12492 ++ u64 start;
12493 ++
12494 ++ struct rb_root *tree;
12495 ++
12496 ++ u64 min_start;
12497 ++
12498 ++ unsigned long service, budget;
12499 ++ unsigned short weight, new_weight;
12500 ++ unsigned short orig_weight;
12501 ++
12502 ++ struct bfq_entity *parent;
12503 ++
12504 ++ struct bfq_sched_data *my_sched_data;
12505 ++ struct bfq_sched_data *sched_data;
12506 ++
12507 ++ unsigned short ioprio, new_ioprio;
12508 ++ unsigned short ioprio_class, new_ioprio_class;
12509 ++
12510 ++ int ioprio_changed;
12511 ++};
12512 ++
12513 ++struct bfq_group;
12514 ++
12515 ++/**
12516 ++ * struct bfq_queue - leaf schedulable entity.
12517 ++ * @ref: reference counter.
12518 ++ * @bfqd: parent bfq_data.
12519 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
12520 ++ * one or more other queues.
12521 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
12522 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
12523 ++ * @sort_list: sorted list of pending requests.
12524 ++ * @next_rq: if fifo isn't expired, next request to serve.
12525 ++ * @queued: nr of requests queued in @sort_list.
12526 ++ * @allocated: currently allocated requests.
12527 ++ * @meta_pending: pending metadata requests.
12528 ++ * @fifo: fifo list of requests in sort_list.
12529 ++ * @entity: entity representing this queue in the scheduler.
12530 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
12531 ++ * @budget_timeout: budget expiration (in jiffies).
12532 ++ * @dispatched: number of requests on the dispatch list or inside driver.
12533 ++ * @org_ioprio: saved ioprio during boosted periods.
12534 ++ * @flags: status flags.
12535 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
12536 ++ * @seek_samples: number of seeks sampled
12537 ++ * @seek_total: sum of the distances of the seeks sampled
12538 ++ * @seek_mean: mean seek distance
12539 ++ * @last_request_pos: position of the last request enqueued
12540 ++ * @pid: pid of the process owning the queue, used for logging purposes.
12541 ++ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
12542 ++ * @raising_cur_max_time: current max raising time for this queue
12543 ++ *
12544 ++ * A bfq_queue is a leaf request queue; it can be associated with an io_context
12545 ++ * or more (if it is an async one). @cgroup holds a reference to the
12546 ++ * cgroup, to be sure that it does not disappear while a bfqq still
12547 ++ * references it (mostly to avoid races between request issuing and task
12548 ++ * migration followed by cgroup destruction).
12549 ++ * All the fields are protected by the queue lock of the containing bfqd.
12550 ++ */
12551 ++struct bfq_queue {
12552 ++ atomic_t ref;
12553 ++ struct bfq_data *bfqd;
12554 ++
12555 ++ /* fields for cooperating queues handling */
12556 ++ struct bfq_queue *new_bfqq;
12557 ++ struct rb_node pos_node;
12558 ++ struct rb_root *pos_root;
12559 ++
12560 ++ struct rb_root sort_list;
12561 ++ struct request *next_rq;
12562 ++ int queued[2];
12563 ++ int allocated[2];
12564 ++ int meta_pending;
12565 ++ struct list_head fifo;
12566 ++
12567 ++ struct bfq_entity entity;
12568 ++
12569 ++ unsigned long max_budget;
12570 ++ unsigned long budget_timeout;
12571 ++
12572 ++ int dispatched;
12573 ++
12574 ++ unsigned short org_ioprio;
12575 ++
12576 ++ unsigned int flags;
12577 ++
12578 ++ struct list_head bfqq_list;
12579 ++
12580 ++ unsigned int seek_samples;
12581 ++ u64 seek_total;
12582 ++ sector_t seek_mean;
12583 ++ sector_t last_request_pos;
12584 ++
12585 ++ pid_t pid;
12586 ++
12587 ++ /* weight-raising fields */
12588 ++ unsigned int raising_cur_max_time;
12589 ++ u64 last_rais_start_finish, soft_rt_next_start;
12590 ++ unsigned int raising_coeff;
12591 ++};
12592 ++
12593 ++/**
12594 ++ * struct bfq_ttime - per process thinktime stats.
12595 ++ * @ttime_total: total process thinktime
12596 ++ * @ttime_samples: number of thinktime samples
12597 ++ * @ttime_mean: average process thinktime
12598 ++ */
12599 ++struct bfq_ttime {
12600 ++ unsigned long last_end_request;
12601 ++
12602 ++ unsigned long ttime_total;
12603 ++ unsigned long ttime_samples;
12604 ++ unsigned long ttime_mean;
12605 ++};
12606 ++
12607 ++/**
12608 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
12609 ++ * @icq: associated io_cq structure
12610 ++ * @bfqq: array of two process queues, the sync and the async
12611 ++ * @ttime: associated @bfq_ttime struct
12612 ++ */
12613 ++struct bfq_io_cq {
12614 ++ struct io_cq icq; /* must be the first member */
12615 ++ struct bfq_queue *bfqq[2];
12616 ++ struct bfq_ttime ttime;
12617 ++ int ioprio;
12618 ++};
12619 ++
12620 ++/**
12621 ++ * struct bfq_data - per device data structure.
12622 ++ * @queue: request queue for the managed device.
12623 ++ * @root_group: root bfq_group for the device.
12624 ++ * @rq_pos_tree: rbtree sorted by next_request position,
12625 ++ * used when determining if two or more queues
12626 ++ * have interleaving requests (see bfq_close_cooperator).
12627 ++ * @busy_queues: number of bfq_queues containing requests (including the
12628 ++ * queue under service, even if it is idling).
12629 ++ * @queued: number of queued requests.
12630 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
12631 ++ * @sync_flight: number of sync requests in the driver.
12632 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
12633 ++ * completed requests.
12634 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
12635 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
12636 ++ * @budgets_assigned: number of budgets assigned.
12637 ++ * @idle_slice_timer: timer set when idling for the next sequential request
12638 ++ * from the queue under service.
12639 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
12640 ++ * @active_queue: bfq_queue under service.
12641 ++ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
12642 ++ * @last_position: on-disk position of the last served request.
12643 ++ * @last_budget_start: beginning of the last budget.
12644 ++ * @last_idling_start: beginning of the last idle slice.
12645 ++ * @peak_rate: peak transfer rate observed for a budget.
12646 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
12647 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
12648 ++ * @group_list: list of all the bfq_groups active on the device.
12649 ++ * @active_list: list of all the bfq_queues active on the device.
12650 ++ * @idle_list: list of all the bfq_queues idle on the device.
12651 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
12652 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
12653 ++ * requests are served in fifo order.
12654 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
12655 ++ * @bfq_back_max: maximum allowed backward seek.
12656 ++ * @bfq_slice_idle: maximum idling time.
12657 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
12658 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
12659 ++ * async queues.
12660 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used
12661 ++ * to prevent seeky queues from imposing long latencies on
12662 ++ * well-behaved ones (this also implies that seeky queues cannot
12663 ++ * receive guarantees in the service domain; after a timeout
12664 ++ * they are charged for the whole allocated budget, to try
12665 ++ * to preserve a behavior reasonably fair among them, but
12666 ++ * without service-domain guarantees).
12667 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
12668 ++ * queue is multiplied
12669 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
12670 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
12671 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
12672 ++ * may be reactivated for a queue (in jiffies)
12673 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
12674 ++ * after which weight-raising may be
12675 ++ * reactivated for an already busy queue
12676 ++ * (in jiffies)
12677 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
12678 ++ * sectors per second
12679 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
12680 ++ * duration of the weight raising automatically
12681 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
12682 ++ *
12683 ++ * All the fields are protected by the @queue lock.
12684 ++ */
12685 ++struct bfq_data {
12686 ++ struct request_queue *queue;
12687 ++
12688 ++ struct bfq_group *root_group;
12689 ++
12690 ++ struct rb_root rq_pos_tree;
12691 ++
12692 ++ int busy_queues;
12693 ++ int queued;
12694 ++ int rq_in_driver;
12695 ++ int sync_flight;
12696 ++
12697 ++ int max_rq_in_driver;
12698 ++ int hw_tag_samples;
12699 ++ int hw_tag;
12700 ++
12701 ++ int budgets_assigned;
12702 ++
12703 ++ struct timer_list idle_slice_timer;
12704 ++ struct work_struct unplug_work;
12705 ++
12706 ++ struct bfq_queue *active_queue;
12707 ++ struct bfq_io_cq *active_bic;
12708 ++
12709 ++ sector_t last_position;
12710 ++
12711 ++ ktime_t last_budget_start;
12712 ++ ktime_t last_idling_start;
12713 ++ int peak_rate_samples;
12714 ++ u64 peak_rate;
12715 ++ unsigned long bfq_max_budget;
12716 ++
12717 ++ struct hlist_head group_list;
12718 ++ struct list_head active_list;
12719 ++ struct list_head idle_list;
12720 ++
12721 ++ unsigned int bfq_quantum;
12722 ++ unsigned int bfq_fifo_expire[2];
12723 ++ unsigned int bfq_back_penalty;
12724 ++ unsigned int bfq_back_max;
12725 ++ unsigned int bfq_slice_idle;
12726 ++ u64 bfq_class_idle_last_service;
12727 ++
12728 ++ unsigned int bfq_user_max_budget;
12729 ++ unsigned int bfq_max_budget_async_rq;
12730 ++ unsigned int bfq_timeout[2];
12731 ++
12732 ++ bool low_latency;
12733 ++
12734 ++ /* parameters of the low_latency heuristics */
12735 ++ unsigned int bfq_raising_coeff;
12736 ++ unsigned int bfq_raising_max_time;
12737 ++ unsigned int bfq_raising_rt_max_time;
12738 ++ unsigned int bfq_raising_min_idle_time;
12739 ++ unsigned int bfq_raising_min_inter_arr_async;
12740 ++ unsigned int bfq_raising_max_softrt_rate;
12741 ++ u64 RT_prod;
12742 ++
12743 ++ struct bfq_queue oom_bfqq;
12744 ++};
12745 ++
12746 ++enum bfqq_state_flags {
12747 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
12748 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
12749 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
12750 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
12751 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
12752 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
12753 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
12754 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
12755 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
12756 ++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
12757 ++ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
12758 ++};
12759 ++
12760 ++#define BFQ_BFQQ_FNS(name) \
12761 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
12762 ++{ \
12763 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
12764 ++} \
12765 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
12766 ++{ \
12767 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
12768 ++} \
12769 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
12770 ++{ \
12771 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
12772 ++}
12773 ++
12774 ++BFQ_BFQQ_FNS(busy);
12775 ++BFQ_BFQQ_FNS(wait_request);
12776 ++BFQ_BFQQ_FNS(must_alloc);
12777 ++BFQ_BFQQ_FNS(fifo_expire);
12778 ++BFQ_BFQQ_FNS(idle_window);
12779 ++BFQ_BFQQ_FNS(prio_changed);
12780 ++BFQ_BFQQ_FNS(sync);
12781 ++BFQ_BFQQ_FNS(budget_new);
12782 ++BFQ_BFQQ_FNS(coop);
12783 ++BFQ_BFQQ_FNS(split_coop);
12784 ++BFQ_BFQQ_FNS(some_coop_idle);
12785 ++#undef BFQ_BFQQ_FNS
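/*
 * Illustrative sketch, not part of the patch: each BFQ_BFQQ_FNS(name)
 * expansion above yields three helpers that set, clear and test bit
 * BFQ_BFQQ_FLAG_<name> in bfqq->flags, so callers can write, e.g.:
 *
 *	bfq_mark_bfqq_idle_window(bfqq);
 *	if (bfq_bfqq_idle_window(bfqq))
 *		bfq_clear_bfqq_idle_window(bfqq);
 */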
12786 ++
12787 ++/* Logging facilities. */
12788 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
12789 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
12790 ++
12791 ++#define bfq_log(bfqd, fmt, args...) \
12792 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
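/*
 * Usage sketch (both calls are copied from call sites appearing later in
 * this patch): the macros forward to blk_add_trace_msg(), so the messages
 * end up in the blktrace stream of the device, e.g.:
 *
 *	bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
 *		     bfqq->entity.budget);
 *	bfq_log(bfqd, "select_queue: new queue %d returned",
 *		bfqq != NULL ? bfqq->pid : 0);
 */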
12793 ++
12794 ++/* Expiration reasons. */
12795 ++enum bfqq_expiration {
12796 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
12797 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
12798 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
12799 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
12800 ++};
12801 ++
12802 ++#ifdef CONFIG_CGROUP_BFQIO
12803 ++/**
12804 ++ * struct bfq_group - per (device, cgroup) data structure.
12805 ++ * @entity: schedulable entity to insert into the parent group sched_data.
12806 ++ * @sched_data: own sched_data, to contain child entities (they may be
12807 ++ * both bfq_queues and bfq_groups).
12808 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
12809 ++ * list of the containing cgroup's bfqio_cgroup.
12810 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
12811 ++ * of the groups active on the same device; used for cleanup.
12812 ++ * @bfqd: the bfq_data for the device this group acts upon.
12813 ++ * @async_bfqq: array of async queues for all the tasks belonging to
12814 ++ * the group, one queue per ioprio value per ioprio_class,
12815 ++ * except for the idle class that has only one queue.
12816 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
12817 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
12818 ++ * to avoid too many special cases during group creation/migration.
12819 ++ *
12820 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
12821 ++ * there is a set of bfq_groups, each one collecting the lower-level
12822 ++ * entities belonging to the group that are acting on the same device.
12823 ++ *
12824 ++ * Locking works as follows:
12825 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
12826 ++ * via RCU from its readers.
12827 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
12828 ++ * from the readers.
12829 ++ * o All the other fields are protected by the @bfqd queue lock.
12830 ++ */
12831 ++struct bfq_group {
12832 ++ struct bfq_entity entity;
12833 ++ struct bfq_sched_data sched_data;
12834 ++
12835 ++ struct hlist_node group_node;
12836 ++ struct hlist_node bfqd_node;
12837 ++
12838 ++ void *bfqd;
12839 ++
12840 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12841 ++ struct bfq_queue *async_idle_bfqq;
12842 ++
12843 ++ struct bfq_entity *my_entity;
12844 ++};
12845 ++
12846 ++/**
12847 ++ * struct bfqio_cgroup - bfq cgroup data structure.
12848 ++ * @css: subsystem state for bfq in the containing cgroup.
12849 ++ * @weight: cgroup weight.
12850 ++ * @ioprio: cgroup ioprio.
12851 ++ * @ioprio_class: cgroup ioprio_class.
12852 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
12853 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
12854 ++ *
12855 ++ * @group_data is accessed using RCU, with @lock protecting the updates,
12856 ++ * @ioprio and @ioprio_class are protected by @lock.
12857 ++ */
12858 ++struct bfqio_cgroup {
12859 ++ struct cgroup_subsys_state css;
12860 ++
12861 ++ unsigned short weight, ioprio, ioprio_class;
12862 ++
12863 ++ spinlock_t lock;
12864 ++ struct hlist_head group_data;
12865 ++};
12866 ++#else
12867 ++struct bfq_group {
12868 ++ struct bfq_sched_data sched_data;
12869 ++
12870 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12871 ++ struct bfq_queue *async_idle_bfqq;
12872 ++};
12873 ++#endif
12874 ++
12875 ++static inline struct bfq_service_tree *
12876 ++bfq_entity_service_tree(struct bfq_entity *entity)
12877 ++{
12878 ++ struct bfq_sched_data *sched_data = entity->sched_data;
12879 ++ unsigned int idx = entity->ioprio_class - 1;
12880 ++
12881 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
12882 ++ BUG_ON(sched_data == NULL);
12883 ++
12884 ++ return sched_data->service_tree + idx;
12885 ++}
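/*
 * Worked example (assuming the standard kernel ioprio classes): with
 * IOPRIO_CLASS_RT == 1, IOPRIO_CLASS_BE == 2 and IOPRIO_CLASS_IDLE == 3,
 * the "ioprio_class - 1" above selects service_tree[0], service_tree[1]
 * and service_tree[2] respectively, which is why sched_data carries
 * BFQ_IOPRIO_CLASSES service trees.
 */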
12886 ++
12887 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
12888 ++ int is_sync)
12889 ++{
12890 ++ return bic->bfqq[!!is_sync];
12891 ++}
12892 ++
12893 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
12894 ++ struct bfq_queue *bfqq, int is_sync)
12895 ++{
12896 ++ bic->bfqq[!!is_sync] = bfqq;
12897 ++}
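/*
 * Layout note (an interpretation of bfq_io_cq, not text from the patch):
 * bic->bfqq[0] holds the async queue and bic->bfqq[1] the sync one; the
 * "!!is_sync" in the two helpers above collapses any non-zero value to 1,
 * so the array is always indexed with 0 or 1.
 */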
12898 ++
12899 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
12900 ++{
12901 ++ return bic->icq.q->elevator->elevator_data;
12902 ++}
12903 ++
12904 ++/**
12905 ++ * bfq_get_bfqd_locked - get a lock on a bfqd using an RCU-protected pointer.
12906 ++ * @ptr: a pointer to a bfqd.
12907 ++ * @flags: storage for the flags to be saved.
12908 ++ *
12909 ++ * This function allows bfqg->bfqd to be protected by the
12910 ++ * queue lock of the bfqd they reference; the pointer is dereferenced
12911 ++ * under RCU, so the storage for bfqd is assured to be safe as long
12912 ++ * as the RCU read side critical section does not end. After the
12913 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12914 ++ * sure that no other writer accessed it. If we raced with a writer,
12915 ++ * the function returns NULL, with the queue unlocked, otherwise it
12916 ++ * returns the dereferenced pointer, with the queue locked.
12917 ++ */
12918 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12919 ++ unsigned long *flags)
12920 ++{
12921 ++ struct bfq_data *bfqd;
12922 ++
12923 ++ rcu_read_lock();
12924 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12925 ++
12926 ++ if (bfqd != NULL) {
12927 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12928 ++ if (*ptr == bfqd)
12929 ++ goto out;
12930 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12931 ++ }
12932 ++
12933 ++ bfqd = NULL;
12934 ++out:
12935 ++ rcu_read_unlock();
12936 ++ return bfqd;
12937 ++}
12938 ++
12939 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12940 ++ unsigned long *flags)
12941 ++{
12942 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12943 ++}
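/*
 * Minimal usage sketch (illustrative only; the cgroup code later in this
 * commit follows the same pattern with a different RCU-protected pointer):
 *
 *	unsigned long flags;
 *	struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
 *
 *	if (bfqd != NULL) {
 *		... critical section under bfqd->queue->queue_lock ...
 *		bfq_put_bfqd_unlock(bfqd, &flags);
 *	}
 */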
12944 ++
12945 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12946 ++static void bfq_put_queue(struct bfq_queue *bfqq);
12947 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12948 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12949 ++ struct bfq_group *bfqg, int is_sync,
12950 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
12951 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12952 ++ struct bfq_group *bfqg);
12953 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12954 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12955 ++#endif
12956 +--
12957 +1.8.1.4
12958 +
12959
12960 Added: genpatches-2.6/trunk/3.12/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
12961 ===================================================================
12962 --- genpatches-2.6/trunk/3.12/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 (rev 0)
12963 +++ genpatches-2.6/trunk/3.12/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2013-11-04 10:09:31 UTC (rev 2565)
12964 @@ -0,0 +1,1049 @@
12965 +From 9acaa783ecab69925d38c6aca7252ff565a093d0 Mon Sep 17 00:00:00 2001
12966 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
12967 +Date: Fri, 14 Jun 2013 13:46:47 +0200
12968 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for
12969 + 3.11.0
12970 +
12971 +A set of processes may happen to perform interleaved reads, i.e., requests
12972 +whose union would give rise to a sequential read pattern. There are two
12973 +typical cases: in the first case, processes read fixed-size chunks of
12974 +data at a fixed distance from each other, while in the second case processes
12975 +may read variable-size chunks at variable distances. The latter case occurs
12976 +for example with KVM, which splits the I/O generated by the guest into
12977 +multiple chunks, and lets these chunks be served by a pool of cooperating
12978 +processes, iteratively assigning the next chunk of I/O to the first
12979 +available process. CFQ uses actual queue merging for the first type of
12980 +processes, whereas it uses preemption to get a sequential read pattern out
12981 +of the read requests performed by the second type of processes. In the end
12982 +it uses two different mechanisms to achieve the same goal: boosting the
12983 +throughput with interleaved I/O.
12984 +
12985 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
12986 +sequential read pattern with both types of processes. The main idea is
12987 +checking newly arrived requests against the next request of the active queue
12988 +both in case of actual request insert and in case of request merge. By doing
12989 +so, both the types of processes can be handled by just merging their queues.
12990 +EQM is then simpler and more compact than the pair of mechanisms used in
12991 +CFQ.
12992 +
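As a rough sketch of the proximity test EQM relies on (the names below are
illustrative; the helpers actually introduced by this patch are
bfq_dist_from(), bfq_rq_close_to_sector() and bfq_setup_cooperator()):

    /* A newly arrived I/O is a merge candidate if it lands within
     * BFQQ_SEEK_THR sectors of the next sector the in-service queue
     * is going to access. */
    static bool eqm_close_enough(sector_t new_pos, sector_t next_pos)
    {
        sector_t dist = new_pos >= next_pos ? new_pos - next_pos
                                            : next_pos - new_pos;

        return dist <= BFQQ_SEEK_THR;
    }
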
12993 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
12994 +properly restoring the weight-raising state of a queue when it gets back to
12995 +a non-merged state.
12996 +
12997 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
12998 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
12999 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
13000 +---
13001 + block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++----------------
13002 + block/bfq-sched.c | 28 ---
13003 + block/bfq.h | 16 ++
13004 + 3 files changed, 466 insertions(+), 231 deletions(-)
13005 +
13006 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
13007 +index 0ed2746..bbe79fb 100644
13008 +--- a/block/bfq-iosched.c
13009 ++++ b/block/bfq-iosched.c
13010 +@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
13011 + return dur;
13012 + }
13013 +
13014 ++static inline void
13015 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
13016 ++{
13017 ++ if (bic->saved_idle_window)
13018 ++ bfq_mark_bfqq_idle_window(bfqq);
13019 ++ else
13020 ++ bfq_clear_bfqq_idle_window(bfqq);
13021 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
13022 ++ /*
13023 ++ * Start a weight raising period with the duration given by
13024 ++ * the raising_time_left snapshot.
13025 ++ */
13026 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
13027 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
13028 ++ bfqq->last_rais_start_finish = jiffies;
13029 ++ }
13030 ++ /*
13031 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
13032 ++ * getting confused about the queue's need of a weight-raising
13033 ++ * period.
13034 ++ */
13035 ++ bic->raising_time_left = 0;
13036 ++}
13037 ++
13038 ++/*
13039 ++ * Must be called with the queue_lock held.
13040 ++ */
13041 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
13042 ++{
13043 ++ int process_refs, io_refs;
13044 ++
13045 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13046 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13047 ++ BUG_ON(process_refs < 0);
13048 ++ return process_refs;
13049 ++}
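/*
 * Explanatory note (an interpretation, not text from the patch): each
 * allocated request pins one reference and being on a service tree
 * presumably accounts for one more, so the remainder approximates the
 * number of processes (bics) still attached to the queue.
 */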
13050 ++
13051 + static void bfq_add_rq_rb(struct request *rq)
13052 + {
13053 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
13054 +@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq)
13055 + if (! bfqd->low_latency)
13056 + goto add_bfqq_busy;
13057 +
13058 ++ if (bfq_bfqq_just_split(bfqq))
13059 ++ goto set_ioprio_changed;
13060 ++
13061 + /*
13062 +- * If the queue is not being boosted and has been idle
13063 +- * for enough time, start a weight-raising period
13064 ++ * If the queue:
13065 ++ * - is not being boosted,
13066 ++ * - has been idle for enough time,
13067 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
13068 ++ * shared "for its nature" or it is not shared and its
13069 ++ * requests have not been redirected to a shared queue)
13070 ++ * start a weight-raising period.
13071 + */
13072 +- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
13073 ++ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
13074 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
13075 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
13076 + if (idle_for_long_time)
13077 + bfqq->raising_cur_max_time =
13078 +@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq)
13079 + raising_cur_max_time));
13080 + }
13081 + }
13082 ++set_ioprio_changed:
13083 + if (old_raising_coeff != bfqq->raising_coeff)
13084 + entity->ioprio_changed = 1;
13085 + add_bfqq_busy:
13086 +@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
13087 + spin_unlock_irq(bfqd->queue->queue_lock);
13088 + }
13089 +
13090 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13091 +- struct bio *bio)
13092 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
13093 + {
13094 +- struct bfq_data *bfqd = q->elevator->elevator_data;
13095 +- struct bfq_io_cq *bic;
13096 +- struct bfq_queue *bfqq;
13097 +-
13098 +- /*
13099 +- * Disallow merge of a sync bio into an async request.
13100 +- */
13101 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13102 +- return 0;
13103 +-
13104 +- /*
13105 +- * Lookup the bfqq that this bio will be queued with. Allow
13106 +- * merge only if rq is queued there.
13107 +- * Queue lock is held here.
13108 +- */
13109 +- bic = bfq_bic_lookup(bfqd, current->io_context);
13110 +- if (bic == NULL)
13111 +- return 0;
13112 +-
13113 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13114 +- return bfqq == RQ_BFQQ(rq);
13115 +-}
13116 +-
13117 +-static void __bfq_set_active_queue(struct bfq_data *bfqd,
13118 +- struct bfq_queue *bfqq)
13119 +-{
13120 +- if (bfqq != NULL) {
13121 +- bfq_mark_bfqq_must_alloc(bfqq);
13122 +- bfq_mark_bfqq_budget_new(bfqq);
13123 +- bfq_clear_bfqq_fifo_expire(bfqq);
13124 +-
13125 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13126 +-
13127 +- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
13128 +- bfqq->entity.budget);
13129 +- }
13130 +-
13131 +- bfqd->active_queue = bfqq;
13132 +-}
13133 +-
13134 +-/*
13135 +- * Get and set a new active queue for service.
13136 +- */
13137 +-static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
13138 +- struct bfq_queue *bfqq)
13139 +-{
13140 +- if (!bfqq)
13141 +- bfqq = bfq_get_next_queue(bfqd);
13142 ++ if (request)
13143 ++ return blk_rq_pos(io_struct);
13144 + else
13145 +- bfq_get_next_queue_forced(bfqd, bfqq);
13146 +-
13147 +- __bfq_set_active_queue(bfqd, bfqq);
13148 +- return bfqq;
13149 ++ return ((struct bio *)io_struct)->bi_sector;
13150 + }
13151 +
13152 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
13153 +- struct request *rq)
13154 ++static inline sector_t bfq_dist_from(sector_t pos1,
13155 ++ sector_t pos2)
13156 + {
13157 +- if (blk_rq_pos(rq) >= bfqd->last_position)
13158 +- return blk_rq_pos(rq) - bfqd->last_position;
13159 ++ if (pos1 >= pos2)
13160 ++ return pos1 - pos2;
13161 + else
13162 +- return bfqd->last_position - blk_rq_pos(rq);
13163 ++ return pos2 - pos1;
13164 + }
13165 +
13166 +-/*
13167 +- * Return true if bfqq has no request pending and rq is close enough to
13168 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
13169 +- * bfqq->next_rq
13170 +- */
13171 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
13172 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
13173 ++ sector_t sector)
13174 + {
13175 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
13176 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
13177 ++ BFQQ_SEEK_THR;
13178 + }
13179 +
13180 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13181 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
13182 + {
13183 + struct rb_root *root = &bfqd->rq_pos_tree;
13184 + struct rb_node *parent, *node;
13185 + struct bfq_queue *__bfqq;
13186 +- sector_t sector = bfqd->last_position;
13187 +
13188 + if (RB_EMPTY_ROOT(root))
13189 + return NULL;
13190 +@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13191 + * position).
13192 + */
13193 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
13194 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13195 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13196 + return __bfqq;
13197 +
13198 + if (blk_rq_pos(__bfqq->next_rq) < sector)
13199 +@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13200 + return NULL;
13201 +
13202 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
13203 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13204 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13205 + return __bfqq;
13206 +
13207 + return NULL;
13208 +@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13209 + /*
13210 + * bfqd - obvious
13211 + * cur_bfqq - passed in so that we don't decide that the current queue
13212 +- * is closely cooperating with itself.
13213 +- *
13214 +- * We are assuming that cur_bfqq has dispatched at least one request,
13215 +- * and that bfqd->last_position reflects a position on the disk associated
13216 +- * with the I/O issued by cur_bfqq.
13217 ++ * is closely cooperating with itself
13218 ++ * sector - used as a reference point to search for a close queue
13219 + */
13220 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13221 +- struct bfq_queue *cur_bfqq)
13222 ++ struct bfq_queue *cur_bfqq,
13223 ++ sector_t sector)
13224 + {
13225 + struct bfq_queue *bfqq;
13226 +
13227 +@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13228 + * working closely on the same area of the disk. In that case,
13229 + * we can group them together and don't waste time idling.
13230 + */
13231 +- bfqq = bfqq_close(bfqd);
13232 ++ bfqq = bfqq_close(bfqd, sector);
13233 + if (bfqq == NULL || bfqq == cur_bfqq)
13234 + return NULL;
13235 +
13236 +@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13237 + return bfqq;
13238 + }
13239 +
13240 ++static struct bfq_queue *
13241 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13242 ++{
13243 ++ int process_refs, new_process_refs;
13244 ++ struct bfq_queue *__bfqq;
13245 ++
13246 ++ /*
13247 ++ * If there are no process references on the new_bfqq, then it is
13248 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13249 ++ * may have dropped their last reference (not just their last process
13250 ++ * reference).
13251 ++ */
13252 ++ if (!bfqq_process_refs(new_bfqq))
13253 ++ return NULL;
13254 ++
13255 ++ /* Avoid a circular list and skip interim queue merges. */
13256 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
13257 ++ if (__bfqq == bfqq)
13258 ++ return NULL;
13259 ++ new_bfqq = __bfqq;
13260 ++ }
13261 ++
13262 ++ process_refs = bfqq_process_refs(bfqq);
13263 ++ new_process_refs = bfqq_process_refs(new_bfqq);
13264 ++ /*
13265 ++ * If the process for the bfqq has gone away, there is no
13266 ++ * sense in merging the queues.
13267 ++ */
13268 ++ if (process_refs == 0 || new_process_refs == 0)
13269 ++ return NULL;
13270 ++
13271 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13272 ++ new_bfqq->pid);
13273 ++
13274 ++ /*
13275 ++ * Merging is just a redirection: the requests of the process owning
13276 ++ * one of the two queues are redirected to the other queue. The latter
13277 ++ * queue, in its turn, is set as shared if this is the first time that
13278 ++ * the requests of some process are redirected to it.
13279 ++ *
13280 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
13281 ++ * are in the context of the process owning bfqq, hence we have the
13282 ++ * io_cq of this process. So we can immediately configure this io_cq
13283 ++ * to redirect the requests of the process to new_bfqq.
13284 ++ *
13285 ++ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of
13286 ++ * new_bfqq is not available, because, if the active queue is shared,
13287 ++ * bfqd->active_bic may not point to the io_cq of the active queue.
13288 ++ * Redirecting the requests of the process owning bfqq to the currently
13289 ++ * active queue is in any case the best option, as we feed the active queue
13290 ++ * with new requests close to the last request served and, by doing so,
13291 ++ * hopefully increase the throughput.
13292 ++ */
13293 ++ bfqq->new_bfqq = new_bfqq;
13294 ++ atomic_add(process_refs, &new_bfqq->ref);
13295 ++ return new_bfqq;
13296 ++}
13297 ++
13298 ++/*
13299 ++ * Attempt to schedule a merge of bfqq with the currently active queue or
13300 ++ * with a close queue among the scheduled queues.
13301 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
13302 ++ * structure otherwise.
13303 ++ */
13304 ++static struct bfq_queue *
13305 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13306 ++ void *io_struct, bool request)
13307 ++{
13308 ++ struct bfq_queue *active_bfqq, *new_bfqq;
13309 ++
13310 ++ if (bfqq->new_bfqq)
13311 ++ return bfqq->new_bfqq;
13312 ++
13313 ++ if (!io_struct)
13314 ++ return NULL;
13315 ++
13316 ++ active_bfqq = bfqd->active_queue;
13317 ++
13318 ++ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic)
13319 ++ goto check_scheduled;
13320 ++
13321 ++ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq))
13322 ++ goto check_scheduled;
13323 ++
13324 ++ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq))
13325 ++ goto check_scheduled;
13326 ++
13327 ++ if (active_bfqq->entity.parent != bfqq->entity.parent)
13328 ++ goto check_scheduled;
13329 ++
13330 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
13331 ++ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq))
13332 ++ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq)))
13333 ++ return new_bfqq; /* Merge with the active queue */
13334 ++
13335 ++ /*
13336 ++ * Check whether there is a cooperator among currently scheduled
13337 ++ * queues. The only thing we need is that the bio/request is not
13338 ++ * NULL, as we need it to establish whether a cooperator exists.
13339 ++ */
13340 ++check_scheduled:
13341 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
13342 ++ bfq_io_struct_pos(io_struct, request));
13343 ++ if (new_bfqq)
13344 ++ return bfq_setup_merge(bfqq, new_bfqq);
13345 ++
13346 ++ return NULL;
13347 ++}
13348 ++
13349 ++static inline void
13350 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
13351 ++{
13352 ++ /*
13353 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
13354 ++ * have already been redirected to a shared queue; both idle window
13355 ++ * and weight raising state have already been saved. Do nothing.
13356 ++ */
13357 ++ if (bfqq->bic == NULL)
13358 ++ return;
13359 ++ if (bfqq->bic->raising_time_left)
13360 ++ /*
13361 ++ * This is the queue of a just-started process, and would
13362 ++ * deserve weight raising: we set raising_time_left to the full
13363 ++ * weight-raising duration to trigger weight-raising when and
13364 ++ * if the queue is split and the first request of the queue
13365 ++ * is enqueued.
13366 ++ */
13367 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
13368 ++ else if (bfqq->raising_coeff > 1) {
13369 ++ unsigned long wrais_duration =
13370 ++ jiffies - bfqq->last_rais_start_finish;
13371 ++ /*
13372 ++ * It may happen that a queue's weight raising period lasts
13373 ++ * longer than its raising_cur_max_time, as weight raising is
13374 ++ * handled only when a request is enqueued or dispatched (it
13375 ++ * does not use any timer). If the weight raising period is
13376 ++ * about to end, don't save it.
13377 ++ */
13378 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
13379 ++ bfqq->bic->raising_time_left = 0;
13380 ++ else
13381 ++ bfqq->bic->raising_time_left =
13382 ++ bfqq->raising_cur_max_time - wrais_duration;
13383 ++ /*
13384 ++ * The bfq_queue is becoming shared or the requests of the
13385 ++ * process owning the queue are being redirected to a shared
13386 ++ * queue. Stop the weight raising period of the queue, as in
13387 ++ * both cases it should not be owned by an interactive or soft
13388 ++ * real-time application.
13389 ++ */
13390 ++ bfq_bfqq_end_raising(bfqq);
13391 ++ } else
13392 ++ bfqq->bic->raising_time_left = 0;
13393 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
13394 ++}
13395 ++
13396 ++static inline void
13397 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
13398 ++{
13399 ++ /*
13400 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
13401 ++ * is about to begin using a shared bfq_queue.
13402 ++ */
13403 ++ if (bfqq->bic)
13404 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
13405 ++}
13406 ++
13407 ++static void
13408 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13409 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13410 ++{
13411 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13412 ++ (long unsigned)new_bfqq->pid);
13413 ++ /* Save weight raising and idle window of the merged queues */
13414 ++ bfq_bfqq_save_state(bfqq);
13415 ++ bfq_bfqq_save_state(new_bfqq);
13416 ++ /*
13417 ++ * Grab a reference to the bic, to prevent it from being destroyed
13418 ++ * before being possibly touched by a bfq_split_bfqq().
13419 ++ */
13420 ++ bfq_get_bic_reference(bfqq);
13421 ++ bfq_get_bic_reference(new_bfqq);
13422 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
13423 ++ bic_set_bfqq(bic, new_bfqq, 1);
13424 ++ bfq_mark_bfqq_coop(new_bfqq);
13425 ++ /*
13426 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
13427 ++ * new_bfqq->bic to NULL. bfqq either:
13428 ++ * - does not belong to any bic any more, and hence bfqq->bic must
13429 ++ * be set to NULL, or
13430 ++ * - is a queue whose owning bics have already been redirected to a
13431 ++ * different queue, hence the queue is destined to not belong to any
13432 ++ * bic soon and bfqq->bic is already NULL (therefore the next
13433 ++ * assignment causes no harm).
13434 ++ */
13435 ++ new_bfqq->bic = NULL;
13436 ++ bfqq->bic = NULL;
13437 ++ bfq_put_queue(bfqq);
13438 ++}
13439 ++
13440 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13441 ++ struct bio *bio)
13442 ++{
13443 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
13444 ++ struct bfq_io_cq *bic;
13445 ++ struct bfq_queue *bfqq, *new_bfqq;
13446 ++
13447 ++ /*
13448 ++ * Disallow merge of a sync bio into an async request.
13449 ++ */
13450 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13451 ++ return 0;
13452 ++
13453 ++ /*
13454 ++ * Lookup the bfqq that this bio will be queued with. Allow
13455 ++ * merge only if rq is queued there.
13456 ++ * Queue lock is held here.
13457 ++ */
13458 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
13459 ++ if (bic == NULL)
13460 ++ return 0;
13461 ++
13462 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13463 ++ /*
13464 ++ * We take advantage of this function to perform an early merge
13465 ++ * of the queues of possible cooperating processes.
13466 ++ */
13467 ++ if (bfqq != NULL &&
13468 ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) {
13469 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
13470 ++ /*
13471 ++ * If we get here, the bio will be queued in the shared queue,
13472 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
13473 ++ * rq can be merged.
13474 ++ */
13475 ++ bfqq = new_bfqq;
13476 ++ }
13477 ++
13478 ++ return bfqq == RQ_BFQQ(rq);
13479 ++}
13480 ++
13481 ++static void __bfq_set_active_queue(struct bfq_data *bfqd,
13482 ++ struct bfq_queue *bfqq)
13483 ++{
13484 ++ if (bfqq != NULL) {
13485 ++ bfq_mark_bfqq_must_alloc(bfqq);
13486 ++ bfq_mark_bfqq_budget_new(bfqq);
13487 ++ bfq_clear_bfqq_fifo_expire(bfqq);
13488 ++
13489 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13490 ++
13491 ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
13492 ++ bfqq->entity.budget);
13493 ++ }
13494 ++
13495 ++ bfqd->active_queue = bfqq;
13496 ++}
13497 ++
13498 ++/*
13499 ++ * Get and set a new active queue for service.
13500 ++ */
13501 ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
13502 ++{
13503 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
13504 ++
13505 ++ __bfq_set_active_queue(bfqd, bfqq);
13506 ++ return bfqq;
13507 ++}
13508 ++
13509 + /*
13510 + * If enough samples have been computed, return the current max budget
13511 + * stored in bfqd, which is dynamically updated according to the
13512 +@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
13513 + return rq;
13514 + }
13515 +
13516 +-/*
13517 +- * Must be called with the queue_lock held.
13518 +- */
13519 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
13520 +-{
13521 +- int process_refs, io_refs;
13522 +-
13523 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13524 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13525 +- BUG_ON(process_refs < 0);
13526 +- return process_refs;
13527 +-}
13528 +-
13529 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13530 +-{
13531 +- int process_refs, new_process_refs;
13532 +- struct bfq_queue *__bfqq;
13533 +-
13534 +- /*
13535 +- * If there are no process references on the new_bfqq, then it is
13536 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13537 +- * may have dropped their last reference (not just their last process
13538 +- * reference).
13539 +- */
13540 +- if (!bfqq_process_refs(new_bfqq))
13541 +- return;
13542 +-
13543 +- /* Avoid a circular list and skip interim queue merges. */
13544 +- while ((__bfqq = new_bfqq->new_bfqq)) {
13545 +- if (__bfqq == bfqq)
13546 +- return;
13547 +- new_bfqq = __bfqq;
13548 +- }
13549 +-
13550 +- process_refs = bfqq_process_refs(bfqq);
13551 +- new_process_refs = bfqq_process_refs(new_bfqq);
13552 +- /*
13553 +- * If the process for the bfqq has gone away, there is no
13554 +- * sense in merging the queues.
13555 +- */
13556 +- if (process_refs == 0 || new_process_refs == 0)
13557 +- return;
13558 +-
13559 +- /*
13560 +- * Merge in the direction of the lesser amount of work.
13561 +- */
13562 +- if (new_process_refs >= process_refs) {
13563 +- bfqq->new_bfqq = new_bfqq;
13564 +- atomic_add(process_refs, &new_bfqq->ref);
13565 +- } else {
13566 +- new_bfqq->new_bfqq = bfqq;
13567 +- atomic_add(new_process_refs, &bfqq->ref);
13568 +- }
13569 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13570 +- new_bfqq->pid);
13571 +-}
13572 +-
13573 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
13574 + {
13575 + struct bfq_entity *entity = &bfqq->entity;
13576 +@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
13577 + * is likely to boost the disk throughput);
13578 + * - the queue is weight-raised (waiting for the request is necessary for
13579 + * providing the queue with fairness and latency guarantees).
13580 ++ *
13581 ++ * In any case, idling can be disabled for cooperation issues, if
13582 ++ * 1) there is a close cooperator for the queue, or
13583 ++ * 2) the queue is shared and some cooperator is likely to be idle (in this
13584 ++ * case, by not arming the idle timer, we try to slow down the queue, to
13585 ++ * prevent the zones of the disk accessed by the active cooperators to
13586 ++ * become too distant from the zone that will be accessed by the currently
13587 ++ * idle cooperators).
13588 + */
13589 + static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
13590 + int budg_timeout)
13591 +@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
13592 + (bfqd->rq_in_driver == 0 ||
13593 + budg_timeout ||
13594 + bfqq->raising_coeff > 1) &&
13595 +- !bfq_close_cooperator(bfqd, bfqq) &&
13596 ++ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) &&
13597 + (!bfq_bfqq_coop(bfqq) ||
13598 + !bfq_bfqq_some_coop_idle(bfqq)) &&
13599 + !bfq_queue_nonrot_noidle(bfqd, bfqq));
13600 +@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
13601 + */
13602 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13603 + {
13604 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
13605 ++ struct bfq_queue *bfqq;
13606 + struct request *next_rq;
13607 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
13608 + int budg_timeout;
13609 +@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13610 +
13611 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
13612 +
13613 +- /*
13614 +- * If another queue has a request waiting within our mean seek
13615 +- * distance, let it run. The expire code will check for close
13616 +- * cooperators and put the close queue at the front of the
13617 +- * service tree. If possible, merge the expiring queue with the
13618 +- * new bfqq.
13619 +- */
13620 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
13621 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
13622 +- bfq_setup_merge(bfqq, new_bfqq);
13623 +-
13624 + budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
13625 + if (budg_timeout &&
13626 + !bfq_bfqq_must_idle(bfqq, budg_timeout))
13627 +@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13628 + bfq_clear_bfqq_wait_request(bfqq);
13629 + del_timer(&bfqd->idle_slice_timer);
13630 + }
13631 +- if (new_bfqq == NULL)
13632 +- goto keep_queue;
13633 +- else
13634 +- goto expire;
13635 ++ goto keep_queue;
13636 + }
13637 + }
13638 +
13639 +@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13640 + * queue still has requests in flight or is idling for a new request,
13641 + * then keep it.
13642 + */
13643 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
13644 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
13645 + (bfqq->dispatched != 0 &&
13646 + (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
13647 +- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
13648 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
13649 + bfqq = NULL;
13650 + goto keep_queue;
13651 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
13652 +- /*
13653 +- * Expiring the queue because there is a close cooperator,
13654 +- * cancel timer.
13655 +- */
13656 +- bfq_clear_bfqq_wait_request(bfqq);
13657 +- del_timer(&bfqd->idle_slice_timer);
13658 + }
13659 +
13660 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
13661 + expire:
13662 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
13663 + new_queue:
13664 +- bfqq = bfq_set_active_queue(bfqd, new_bfqq);
13665 ++ bfqq = bfq_set_active_queue(bfqd);
13666 + bfq_log(bfqd, "select_queue: new queue %d returned",
13667 + bfqq != NULL ? bfqq->pid : 0);
13668 + keep_queue:
13669 +@@ -1617,9 +1807,8 @@ keep_queue:
13670 +
13671 + static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
13672 + {
13673 ++ struct bfq_entity *entity = &bfqq->entity;
13674 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
13675 +- struct bfq_entity *entity = &bfqq->entity;
13676 +-
13677 + bfq_log_bfqq(bfqd, bfqq,
13678 + "raising period dur %u/%u msec, "
13679 + "old raising coeff %u, w %d(%d)",
13680 +@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
13681 + jiffies_to_msecs(bfqq->
13682 + raising_cur_max_time));
13683 + bfq_bfqq_end_raising(bfqq);
13684 +- __bfq_entity_update_weight_prio(
13685 +- bfq_entity_service_tree(entity),
13686 +- entity);
13687 + }
13688 + }
13689 + }
13690 ++ /* Update weight both if it must be raised and if it must be lowered */
13691 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
13692 ++ __bfq_entity_update_weight_prio(
13693 ++ bfq_entity_service_tree(entity),
13694 ++ entity);
13695 + }
13696 +
13697 + /*
13698 +@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq)
13699 + struct bfq_io_cq *bic = icq_to_bic(icq);
13700 +
13701 + bic->ttime.last_end_request = jiffies;
13702 ++ /*
13703 ++ * A newly created bic indicates that the process has just
13704 ++ * started doing I/O, and is probably mapping into memory its
13705 ++ * executable and libraries: it definitely needs weight raising.
13706 ++ * There is however the possibility that the process performs,
13707 ++ * for a while, I/O close to some other process. EQM intercepts
13708 ++ * this behavior and may merge the queue corresponding to the
13709 ++ * process with some other queue, BEFORE the weight of the queue
13710 ++ * is raised. Merged queues are not weight-raised (they are assumed
13711 ++ * to belong to processes that benefit only from high throughput).
13712 ++ * If the merge is basically the consequence of an accident, then
13713 ++ * the queue will be split soon and will get back its old weight.
13714 ++ * It is then important to write down somewhere that this queue
13715 ++ * does need weight raising, even if it did not make it to get its
13716 ++ * weight raised before being merged. To this purpose, we overload
13717 ++ * the field raising_time_left and assign 1 to it, to mark the queue
13718 ++ * as needing weight raising.
13719 ++ */
13720 ++ bic->raising_time_left = 1;
13721 + }
13722 +
13723 + static void bfq_exit_icq(struct io_cq *icq)
13724 +@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq)
13725 + }
13726 +
13727 + if (bic->bfqq[BLK_RW_SYNC]) {
13728 ++ /*
13729 ++ * If the bic is using a shared queue, put the reference
13730 ++ * taken on the io_context when the bic started using a
13731 ++ * shared bfq_queue.
13732 ++ */
13733 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
13734 ++ put_io_context(icq->ioc);
13735 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
13736 + bic->bfqq[BLK_RW_SYNC] = NULL;
13737 + }
13738 +@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
13739 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
13740 + return;
13741 +
13742 ++ /* Idle window just restored, statistics are meaningless. */
13743 ++ if (bfq_bfqq_just_split(bfqq))
13744 ++ return;
13745 ++
13746 + enable_idle = bfq_bfqq_idle_window(bfqq);
13747 +
13748 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
13749 +@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13750 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
13751 + !BFQQ_SEEKY(bfqq))
13752 + bfq_update_idle_window(bfqd, bfqq, bic);
13753 ++ bfq_clear_bfqq_just_split(bfqq);
13754 +
13755 + bfq_log_bfqq(bfqd, bfqq,
13756 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
13757 +@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13758 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
13759 + {
13760 + struct bfq_data *bfqd = q->elevator->elevator_data;
13761 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
13762 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
13763 +
13764 + assert_spin_locked(bfqd->queue->queue_lock);
13765 ++
13766 ++ /*
13767 ++ * An unplug may trigger a requeue of a request from the device
13768 ++ * driver: make sure we are in process context while trying to
13769 ++ * merge two bfq_queues.
13770 ++ */
13771 ++ if (!in_interrupt() &&
13772 ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) {
13773 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
13774 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
13775 ++ /*
13776 ++ * Release the request's reference to the old bfqq
13777 ++ * and make sure one is taken to the shared queue.
13778 ++ */
13779 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
13780 ++ bfqq->allocated[rq_data_dir(rq)]--;
13781 ++ atomic_inc(&new_bfqq->ref);
13782 ++ bfq_put_queue(bfqq);
13783 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
13784 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq);
13785 ++ rq->elv.priv[1] = new_bfqq;
13786 ++ bfqq = new_bfqq;
13787 ++ }
13788 ++
13789 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
13790 +
13791 + bfq_add_rq_rb(rq);
13792 +
13793 ++ /*
13794 ++ * Here a newly-created bfq_queue has already started a weight-raising
13795 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
13796 ++ * from assigning it a full weight-raising period. See the detailed
13797 ++ * comments about this field in bfq_init_icq().
13798 ++ */
13799 ++ if (bfqq->bic != NULL)
13800 ++ bfqq->bic->raising_time_left = 0;
13801 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
13802 + list_add_tail(&rq->queuelist, &bfqq->fifo);
13803 +
13804 +@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
13805 + if (bfq_bfqq_budget_new(bfqq))
13806 + bfq_set_budget_timeout(bfqd);
13807 +
13808 +- /* Idling is disabled also for cooperation issues:
13809 +- * 1) there is a close cooperator for the queue, or
13810 +- * 2) the queue is shared and some cooperator is likely
13811 +- * to be idle (in this case, by not arming the idle timer,
13812 +- * we try to slow down the queue, to prevent the zones
13813 +- * of the disk accessed by the active cooperators to become
13814 +- * too distant from the zone that will be accessed by the
13815 +- * currently idle cooperators)
13816 +- */
13817 + if (bfq_bfqq_must_idle(bfqq, budg_timeout))
13818 + bfq_arm_slice_timer(bfqd);
13819 + else if (budg_timeout)
13820 +@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq)
13821 + }
13822 + }
13823 +
13824 +-static struct bfq_queue *
13825 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13826 +- struct bfq_queue *bfqq)
13827 +-{
13828 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13829 +- (long unsigned)bfqq->new_bfqq->pid);
13830 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
13831 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
13832 +- bfq_put_queue(bfqq);
13833 +- return bic_to_bfqq(bic, 1);
13834 +-}
13835 +-
13836 + /*
13837 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
13838 + * was the last process referring to said bfqq.
13839 +@@ -2469,6 +2702,9 @@ static struct bfq_queue *
13840 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
13841 + {
13842 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
13843 ++
13844 ++ put_io_context(bic->icq.ioc);
13845 ++
13846 + if (bfqq_process_refs(bfqq) == 1) {
13847 + bfqq->pid = current->pid;
13848 + bfq_clear_bfqq_some_coop_idle(bfqq);
13849 +@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
13850 + struct bfq_queue *bfqq;
13851 + struct bfq_group *bfqg;
13852 + unsigned long flags;
13853 ++ bool split = false;
13854 +
13855 + might_sleep_if(gfp_mask & __GFP_WAIT);
13856 +
13857 +@@ -2516,24 +2753,14 @@ new_queue:
13858 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
13859 + bic_set_bfqq(bic, bfqq, is_sync);
13860 + } else {
13861 +- /*
13862 +- * If the queue was seeky for too long, break it apart.
13863 +- */
13864 ++ /* If the queue was seeky for too long, break it apart. */
13865 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
13866 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
13867 + bfqq = bfq_split_bfqq(bic, bfqq);
13868 ++ split = true;
13869 + if (!bfqq)
13870 + goto new_queue;
13871 + }
13872 +-
13873 +- /*
13874 +- * Check to see if this queue is scheduled to merge with
13875 +- * another closely cooperating queue. The merging of queues
13876 +- * happens here as it must be done in process context.
13877 +- * The reference on new_bfqq was taken in merge_bfqqs.
13878 +- */
13879 +- if (bfqq->new_bfqq != NULL)
13880 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
13881 + }
13882 +
13883 + bfqq->allocated[rw]++;
13884 +@@ -2544,6 +2771,26 @@ new_queue:
13885 + rq->elv.priv[0] = bic;
13886 + rq->elv.priv[1] = bfqq;
13887 +
13888 ++ /*
13889 ++ * If a bfq_queue has only one process reference, it is owned
13890 ++ * by only one bfq_io_cq: we can set the bic field of the
13891 ++ * bfq_queue to the address of that structure. Also, if the
13892 ++ * queue has just been split, mark a flag so that the
13893 ++ * information is available to the other scheduler hooks.
13894 ++ */
13895 ++ if (bfqq_process_refs(bfqq) == 1) {
13896 ++ bfqq->bic = bic;
13897 ++ if (split) {
13898 ++ bfq_mark_bfqq_just_split(bfqq);
13899 ++ /*
13900 ++ * If the queue has just been split from a shared queue,
13901 ++ * restore the idle window and the possible weight
13902 ++ * raising period.
13903 ++ */
13904 ++ bfq_bfqq_resume_state(bfqq, bic);
13905 ++ }
13906 ++ }
13907 ++
13908 + spin_unlock_irqrestore(q->queue_lock, flags);
13909 +
13910 + return 0;
13911 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
13912 +index 03f8061..a0edaa2 100644
13913 +--- a/block/bfq-sched.c
13914 ++++ b/block/bfq-sched.c
13915 +@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
13916 + return bfqq;
13917 + }
13918 +
13919 +-/*
13920 +- * Forced extraction of the given queue.
13921 +- */
13922 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
13923 +- struct bfq_queue *bfqq)
13924 +-{
13925 +- struct bfq_entity *entity;
13926 +- struct bfq_sched_data *sd;
13927 +-
13928 +- BUG_ON(bfqd->active_queue != NULL);
13929 +-
13930 +- entity = &bfqq->entity;
13931 +- /*
13932 +- * Bubble up extraction/update from the leaf to the root.
13933 +- */
13934 +- for_each_entity(entity) {
13935 +- sd = entity->sched_data;
13936 +- bfq_update_budget(entity);
13937 +- bfq_update_vtime(bfq_entity_service_tree(entity));
13938 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
13939 +- sd->active_entity = entity;
13940 +- sd->next_active = NULL;
13941 +- entity->service = 0;
13942 +- }
13943 +-
13944 +- return;
13945 +-}
13946 +-
13947 + static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
13948 + {
13949 + if (bfqd->active_bic != NULL) {
13950 +diff --git a/block/bfq.h b/block/bfq.h
13951 +index 48ecde9..bb52975 100644
13952 +--- a/block/bfq.h
13953 ++++ b/block/bfq.h
13954 +@@ -188,6 +188,8 @@ struct bfq_group;
13955 + * @pid: pid of the process owning the queue, used for logging purposes.
13956 + * @last_rais_start_time: last (idle -> weight-raised) transition attempt
13957 + * @raising_cur_max_time: current max raising time for this queue
13958 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
13959 ++ * queue is shared
13960 + *
13961 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
13962 + * or more (if it is an async one). @cgroup holds a reference to the
13963 +@@ -231,6 +233,7 @@ struct bfq_queue {
13964 + sector_t last_request_pos;
13965 +
13966 + pid_t pid;
13967 ++ struct bfq_io_cq *bic;
13968 +
13969 + /* weight-raising fields */
13970 + unsigned int raising_cur_max_time;
13971 +@@ -257,12 +260,23 @@ struct bfq_ttime {
13972 + * @icq: associated io_cq structure
13973 + * @bfqq: array of two process queues, the sync and the async
13974 + * @ttime: associated @bfq_ttime struct
13975 ++ * @raising_time_left: snapshot of the time left before weight raising ends
13976 ++ * for the sync queue associated to this process; this
13977 ++ * snapshot is taken to remember this value while the weight
13978 ++ * raising is suspended because the queue is merged with a
13979 ++ * shared queue, and is used to set @raising_cur_max_time
13980 ++ * when the queue is split from the shared queue and its
13981 ++ * weight is raised again
13982 ++ * @saved_idle_window: same purpose as the previous field for the idle window
13983 + */
13984 + struct bfq_io_cq {
13985 + struct io_cq icq; /* must be the first member */
13986 + struct bfq_queue *bfqq[2];
13987 + struct bfq_ttime ttime;
13988 + int ioprio;
13989 ++
13990 ++ unsigned int raising_time_left;
13991 ++ unsigned int saved_idle_window;
13992 + };
13993 +
13994 + /**
13995 +@@ -403,6 +417,7 @@ enum bfqq_state_flags {
13996 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
13997 + 	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
13998 + BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
13999 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
14000 + };
14001 +
14002 + #define BFQ_BFQQ_FNS(name) \
14003 +@@ -430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new);
14004 + BFQ_BFQQ_FNS(coop);
14005 + BFQ_BFQQ_FNS(split_coop);
14006 + BFQ_BFQQ_FNS(some_coop_idle);
14007 ++BFQ_BFQQ_FNS(just_split);
14008 + #undef BFQ_BFQQ_FNS
14009 +
14010 + /* Logging facilities. */
14011 +--
14012 +1.8.1.4
14013 +
14014
14015 Added: genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
14016 ===================================================================
14017 --- genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch (rev 0)
14018 +++ genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch 2013-11-04 10:09:31 UTC (rev 2565)
14019 @@ -0,0 +1,362 @@
14020 +From 2e1646d06515b7dd1344db547dfcf9a4640dee8e Mon Sep 17 00:00:00 2001
14021 +From: Arianna Avanzini <avanzini.arianna@×××××.com>
14022 +Date: Wed, 11 Sep 2013 22:26:47 +0200
14023 +Subject: [PATCH] block: Switch from BFQ-v6r2 for 3.11.0 to BFQ-v6r2 for
14024 + 3.12.0-rc1
14025 +
14026 +---
14027 + block/bfq-cgroup.c | 115 +++++++++++++++++++++++++++++++----------------------
14028 + block/bfq.h | 2 +
14029 + 2 files changed, 70 insertions(+), 47 deletions(-)
14030 +
14031 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
14032 +index bb9b851..afae4ca 100644
14033 +--- a/block/bfq-cgroup.c
14034 ++++ b/block/bfq-cgroup.c
14035 +@@ -16,9 +16,9 @@
14036 +
14037 + static DEFINE_MUTEX(bfqio_mutex);
14038 +
14039 +-static bool bfqio_is_removed(struct cgroup *cgroup)
14040 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
14041 + {
14042 +- return test_bit(CGRP_DEAD, &cgroup->flags);
14043 ++ return bgrp ? !bgrp->online : false;
14044 + }
14045 +
14046 + static struct bfqio_cgroup bfqio_root_cgroup = {
14047 +@@ -38,10 +38,9 @@ static inline void bfq_init_entity(struct bfq_entity *entity,
14048 + entity->sched_data = &bfqg->sched_data;
14049 + }
14050 +
14051 +-static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
14052 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
14053 + {
14054 +- return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
14055 +- struct bfqio_cgroup, css);
14056 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
14057 + }
14058 +
14059 + /*
14060 +@@ -103,20 +102,20 @@ static inline void bfq_group_set_parent(struct bfq_group *bfqg,
14061 + /**
14062 + * bfq_group_chain_alloc - allocate a chain of groups.
14063 + * @bfqd: queue descriptor.
14064 +- * @cgroup: the leaf cgroup this chain starts from.
14065 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
14066 + *
14067 + * Allocate a chain of groups starting from the one belonging to
14068 + * @cgroup up to the root cgroup. Stop if a cgroup on the chain
14069 + * to the root already has an allocated group on @bfqd.
14070 + */
14071 + static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
14072 +- struct cgroup *cgroup)
14073 ++ struct cgroup_subsys_state *css)
14074 + {
14075 + struct bfqio_cgroup *bgrp;
14076 + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
14077 +
14078 +- for (; cgroup != NULL; cgroup = cgroup->parent) {
14079 +- bgrp = cgroup_to_bfqio(cgroup);
14080 ++ for (; css != NULL; css = css->parent) {
14081 ++ bgrp = css_to_bfqio(css);
14082 +
14083 + bfqg = bfqio_lookup_group(bgrp, bfqd);
14084 + if (bfqg != NULL) {
14085 +@@ -165,7 +164,7 @@ cleanup:
14086 + /**
14087 + * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
14088 + * @bfqd: the queue descriptor.
14089 +- * @cgroup: the leaf cgroup to start from.
14090 ++ * @css: the leaf cgroup_subsys_state to start from.
14091 + * @leaf: the leaf group (to be associated to @cgroup).
14092 + *
14093 + * Try to link a chain of groups to a cgroup hierarchy, connecting the
14094 +@@ -177,7 +176,8 @@ cleanup:
14095 + * per device) while the bfqio_cgroup lock protects the list of groups
14096 + * belonging to the same cgroup.
14097 + */
14098 +-static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14099 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
14100 ++ struct cgroup_subsys_state *css,
14101 + struct bfq_group *leaf)
14102 + {
14103 + struct bfqio_cgroup *bgrp;
14104 +@@ -186,8 +186,8 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14105 +
14106 + assert_spin_locked(bfqd->queue->queue_lock);
14107 +
14108 +- for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
14109 +- bgrp = cgroup_to_bfqio(cgroup);
14110 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
14111 ++ bgrp = css_to_bfqio(css);
14112 + next = leaf->bfqd;
14113 +
14114 + bfqg = bfqio_lookup_group(bgrp, bfqd);
14115 +@@ -205,9 +205,9 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14116 + leaf = next;
14117 + }
14118 +
14119 +- BUG_ON(cgroup == NULL && leaf != NULL);
14120 +- if (cgroup != NULL && prev != NULL) {
14121 +- bgrp = cgroup_to_bfqio(cgroup);
14122 ++ BUG_ON(css == NULL && leaf != NULL);
14123 ++ if (css != NULL && prev != NULL) {
14124 ++ bgrp = css_to_bfqio(css);
14125 + bfqg = bfqio_lookup_group(bgrp, bfqd);
14126 + bfq_group_set_parent(prev, bfqg);
14127 + }
14128 +@@ -233,18 +233,18 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14129 + * have been successful.
14130 + */
14131 + static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
14132 +- struct cgroup *cgroup)
14133 ++ struct cgroup_subsys_state *css)
14134 + {
14135 +- struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
14136 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14137 + struct bfq_group *bfqg;
14138 +
14139 + bfqg = bfqio_lookup_group(bgrp, bfqd);
14140 + if (bfqg != NULL)
14141 + return bfqg;
14142 +
14143 +- bfqg = bfq_group_chain_alloc(bfqd, cgroup);
14144 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
14145 + if (bfqg != NULL)
14146 +- bfq_group_chain_link(bfqd, cgroup, bfqg);
14147 ++ bfq_group_chain_link(bfqd, css, bfqg);
14148 + else
14149 + bfqg = bfqd->root_group;
14150 +
14151 +@@ -315,8 +315,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14152 + * time here, at the price of slightly more complex code.
14153 + */
14154 + static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14155 +- struct bfq_io_cq *bic,
14156 +- struct cgroup *cgroup)
14157 ++ struct bfq_io_cq *bic,
14158 ++ struct cgroup_subsys_state *css)
14159 + {
14160 + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
14161 + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
14162 +@@ -324,9 +324,9 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14163 + struct bfq_group *bfqg;
14164 + struct bfqio_cgroup *bgrp;
14165 +
14166 +- bgrp = cgroup_to_bfqio(cgroup);
14167 ++ bgrp = css_to_bfqio(css);
14168 +
14169 +- bfqg = bfq_find_alloc_group(bfqd, cgroup);
14170 ++ bfqg = bfq_find_alloc_group(bfqd, css);
14171 + if (async_bfqq != NULL) {
14172 + entity = &async_bfqq->entity;
14173 +
14174 +@@ -357,14 +357,14 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14175 + * moved into its new parent group.
14176 + */
14177 + static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
14178 +- struct cgroup *cgroup)
14179 ++ struct cgroup_subsys_state *css)
14180 + {
14181 + struct bfq_data *bfqd;
14182 + unsigned long uninitialized_var(flags);
14183 +
14184 + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
14185 + if (bfqd != NULL) {
14186 +- __bfq_bic_change_cgroup(bfqd, bic, cgroup);
14187 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
14188 + bfq_put_bfqd_unlock(bfqd, &flags);
14189 + }
14190 + }
14191 +@@ -394,13 +394,13 @@ static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
14192 + {
14193 + struct bfq_data *bfqd = bic_to_bfqd(bic);
14194 + struct bfq_group *bfqg;
14195 +- struct cgroup *cgroup;
14196 ++ struct cgroup_subsys_state *css;
14197 +
14198 + BUG_ON(bfqd == NULL);
14199 +
14200 + rcu_read_lock();
14201 +- cgroup = task_cgroup(current, bfqio_subsys_id);
14202 +- bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
14203 ++ css = task_css(current, bfqio_subsys_id);
14204 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
14205 + rcu_read_unlock();
14206 +
14207 + return bfqg;
14208 +@@ -622,17 +622,16 @@ static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
14209 + }
14210 +
14211 + #define SHOW_FUNCTION(__VAR) \
14212 +-static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
14213 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
14214 + struct cftype *cftype) \
14215 + { \
14216 +- struct bfqio_cgroup *bgrp; \
14217 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
14218 + u64 ret = -ENODEV; \
14219 + \
14220 + mutex_lock(&bfqio_mutex); \
14221 +- if (bfqio_is_removed(cgroup)) \
14222 ++ if (bfqio_is_removed(bgrp)) \
14223 + goto out_unlock; \
14224 + \
14225 +- bgrp = cgroup_to_bfqio(cgroup); \
14226 + spin_lock_irq(&bgrp->lock); \
14227 + ret = bgrp->__VAR; \
14228 + spin_unlock_irq(&bgrp->lock); \
14229 +@@ -648,11 +647,11 @@ SHOW_FUNCTION(ioprio_class);
14230 + #undef SHOW_FUNCTION
14231 +
14232 + #define STORE_FUNCTION(__VAR, __MIN, __MAX) \
14233 +-static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
14234 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
14235 + struct cftype *cftype, \
14236 + u64 val) \
14237 + { \
14238 +- struct bfqio_cgroup *bgrp; \
14239 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
14240 + struct bfq_group *bfqg; \
14241 + int ret = -EINVAL; \
14242 + \
14243 +@@ -661,12 +660,10 @@ static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
14244 + \
14245 + ret = -ENODEV; \
14246 + mutex_lock(&bfqio_mutex); \
14247 +- if (bfqio_is_removed(cgroup)) \
14248 ++ if (bfqio_is_removed(bgrp)) \
14249 + goto out_unlock; \
14250 + ret = 0; \
14251 + \
14252 +- bgrp = cgroup_to_bfqio(cgroup); \
14253 +- \
14254 + spin_lock_irq(&bgrp->lock); \
14255 + bgrp->__VAR = (unsigned short)val; \
14256 + hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
14257 +@@ -713,11 +710,11 @@ static struct cftype bfqio_files[] = {
14258 + { }, /* terminate */
14259 + };
14260 +
14261 +-static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
14262 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state *parent_css)
14263 + {
14264 + struct bfqio_cgroup *bgrp;
14265 +
14266 +- if (cgroup->parent != NULL) {
14267 ++ if (parent_css != NULL) {
14268 + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
14269 + if (bgrp == NULL)
14270 + return ERR_PTR(-ENOMEM);
14271 +@@ -740,13 +737,14 @@ static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
14272 + * behavior is that a group containing a task that forked using CLONE_IO
14273 + * will not be destroyed until the tasks sharing the ioc die.
14274 + */
14275 +-static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14276 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
14277 ++ struct cgroup_taskset *tset)
14278 + {
14279 + struct task_struct *task;
14280 + struct io_context *ioc;
14281 + int ret = 0;
14282 +
14283 +- cgroup_taskset_for_each(task, cgroup, tset) {
14284 ++ cgroup_taskset_for_each(task, css, tset) {
14285 + /* task_lock() is needed to avoid races with exit_io_context() */
14286 + task_lock(task);
14287 + ioc = task->io_context;
14288 +@@ -766,7 +764,8 @@ static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14289 + return ret;
14290 + }
14291 +
14292 +-static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14293 ++static void bfqio_attach(struct cgroup_subsys_state *css,
14294 ++ struct cgroup_taskset *tset)
14295 + {
14296 + struct task_struct *task;
14297 + struct io_context *ioc;
14298 +@@ -776,7 +775,7 @@ static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14299 + * IMPORTANT NOTE: The move of more than one process at a time to a
14300 + * new group has not yet been tested.
14301 + */
14302 +- cgroup_taskset_for_each(task, cgroup, tset) {
14303 ++ cgroup_taskset_for_each(task, css, tset) {
14304 + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
14305 + if (ioc) {
14306 + /*
14307 +@@ -787,16 +786,16 @@ static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14308 + if (!strncmp(icq->q->elevator->type->elevator_name,
14309 + "bfq", ELV_NAME_MAX))
14310 + bfq_bic_change_cgroup(icq_to_bic(icq),
14311 +- cgroup);
14312 ++ css);
14313 + rcu_read_unlock();
14314 + put_io_context(ioc);
14315 + }
14316 + }
14317 + }
14318 +
14319 +-static void bfqio_destroy(struct cgroup *cgroup)
14320 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
14321 + {
14322 +- struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
14323 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14324 + struct hlist_node *tmp;
14325 + struct bfq_group *bfqg;
14326 +
14327 +@@ -815,9 +814,31 @@ static void bfqio_destroy(struct cgroup *cgroup)
14328 + kfree(bgrp);
14329 + }
14330 +
14331 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
14332 ++{
14333 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14334 ++
14335 ++ mutex_lock(&bfqio_mutex);
14336 ++ bgrp->online = true;
14337 ++ mutex_unlock(&bfqio_mutex);
14338 ++
14339 ++ return 0;
14340 ++}
14341 ++
14342 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
14343 ++{
14344 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14345 ++
14346 ++ mutex_lock(&bfqio_mutex);
14347 ++ bgrp->online = false;
14348 ++ mutex_unlock(&bfqio_mutex);
14349 ++}
14350 ++
14351 + struct cgroup_subsys bfqio_subsys = {
14352 + .name = "bfqio",
14353 + .css_alloc = bfqio_create,
14354 ++ .css_online = bfqio_css_online,
14355 ++ .css_offline = bfqio_css_offline,
14356 + .can_attach = bfqio_can_attach,
14357 + .attach = bfqio_attach,
14358 + .css_free = bfqio_destroy,
14359 +diff --git a/block/bfq.h b/block/bfq.h
14360 +index bb52975..885e62c 100644
14361 +--- a/block/bfq.h
14362 ++++ b/block/bfq.h
14363 +@@ -510,6 +510,7 @@ struct bfq_group {
14364 + /**
14365 + * struct bfqio_cgroup - bfq cgroup data structure.
14366 + * @css: subsystem state for bfq in the containing cgroup.
14367 ++ * @online: flag set while the cgroup is online.
14368 + * @weight: cgroup weight.
14369 + * @ioprio: cgroup ioprio.
14370 + * @ioprio_class: cgroup ioprio_class.
14371 +@@ -521,6 +522,7 @@ struct bfq_group {
14372 + */
14373 + struct bfqio_cgroup {
14374 + struct cgroup_subsys_state css;
14375 ++ bool online;
14376 +
14377 + unsigned short weight, ioprio, ioprio_class;
14378 +
14379 +--
14380 +1.8.1.4
14381 +
14382
14383
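For readers following the conversion above: the recurring pattern in this patch is that the controller-private struct embeds the subsystem state, container_of() recovers it from a cgroup_subsys_state pointer, the hierarchy is walked through the state's parent pointer rather than through struct cgroup, and removal is tracked with an online flag instead of the CGRP_DEAD bit. The following is a minimal user-space sketch of that shape, for illustration only; "mock_css" and "mock_bfqio_cgroup" are simplified stand-ins, not the real kernel structures or API.

/*
 * Illustrative user-space sketch of the css conversion pattern in this
 * patch.  All types and names are simplified stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct mock_css {
	struct mock_css *parent;	/* walked as css->parent in the patch */
};

struct mock_bfqio_cgroup {
	struct mock_css css;		/* embedded, like cgroup_subsys_state */
	bool online;			/* set/cleared by the online/offline hooks */
	unsigned short weight;
};

/* Mirrors css_to_bfqio(): container_of() guarded against a NULL css. */
static struct mock_bfqio_cgroup *css_to_bfqio(struct mock_css *css)
{
	return css ? container_of(css, struct mock_bfqio_cgroup, css) : NULL;
}

/* Mirrors bfqio_is_removed(): the online flag replaces the CGRP_DEAD test. */
static bool bfqio_is_removed(struct mock_bfqio_cgroup *bgrp)
{
	return bgrp ? !bgrp->online : false;
}

int main(void)
{
	struct mock_bfqio_cgroup root = {
		.css = { .parent = NULL }, .online = true, .weight = 100
	};
	struct mock_bfqio_cgroup child = {
		.css = { .parent = &root.css }, .online = true, .weight = 50
	};
	struct mock_css *css;

	/* Walk the hierarchy through the state's parent pointer, as the
	 * reworked chain alloc/link loops do, instead of cgroup->parent. */
	for (css = &child.css; css != NULL; css = css->parent) {
		struct mock_bfqio_cgroup *bgrp = css_to_bfqio(css);
		printf("weight=%u removed=%d\n",
		       (unsigned)bgrp->weight, bfqio_is_removed(bgrp));
	}
	return 0;
}
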
14384 Property changes on: genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
14385 ___________________________________________________________________
14386 Added: svn:executable
14387 ## -0,0 +1 ##
14388 +*
14389 \ No newline at end of property
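
As a footnote on the accessor rework in the patch above: the SHOW_FUNCTION/STORE_FUNCTION macros now generate one read and one write handler per attribute, each taking a cgroup_subsys_state pointer and converting it once with css_to_bfqio(). The sketch below reproduces only the macro-generated getter/setter shape in user space; a pthread mutex stands in for the kernel locking, "mock_bgrp" stands in for bfqio_cgroup, and the css conversion step is omitted, so this is an illustration of the pattern rather than the kernel API.

/*
 * Illustrative sketch: macro-generated per-attribute read/write handlers
 * in the style of SHOW_FUNCTION/STORE_FUNCTION.  Simplified stand-ins only.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct mock_bgrp {
	pthread_mutex_t lock;
	unsigned short weight, ioprio, ioprio_class;
};

#define SHOW_FUNCTION(__VAR)						\
static uint64_t mock_##__VAR##_read(struct mock_bgrp *bgrp)		\
{									\
	uint64_t ret;							\
	pthread_mutex_lock(&bgrp->lock);				\
	ret = bgrp->__VAR;						\
	pthread_mutex_unlock(&bgrp->lock);				\
	return ret;							\
}

#define STORE_FUNCTION(__VAR, __MIN, __MAX)				\
static int mock_##__VAR##_write(struct mock_bgrp *bgrp, uint64_t val)	\
{									\
	if (val < (__MIN) || val > (__MAX))				\
		return -1;						\
	pthread_mutex_lock(&bgrp->lock);				\
	bgrp->__VAR = (unsigned short)val;				\
	pthread_mutex_unlock(&bgrp->lock);				\
	return 0;							\
}

SHOW_FUNCTION(weight)
SHOW_FUNCTION(ioprio)
STORE_FUNCTION(weight, 1, 1000)
STORE_FUNCTION(ioprio, 0, 7)

int main(void)
{
	struct mock_bgrp bgrp = {
		.lock = PTHREAD_MUTEX_INITIALIZER, .weight = 100, .ioprio = 4
	};

	mock_weight_write(&bgrp, 200);
	printf("weight=%llu ioprio=%llu\n",
	       (unsigned long long)mock_weight_read(&bgrp),
	       (unsigned long long)mock_ioprio_read(&bgrp));
	return 0;
}

Build with, for example, cc -pthread; the point is only the macro shape, with bounds checking in the store path and the lock held around each field access, as in the generated kernel handlers.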