Gentoo Archives: gentoo-commits

From: "Mike Pagano (mpagano)" <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2508 - genpatches-2.6/trunk/3.11
Date: Mon, 02 Sep 2013 23:11:06
Message-Id: 20130902231055.D013F2004C@flycatcher.gentoo.org
1 Author: mpagano
2 Date: 2013-09-02 23:10:55 +0000 (Mon, 02 Sep 2013)
3 New Revision: 2508
4
5 Removed:
6 genpatches-2.6/trunk/3.11/1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.10.patch
7 genpatches-2.6/trunk/3.11/1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.10.patch1
8 genpatches-2.6/trunk/3.11/1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.10.0.patch1
9 Modified:
10 genpatches-2.6/trunk/3.11/0000_README
11 Log:
12 Remove BFQ patches, waiting on updated patchset
13
14 Modified: genpatches-2.6/trunk/3.11/0000_README
15 ===================================================================
16 --- genpatches-2.6/trunk/3.11/0000_README 2013-09-02 23:07:59 UTC (rev 2507)
17 +++ genpatches-2.6/trunk/3.11/0000_README 2013-09-02 23:10:55 UTC (rev 2508)
18 @@ -47,22 +47,6 @@
19 From: https://bugs.gentoo.org/show_bug.cgi?id=449248
20 Desc: Enable mic mute led in thinkpads
21
22 -Patch: 1800_memcg-OOM-revert-ZFS-deadlock.patch
23 -From: https://bugs.gentoo.org/show_bug.cgi?id=462066
24 -Desc: Revert memcg patches that prevent OOM with too many dirty pages.
25 -
26 -Patch: 1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.10.patch
27 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
28 -Desc: BFQ v6r2 patch 1 for 3.10: Build, cgroups and kconfig bits
29 -
30 -Patch: 1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.10.patch1
31 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
32 -Desc: BFQ v6r2 patch 2 for 3.10: BFQ Scheduler
33 -
34 -Patch: 1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.10.0.patch1
35 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
36 -Desc: BFQ v6r2 patch 3 for 3.10: Early Queue Merge (EQM)
37 -
38 Patch: 2400_kcopy-patch-for-infiniband-driver.patch
39 From: Alexey Shvetsov <alexxy@g.o>
40 Desc: Zero copy for infiniband psm userspace driver
41
42 Deleted: genpatches-2.6/trunk/3.11/1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.10.patch
43 ===================================================================
44 --- genpatches-2.6/trunk/3.11/1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.10.patch 2013-09-02 23:07:59 UTC (rev 2507)
45 +++ genpatches-2.6/trunk/3.11/1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.10.patch 2013-09-02 23:10:55 UTC (rev 2508)
46 @@ -1,97 +0,0 @@
47 -From 13fa5ddac2963e304e90c5beb4bc996e3557479d Mon Sep 17 00:00:00 2001
48 -From: Matteo Bernardini <matteo.bernardini@×××××.com>
49 -Date: Thu, 9 May 2013 18:58:50 +0200
50 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.10
51 -
52 -Update Kconfig.iosched and do the related Makefile changes to include
53 -kernel configuration options for BFQ. Also add the bfqio controller
54 -to the cgroups subsystem.
55 -
56 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
57 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
58 -Signed-off-by: Matteo Bernardini <matteo.bernardini@×××××.com>
59 ----
60 - block/Kconfig.iosched | 25 +++++++++++++++++++++++++
61 - block/Makefile | 1 +
62 - include/linux/cgroup_subsys.h | 6 ++++++
63 - 3 files changed, 32 insertions(+)
64 -
65 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
66 -index 421bef9..695e064 100644
67 ---- a/block/Kconfig.iosched
68 -+++ b/block/Kconfig.iosched
69 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
70 - ---help---
71 - Enable group IO scheduling in CFQ.
72 -
73 -+config IOSCHED_BFQ
74 -+ tristate "BFQ I/O scheduler"
75 -+ default n
76 -+ ---help---
77 -+ The BFQ I/O scheduler tries to distribute bandwidth among
78 -+ all processes according to their weights.
79 -+ It aims at distributing the bandwidth as desired, independently of
80 -+ the disk parameters and with any workload. It also tries to
81 -+ guarantee low latency to interactive and soft real-time
82 -+ applications. If compiled built-in (saying Y here), BFQ can
83 -+ be configured to support hierarchical scheduling.
84 -+
85 -+config CGROUP_BFQIO
86 -+ bool "BFQ hierarchical scheduling support"
87 -+ depends on CGROUPS && IOSCHED_BFQ=y
88 -+ default n
89 -+ ---help---
90 -+ Enable hierarchical scheduling in BFQ, using the cgroups
91 -+ filesystem interface. The name of the subsystem will be
92 -+ bfqio.
93 -+
94 - choice
95 - prompt "Default I/O scheduler"
96 - default DEFAULT_CFQ
97 -@@ -52,6 +73,9 @@ choice
98 - config DEFAULT_CFQ
99 - bool "CFQ" if IOSCHED_CFQ=y
100 -
101 -+ config DEFAULT_BFQ
102 -+ bool "BFQ" if IOSCHED_BFQ=y
103 -+
104 - config DEFAULT_NOOP
105 - bool "No-op"
106 -
107 -@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED
108 - string
109 - default "deadline" if DEFAULT_DEADLINE
110 - default "cfq" if DEFAULT_CFQ
111 -+ default "bfq" if DEFAULT_BFQ
112 - default "noop" if DEFAULT_NOOP
113 -
114 - endmenu
115 -diff --git a/block/Makefile b/block/Makefile
116 -index 39b76ba..c0d20fa 100644
117 ---- a/block/Makefile
118 -+++ b/block/Makefile
119 -@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
120 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
121 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
122 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
123 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
124 -
125 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
126 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
127 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
128 -index 6e7ec64..ffa1d1f 100644
129 ---- a/include/linux/cgroup_subsys.h
130 -+++ b/include/linux/cgroup_subsys.h
131 -@@ -84,3 +84,9 @@ SUBSYS(bcache)
132 - #endif
133 -
134 - /* */
135 -+
136 -+#ifdef CONFIG_CGROUP_BFQIO
137 -+SUBSYS(bfqio)
138 -+#endif
139 -+
140 -+/* */
141 ---
142 -1.8.1.4
143 -
144
145 Deleted: genpatches-2.6/trunk/3.11/1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.10.patch1
146 ===================================================================
147 --- genpatches-2.6/trunk/3.11/1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.10.patch1 2013-09-02 23:07:59 UTC (rev 2507)
148 +++ genpatches-2.6/trunk/3.11/1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.10.patch1 2013-09-02 23:10:55 UTC (rev 2508)
149 @@ -1,5775 +0,0 @@
150 -From 2e949c3d4d8ba2af46dcedc80707ebba277d759f Mon Sep 17 00:00:00 2001
151 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
152 -Date: Thu, 9 May 2013 19:10:02 +0200
153 -Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.10
154 -
155 -Add the BFQ-v6r2 I/O scheduler to 3.10.
156 -The general structure is borrowed from CFQ, as is much of the code. A (bfq_)queue
157 -is associated to each task doing I/O on a device, and each time a
158 -scheduling decision has to be made a queue is selected and served until
159 -it expires.
160 -
161 - - Slices are given in the service domain: tasks are assigned
162 - budgets, measured in number of sectors. Once got the disk, a task
163 - must however consume its assigned budget within a configurable
164 - maximum time (by default, the maximum possible value of the
165 - budgets is automatically computed to comply with this timeout).
166 - This allows the desired latency vs "throughput boosting" tradeoff
167 - to be set.
168 -
169 - - Budgets are scheduled according to a variant of WF2Q+, implemented
170 - using an augmented rb-tree to take eligibility into account while
171 - preserving an O(log N) overall complexity.
172 -
173 - - A low-latency tunable is provided; if enabled, both interactive
174 - and soft real-time applications are guaranteed very low latency.
175 -
176 - - Latency guarantees are preserved also in presence of NCQ.
177 -
178 - - Also with flash-based devices, a high throughput is achieved while
179 - still preserving latency guarantees.
180 -
181 - - Useful features borrowed from CFQ: cooperating-queues merging (with
182 - some additional optimizations with respect to the original CFQ version),
183 - static fallback queue for OOM.
184 -
185 - - BFQ supports full hierarchical scheduling, exporting a cgroups
186 - interface. Each node has a full scheduler, so each group can
187 - be assigned its own ioprio (mapped to a weight, see next point)
188 - and an ioprio_class.
189 -
190 - - If the cgroups interface is used, weights can be explicitly
191 - assigned, otherwise ioprio values are mapped to weights using the
192 - relation weight = IOPRIO_BE_NR - ioprio (see the sketch after this list).
193 -
194 - - ioprio classes are served in strict priority order, i.e., lower
195 - priority queues are not served as long as there are higher
196 - priority queues. Among queues in the same class the bandwidth is
197 - distributed in proportion to the weight of each queue. A small
198 - amount of extra bandwidth is however guaranteed to the Idle class, to
199 - prevent it from starving.
200 -
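A minimal illustrative sketch of the ioprio-to-weight relation mentioned in the list above (not code taken from the patch itself): it assumes IOPRIO_BE_NR is 8, as in the kernel's ioprio headers, so best-effort priorities 0..7 map to weights 8..1.

    /* Sketch only: mirrors the stated relation weight = IOPRIO_BE_NR - ioprio. */
    #define IOPRIO_BE_NR 8

    static inline unsigned short ioprio_to_weight(unsigned short ioprio)
    {
        /* ioprio 0 (highest priority) -> weight 8, ioprio 7 (lowest) -> weight 1 */
        return IOPRIO_BE_NR - ioprio;
    }

A lower ioprio value therefore translates into a larger weight, and hence a larger share of the disk bandwidth.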
201 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
202 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
203 ----
204 - block/bfq-cgroup.c | 881 ++++++++++++
205 - block/bfq-ioc.c | 36 +
206 - block/bfq-iosched.c | 3070 +++++++++++++++++++++++++++++++++++++++++
207 - block/bfq-sched.c | 1072 ++++++++++++++
208 - block/bfq.h | 603 ++++++++
209 - include/linux/cgroup_subsys.h | 2 +-
210 - 6 files changed, 5663 insertions(+), 1 deletion(-)
211 - create mode 100644 block/bfq-cgroup.c
212 - create mode 100644 block/bfq-ioc.c
213 - create mode 100644 block/bfq-iosched.c
214 - create mode 100644 block/bfq-sched.c
215 - create mode 100644 block/bfq.h
216 -
217 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
218 -new file mode 100644
219 -index 0000000..6d57239
220 ---- /dev/null
221 -+++ b/block/bfq-cgroup.c
222 -@@ -0,0 +1,881 @@
223 -+/*
224 -+ * BFQ: CGROUPS support.
225 -+ *
226 -+ * Based on ideas and code from CFQ:
227 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
228 -+ *
229 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
230 -+ * Paolo Valente <paolo.valente@×××××××.it>
231 -+ *
232 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
233 -+ *
234 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
235 -+ */
236 -+
237 -+#ifdef CONFIG_CGROUP_BFQIO
238 -+
239 -+static DEFINE_MUTEX(bfqio_mutex);
240 -+
241 -+static bool bfqio_is_removed(struct cgroup *cgroup)
242 -+{
243 -+ return test_bit(CGRP_REMOVED, &cgroup->flags);
244 -+}
245 -+
246 -+static struct bfqio_cgroup bfqio_root_cgroup = {
247 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
248 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
249 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
250 -+};
251 -+
252 -+static inline void bfq_init_entity(struct bfq_entity *entity,
253 -+ struct bfq_group *bfqg)
254 -+{
255 -+ entity->weight = entity->new_weight;
256 -+ entity->orig_weight = entity->new_weight;
257 -+ entity->ioprio = entity->new_ioprio;
258 -+ entity->ioprio_class = entity->new_ioprio_class;
259 -+ entity->parent = bfqg->my_entity;
260 -+ entity->sched_data = &bfqg->sched_data;
261 -+}
262 -+
263 -+static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
264 -+{
265 -+ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
266 -+ struct bfqio_cgroup, css);
267 -+}
268 -+
269 -+/*
270 -+ * Search the hash table (for now, just a list) of bgrp for the bfq_group
271 -+ * associated with bfqd. Must be called under rcu_read_lock().
272 -+ */
273 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
274 -+ struct bfq_data *bfqd)
275 -+{
276 -+ struct bfq_group *bfqg;
277 -+ void *key;
278 -+
279 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
280 -+ key = rcu_dereference(bfqg->bfqd);
281 -+ if (key == bfqd)
282 -+ return bfqg;
283 -+ }
284 -+
285 -+ return NULL;
286 -+}
287 -+
288 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
289 -+ struct bfq_group *bfqg)
290 -+{
291 -+ struct bfq_entity *entity = &bfqg->entity;
292 -+
293 -+ /*
294 -+ * If the weight of the entity has never been set via the sysfs
295 -+ * interface, then bgrp->weight == 0. In this case we initialize
296 -+ * the weight from the current ioprio value. Otherwise, the group
297 -+ * weight, if set, has priority over the ioprio value.
298 -+ */
299 -+ if (bgrp->weight == 0) {
300 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
301 -+ entity->new_ioprio = bgrp->ioprio;
302 -+ } else {
303 -+ entity->new_weight = bgrp->weight;
304 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
305 -+ }
306 -+ entity->orig_weight = entity->weight = entity->new_weight;
307 -+ entity->ioprio = entity->new_ioprio;
308 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
309 -+ entity->my_sched_data = &bfqg->sched_data;
310 -+}
311 -+
312 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
313 -+ struct bfq_group *parent)
314 -+{
315 -+ struct bfq_entity *entity;
316 -+
317 -+ BUG_ON(parent == NULL);
318 -+ BUG_ON(bfqg == NULL);
319 -+
320 -+ entity = &bfqg->entity;
321 -+ entity->parent = parent->my_entity;
322 -+ entity->sched_data = &parent->sched_data;
323 -+}
324 -+
325 -+/**
326 -+ * bfq_group_chain_alloc - allocate a chain of groups.
327 -+ * @bfqd: queue descriptor.
328 -+ * @cgroup: the leaf cgroup this chain starts from.
329 -+ *
330 -+ * Allocate a chain of groups starting from the one belonging to
331 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
332 -+ * to the root has already an allocated group on @bfqd.
333 -+ */
334 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
335 -+ struct cgroup *cgroup)
336 -+{
337 -+ struct bfqio_cgroup *bgrp;
338 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
339 -+
340 -+ for (; cgroup != NULL; cgroup = cgroup->parent) {
341 -+ bgrp = cgroup_to_bfqio(cgroup);
342 -+
343 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
344 -+ if (bfqg != NULL) {
345 -+ /*
346 -+ * All the cgroups in the path from there to the
347 -+ * root must have a bfq_group for bfqd, so we don't
348 -+ * need any more allocations.
349 -+ */
350 -+ break;
351 -+ }
352 -+
353 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
354 -+ if (bfqg == NULL)
355 -+ goto cleanup;
356 -+
357 -+ bfq_group_init_entity(bgrp, bfqg);
358 -+ bfqg->my_entity = &bfqg->entity;
359 -+
360 -+ if (leaf == NULL) {
361 -+ leaf = bfqg;
362 -+ prev = leaf;
363 -+ } else {
364 -+ bfq_group_set_parent(prev, bfqg);
365 -+ /*
366 -+ * Build a list of allocated nodes using the bfqd
367 -+ * field, which is still unused and will be initialized
368 -+ * only after the node is connected.
369 -+ */
370 -+ prev->bfqd = bfqg;
371 -+ prev = bfqg;
372 -+ }
373 -+ }
374 -+
375 -+ return leaf;
376 -+
377 -+cleanup:
378 -+ while (leaf != NULL) {
379 -+ prev = leaf;
380 -+ leaf = leaf->bfqd;
381 -+ kfree(prev);
382 -+ }
383 -+
384 -+ return NULL;
385 -+}
386 -+
387 -+/**
388 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
389 -+ * @bfqd: the queue descriptor.
390 -+ * @cgroup: the leaf cgroup to start from.
391 -+ * @leaf: the leaf group (to be associated to @cgroup).
392 -+ *
393 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
394 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
395 -+ * hierarchy that already has a group associated to @bfqd all the nodes
396 -+ * in the path to the root cgroup have one too.
397 -+ *
398 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
399 -+ * per device) while the bfqio_cgroup lock protects the list of groups
400 -+ * belonging to the same cgroup.
401 -+ */
402 -+static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
403 -+ struct bfq_group *leaf)
404 -+{
405 -+ struct bfqio_cgroup *bgrp;
406 -+ struct bfq_group *bfqg, *next, *prev = NULL;
407 -+ unsigned long flags;
408 -+
409 -+ assert_spin_locked(bfqd->queue->queue_lock);
410 -+
411 -+ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
412 -+ bgrp = cgroup_to_bfqio(cgroup);
413 -+ next = leaf->bfqd;
414 -+
415 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
416 -+ BUG_ON(bfqg != NULL);
417 -+
418 -+ spin_lock_irqsave(&bgrp->lock, flags);
419 -+
420 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
421 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
422 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
423 -+
424 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
425 -+
426 -+ prev = leaf;
427 -+ leaf = next;
428 -+ }
429 -+
430 -+ BUG_ON(cgroup == NULL && leaf != NULL);
431 -+ if (cgroup != NULL && prev != NULL) {
432 -+ bgrp = cgroup_to_bfqio(cgroup);
433 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
434 -+ bfq_group_set_parent(prev, bfqg);
435 -+ }
436 -+}
437 -+
438 -+/**
439 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
440 -+ * @bfqd: queue descriptor.
441 -+ * @cgroup: cgroup being searched for.
442 -+ *
443 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
444 -+ * necessary. When a group is returned all the cgroups in the path
445 -+ * to the root have a group associated to @bfqd.
446 -+ *
447 -+ * If the allocation fails, return the root group: this breaks guarantees
448 -+ * but is a safe fallback. If this loss becomes a problem it can be
449 -+ * mitigated using the equivalent weight (given by the product of the
450 -+ * weights of the groups in the path from @group to the root) in the
451 -+ * root scheduler.
452 -+ *
453 -+ * We allocate all the missing nodes in the path from the leaf cgroup
454 -+ * to the root and we connect the nodes only after all the allocations
455 -+ * have been successful.
456 -+ */
457 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
458 -+ struct cgroup *cgroup)
459 -+{
460 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
461 -+ struct bfq_group *bfqg;
462 -+
463 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
464 -+ if (bfqg != NULL)
465 -+ return bfqg;
466 -+
467 -+ bfqg = bfq_group_chain_alloc(bfqd, cgroup);
468 -+ if (bfqg != NULL)
469 -+ bfq_group_chain_link(bfqd, cgroup, bfqg);
470 -+ else
471 -+ bfqg = bfqd->root_group;
472 -+
473 -+ return bfqg;
474 -+}
475 -+
476 -+/**
477 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
478 -+ * @bfqd: queue descriptor.
479 -+ * @bfqq: the queue to move.
480 -+ * @entity: @bfqq's entity.
481 -+ * @bfqg: the group to move to.
482 -+ *
483 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
484 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
485 -+ *
486 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
487 -+ * not disappear (by now this just means that we are called under
488 -+ * rcu_read_lock()).
489 -+ */
490 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
491 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
492 -+{
493 -+ int busy, resume;
494 -+
495 -+ busy = bfq_bfqq_busy(bfqq);
496 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
497 -+
498 -+ BUG_ON(resume && !entity->on_st);
499 -+ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
500 -+
501 -+ if (busy) {
502 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
503 -+
504 -+ if (!resume)
505 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
506 -+ else
507 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
508 -+ } else if (entity->on_st)
509 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
510 -+
511 -+ /*
512 -+ * Here we use a reference to bfqg. We don't need a refcounter
513 -+ * as the cgroup reference will not be dropped, so that its
514 -+ * destroy() callback will not be invoked.
515 -+ */
516 -+ entity->parent = bfqg->my_entity;
517 -+ entity->sched_data = &bfqg->sched_data;
518 -+
519 -+ if (busy && resume)
520 -+ bfq_activate_bfqq(bfqd, bfqq);
521 -+
522 -+ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver)
523 -+ bfq_schedule_dispatch(bfqd);
524 -+}
525 -+
526 -+/**
527 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
528 -+ * @bfqd: the queue descriptor.
529 -+ * @bic: the bic to move.
530 -+ * @cgroup: the cgroup to move to.
531 -+ *
532 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
533 -+ * has to make sure that the reference to cgroup is valid across the call.
534 -+ *
535 -+ * NOTE: an alternative approach might have been to store the current
536 -+ * cgroup in bfqq and getting a reference to it, reducing the lookup
537 -+ * time here, at the price of slightly more complex code.
538 -+ */
539 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
540 -+ struct bfq_io_cq *bic,
541 -+ struct cgroup *cgroup)
542 -+{
543 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
544 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
545 -+ struct bfq_entity *entity;
546 -+ struct bfq_group *bfqg;
547 -+ struct bfqio_cgroup *bgrp;
548 -+
549 -+ bgrp = cgroup_to_bfqio(cgroup);
550 -+
551 -+ bfqg = bfq_find_alloc_group(bfqd, cgroup);
552 -+ if (async_bfqq != NULL) {
553 -+ entity = &async_bfqq->entity;
554 -+
555 -+ if (entity->sched_data != &bfqg->sched_data) {
556 -+ bic_set_bfqq(bic, NULL, 0);
557 -+ bfq_log_bfqq(bfqd, async_bfqq,
558 -+ "bic_change_group: %p %d",
559 -+ async_bfqq, atomic_read(&async_bfqq->ref));
560 -+ bfq_put_queue(async_bfqq);
561 -+ }
562 -+ }
563 -+
564 -+ if (sync_bfqq != NULL) {
565 -+ entity = &sync_bfqq->entity;
566 -+ if (entity->sched_data != &bfqg->sched_data)
567 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
568 -+ }
569 -+
570 -+ return bfqg;
571 -+}
572 -+
573 -+/**
574 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
575 -+ * @bic: the bic being migrated.
576 -+ * @cgroup: the destination cgroup.
577 -+ *
578 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
579 -+ * moved into its new parent group.
580 -+ */
581 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
582 -+ struct cgroup *cgroup)
583 -+{
584 -+ struct bfq_data *bfqd;
585 -+ unsigned long uninitialized_var(flags);
586 -+
587 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
588 -+ if (bfqd != NULL) {
589 -+ __bfq_bic_change_cgroup(bfqd, bic, cgroup);
590 -+ bfq_put_bfqd_unlock(bfqd, &flags);
591 -+ }
592 -+}
593 -+
594 -+/**
595 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
596 -+ * @bic: the @bic to update.
597 -+ *
598 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
599 -+ * We need this in addition to moving bics during the cgroup attach
600 -+ * phase because the task owning @bic could be at its first disk
601 -+ * access or we may end up in the root cgroup as the result of a
602 -+ * memory allocation failure and here we try to move to the right
603 -+ * group.
604 -+ *
605 -+ * Must be called under the queue lock. It is safe to use the returned
606 -+ * value even after the rcu_read_unlock() as the migration/destruction
607 -+ * paths act under the queue lock too. IOW it is impossible to race with
608 -+ * group migration/destruction and end up with an invalid group as:
609 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
610 -+ * has started execution, as current holds a reference to it,
611 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
612 -+ * migrated to a different cgroup] its attach() callback will have
613 -+ * taken care of remove all the references to the old cgroup data.
614 -+ */
615 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
616 -+{
617 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
618 -+ struct bfq_group *bfqg;
619 -+ struct cgroup *cgroup;
620 -+
621 -+ BUG_ON(bfqd == NULL);
622 -+
623 -+ rcu_read_lock();
624 -+ cgroup = task_cgroup(current, bfqio_subsys_id);
625 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
626 -+ rcu_read_unlock();
627 -+
628 -+ return bfqg;
629 -+}
630 -+
631 -+/**
632 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
633 -+ * @st: the service tree being flushed.
634 -+ */
635 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
636 -+{
637 -+ struct bfq_entity *entity = st->first_idle;
638 -+
639 -+ for (; entity != NULL; entity = st->first_idle)
640 -+ __bfq_deactivate_entity(entity, 0);
641 -+}
642 -+
643 -+/**
644 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
645 -+ * @bfqd: the device data structure with the root group.
646 -+ * @entity: the entity to move.
647 -+ */
648 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
649 -+ struct bfq_entity *entity)
650 -+{
651 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
652 -+
653 -+ BUG_ON(bfqq == NULL);
654 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
655 -+ return;
656 -+}
657 -+
658 -+/**
659 -+ * bfq_reparent_active_entities - move to the root group all active entities.
660 -+ * @bfqd: the device data structure with the root group.
661 -+ * @bfqg: the group to move from.
662 -+ * @st: the service tree with the entities.
663 -+ *
664 -+ * Needs queue_lock to be taken and reference to be valid over the call.
665 -+ */
666 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
667 -+ struct bfq_group *bfqg,
668 -+ struct bfq_service_tree *st)
669 -+{
670 -+ struct rb_root *active = &st->active;
671 -+ struct bfq_entity *entity = NULL;
672 -+
673 -+ if (!RB_EMPTY_ROOT(&st->active))
674 -+ entity = bfq_entity_of(rb_first(active));
675 -+
676 -+ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active)))
677 -+ bfq_reparent_leaf_entity(bfqd, entity);
678 -+
679 -+ if (bfqg->sched_data.active_entity != NULL)
680 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
681 -+
682 -+ return;
683 -+}
684 -+
685 -+/**
686 -+ * bfq_destroy_group - destroy @bfqg.
687 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
688 -+ * @bfqg: the group being destroyed.
689 -+ *
690 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
691 -+ */
692 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
693 -+{
694 -+ struct bfq_data *bfqd;
695 -+ struct bfq_service_tree *st;
696 -+ struct bfq_entity *entity = bfqg->my_entity;
697 -+ unsigned long uninitialized_var(flags);
698 -+ int i;
699 -+
700 -+ hlist_del(&bfqg->group_node);
701 -+
702 -+ /*
703 -+ * Empty all service_trees belonging to this group before deactivating
704 -+ * the group itself.
705 -+ */
706 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
707 -+ st = bfqg->sched_data.service_tree + i;
708 -+
709 -+ /*
710 -+ * The idle tree may still contain bfq_queues belonging
711 -+ * to exited tasks because they never migrated to a different
712 -+ * cgroup from the one being destroyed now. No one else
713 -+ * can access them so it's safe to act without any lock.
714 -+ */
715 -+ bfq_flush_idle_tree(st);
716 -+
717 -+ /*
718 -+ * It may happen that some queues are still active
719 -+ * (busy) upon group destruction (if the corresponding
720 -+ * processes have been forced to terminate). We move
721 -+ * all the leaf entities corresponding to these queues
722 -+ * to the root_group.
723 -+ * Also, it may happen that the group has an entity
724 -+ * under service, which is disconnected from the active
725 -+ * tree: it must be moved, too.
726 -+ * There is no need to put the sync queues, as the
727 -+ * scheduler has taken no reference.
728 -+ */
729 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
730 -+ if (bfqd != NULL) {
731 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
732 -+ bfq_put_bfqd_unlock(bfqd, &flags);
733 -+ }
734 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
735 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
736 -+ }
737 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
738 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
739 -+
740 -+ /*
741 -+ * We may race with device destruction, take extra care when
742 -+ * dereferencing bfqg->bfqd.
743 -+ */
744 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
745 -+ if (bfqd != NULL) {
746 -+ hlist_del(&bfqg->bfqd_node);
747 -+ __bfq_deactivate_entity(entity, 0);
748 -+ bfq_put_async_queues(bfqd, bfqg);
749 -+ bfq_put_bfqd_unlock(bfqd, &flags);
750 -+ }
751 -+ BUG_ON(entity->tree != NULL);
752 -+
753 -+ /*
754 -+ * No need to defer the kfree() to the end of the RCU grace
755 -+ * period: we are called from the destroy() callback of our
756 -+ * cgroup, so we can be sure that no one is a) still using
757 -+ * this cgroup or b) doing lookups in it.
758 -+ */
759 -+ kfree(bfqg);
760 -+}
761 -+
762 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
763 -+{
764 -+ struct hlist_node *tmp;
765 -+ struct bfq_group *bfqg;
766 -+
767 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
768 -+ bfq_end_raising_async_queues(bfqd, bfqg);
769 -+}
770 -+
771 -+/**
772 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
773 -+ * @bfqd: the device descriptor being exited.
774 -+ *
775 -+ * When the device exits we just make sure that no lookup can return
776 -+ * the now unused group structures. They will be deallocated on cgroup
777 -+ * destruction.
778 -+ */
779 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
780 -+{
781 -+ struct hlist_node *tmp;
782 -+ struct bfq_group *bfqg;
783 -+
784 -+ bfq_log(bfqd, "disconnect_groups beginning") ;
785 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
786 -+ hlist_del(&bfqg->bfqd_node);
787 -+
788 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
789 -+
790 -+ /*
791 -+ * Don't remove from the group hash, just set an
792 -+ * invalid key. No lookups can race with the
793 -+ * assignment as bfqd is being destroyed; this
794 -+ * implies also that new elements cannot be added
795 -+ * to the list.
796 -+ */
797 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
798 -+
799 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
800 -+ bfqg) ;
801 -+ bfq_put_async_queues(bfqd, bfqg);
802 -+ }
803 -+}
804 -+
805 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
806 -+{
807 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
808 -+ struct bfq_group *bfqg = bfqd->root_group;
809 -+
810 -+ bfq_put_async_queues(bfqd, bfqg);
811 -+
812 -+ spin_lock_irq(&bgrp->lock);
813 -+ hlist_del_rcu(&bfqg->group_node);
814 -+ spin_unlock_irq(&bgrp->lock);
815 -+
816 -+ /*
817 -+ * No need to synchronize_rcu() here: since the device is gone
818 -+ * there cannot be any read-side access to its root_group.
819 -+ */
820 -+ kfree(bfqg);
821 -+}
822 -+
823 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
824 -+{
825 -+ struct bfq_group *bfqg;
826 -+ struct bfqio_cgroup *bgrp;
827 -+ int i;
828 -+
829 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
830 -+ if (bfqg == NULL)
831 -+ return NULL;
832 -+
833 -+ bfqg->entity.parent = NULL;
834 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
835 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
836 -+
837 -+ bgrp = &bfqio_root_cgroup;
838 -+ spin_lock_irq(&bgrp->lock);
839 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
840 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
841 -+ spin_unlock_irq(&bgrp->lock);
842 -+
843 -+ return bfqg;
844 -+}
845 -+
846 -+#define SHOW_FUNCTION(__VAR) \
847 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
848 -+ struct cftype *cftype) \
849 -+{ \
850 -+ struct bfqio_cgroup *bgrp; \
851 -+ u64 ret = -ENODEV; \
852 -+ \
853 -+ mutex_lock(&bfqio_mutex); \
854 -+ if (bfqio_is_removed(cgroup)) \
855 -+ goto out_unlock; \
856 -+ \
857 -+ bgrp = cgroup_to_bfqio(cgroup); \
858 -+ spin_lock_irq(&bgrp->lock); \
859 -+ ret = bgrp->__VAR; \
860 -+ spin_unlock_irq(&bgrp->lock); \
861 -+ \
862 -+out_unlock: \
863 -+ mutex_unlock(&bfqio_mutex); \
864 -+ return ret; \
865 -+}
866 -+
867 -+SHOW_FUNCTION(weight);
868 -+SHOW_FUNCTION(ioprio);
869 -+SHOW_FUNCTION(ioprio_class);
870 -+#undef SHOW_FUNCTION
871 -+
872 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
873 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
874 -+ struct cftype *cftype, \
875 -+ u64 val) \
876 -+{ \
877 -+ struct bfqio_cgroup *bgrp; \
878 -+ struct bfq_group *bfqg; \
879 -+ int ret = -EINVAL; \
880 -+ \
881 -+ if (val < (__MIN) || val > (__MAX)) \
882 -+ return ret; \
883 -+ \
884 -+ ret = -ENODEV; \
885 -+ mutex_lock(&bfqio_mutex); \
886 -+ if (bfqio_is_removed(cgroup)) \
887 -+ goto out_unlock; \
888 -+ ret = 0; \
889 -+ \
890 -+ bgrp = cgroup_to_bfqio(cgroup); \
891 -+ \
892 -+ spin_lock_irq(&bgrp->lock); \
893 -+ bgrp->__VAR = (unsigned short)val; \
894 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
895 -+ /* \
896 -+ * Setting the ioprio_changed flag of the entity \
897 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
898 -+ * the value of the weight to its ioprio mapping. \
899 -+ * Set the flag only if necessary. \
900 -+ */ \
901 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
902 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
903 -+ smp_wmb(); \
904 -+ bfqg->entity.ioprio_changed = 1; \
905 -+ } \
906 -+ } \
907 -+ spin_unlock_irq(&bgrp->lock); \
908 -+ \
909 -+out_unlock: \
910 -+ mutex_unlock(&bfqio_mutex); \
911 -+ return ret; \
912 -+}
913 -+
914 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
915 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
916 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
917 -+#undef STORE_FUNCTION
918 -+
919 -+static struct cftype bfqio_files[] = {
920 -+ {
921 -+ .name = "weight",
922 -+ .read_u64 = bfqio_cgroup_weight_read,
923 -+ .write_u64 = bfqio_cgroup_weight_write,
924 -+ },
925 -+ {
926 -+ .name = "ioprio",
927 -+ .read_u64 = bfqio_cgroup_ioprio_read,
928 -+ .write_u64 = bfqio_cgroup_ioprio_write,
929 -+ },
930 -+ {
931 -+ .name = "ioprio_class",
932 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
933 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
934 -+ },
935 -+ { }, /* terminate */
936 -+};
937 -+
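The three cftype entries above are exposed through the cgroup filesystem; with the cgroup v1 naming convention they appear as bfqio.weight, bfqio.ioprio and bfqio.ioprio_class inside each group directory. A hedged usage sketch from userspace, assuming a hypothetical mount point /sys/fs/cgroup/bfqio and an already-created child group called example:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical mount point and group name; adjust to the real setup. */
        FILE *f = fopen("/sys/fs/cgroup/bfqio/example/bfqio.weight", "w");

        if (f == NULL)
            return 1;
        /* The value is range-checked by the STORE_FUNCTION-generated handler
         * above against the BFQ_MIN_WEIGHT..BFQ_MAX_WEIGHT bounds. */
        fprintf(f, "%d\n", 500);
        fclose(f);
        return 0;
    }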
938 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
939 -+{
940 -+ struct bfqio_cgroup *bgrp;
941 -+
942 -+ if (cgroup->parent != NULL) {
943 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
944 -+ if (bgrp == NULL)
945 -+ return ERR_PTR(-ENOMEM);
946 -+ } else
947 -+ bgrp = &bfqio_root_cgroup;
948 -+
949 -+ spin_lock_init(&bgrp->lock);
950 -+ INIT_HLIST_HEAD(&bgrp->group_data);
951 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
952 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
953 -+
954 -+ return &bgrp->css;
955 -+}
956 -+
957 -+/*
958 -+ * We cannot support shared io contexts, as we have no means to support
959 -+ * two tasks with the same ioc in two different groups without major rework
960 -+ * of the main bic/bfqq data structures. By now we allow a task to change
961 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
962 -+ * behavior is that a group containing a task that forked using CLONE_IO
963 -+ * will not be destroyed until the tasks sharing the ioc die.
964 -+ */
965 -+static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
966 -+{
967 -+ struct task_struct *task;
968 -+ struct io_context *ioc;
969 -+ int ret = 0;
970 -+
971 -+ cgroup_taskset_for_each(task, cgroup, tset) {
972 -+ /* task_lock() is needed to avoid races with exit_io_context() */
973 -+ task_lock(task);
974 -+ ioc = task->io_context;
975 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
976 -+ /*
977 -+ * ioc == NULL means that the task is either too young or
978 -+ * exiting: if it still has no ioc the ioc can't be shared,
979 -+ * if the task is exiting the attach will fail anyway, no
980 -+ * matter what we return here.
981 -+ */
982 -+ ret = -EINVAL;
983 -+ task_unlock(task);
984 -+ if (ret)
985 -+ break;
986 -+ }
987 -+
988 -+ return ret;
989 -+}
990 -+
991 -+static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
992 -+{
993 -+ struct task_struct *task;
994 -+ struct io_context *ioc;
995 -+ struct io_cq *icq;
996 -+
997 -+ /*
998 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
999 -+ * new group has not yet been tested.
1000 -+ */
1001 -+ cgroup_taskset_for_each(task, cgroup, tset) {
1002 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1003 -+ if (ioc) {
1004 -+ /*
1005 -+ * Handle cgroup change here.
1006 -+ */
1007 -+ rcu_read_lock();
1008 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1009 -+ if (!strncmp(icq->q->elevator->type->elevator_name,
1010 -+ "bfq", ELV_NAME_MAX))
1011 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
1012 -+ cgroup);
1013 -+ rcu_read_unlock();
1014 -+ put_io_context(ioc);
1015 -+ }
1016 -+ }
1017 -+}
1018 -+
1019 -+static void bfqio_destroy(struct cgroup *cgroup)
1020 -+{
1021 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
1022 -+ struct hlist_node *tmp;
1023 -+ struct bfq_group *bfqg;
1024 -+
1025 -+ /*
1026 -+ * Since we are destroying the cgroup, there are no more tasks
1027 -+ * referencing it, and all the RCU grace periods that may have
1028 -+ * referenced it are ended (as the destruction of the parent
1029 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1030 -+ * anything else and we don't need any synchronization.
1031 -+ */
1032 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1033 -+ bfq_destroy_group(bgrp, bfqg);
1034 -+
1035 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
1036 -+
1037 -+ kfree(bgrp);
1038 -+}
1039 -+
1040 -+struct cgroup_subsys bfqio_subsys = {
1041 -+ .name = "bfqio",
1042 -+ .css_alloc = bfqio_create,
1043 -+ .can_attach = bfqio_can_attach,
1044 -+ .attach = bfqio_attach,
1045 -+ .css_free = bfqio_destroy,
1046 -+ .subsys_id = bfqio_subsys_id,
1047 -+ .base_cftypes = bfqio_files,
1048 -+};
1049 -+#else
1050 -+static inline void bfq_init_entity(struct bfq_entity *entity,
1051 -+ struct bfq_group *bfqg)
1052 -+{
1053 -+ entity->weight = entity->new_weight;
1054 -+ entity->orig_weight = entity->new_weight;
1055 -+ entity->ioprio = entity->new_ioprio;
1056 -+ entity->ioprio_class = entity->new_ioprio_class;
1057 -+ entity->sched_data = &bfqg->sched_data;
1058 -+}
1059 -+
1060 -+static inline struct bfq_group *
1061 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1062 -+{
1063 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
1064 -+ return bfqd->root_group;
1065 -+}
1066 -+
1067 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1068 -+ struct bfq_queue *bfqq,
1069 -+ struct bfq_entity *entity,
1070 -+ struct bfq_group *bfqg)
1071 -+{
1072 -+}
1073 -+
1074 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
1075 -+{
1076 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1077 -+}
1078 -+
1079 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1080 -+{
1081 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
1082 -+}
1083 -+
1084 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
1085 -+{
1086 -+ kfree(bfqd->root_group);
1087 -+}
1088 -+
1089 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1090 -+{
1091 -+ struct bfq_group *bfqg;
1092 -+ int i;
1093 -+
1094 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1095 -+ if (bfqg == NULL)
1096 -+ return NULL;
1097 -+
1098 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1099 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1100 -+
1101 -+ return bfqg;
1102 -+}
1103 -+#endif
1104 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1105 -new file mode 100644
1106 -index 0000000..326e3ec
1107 ---- /dev/null
1108 -+++ b/block/bfq-ioc.c
1109 -@@ -0,0 +1,36 @@
1110 -+/*
1111 -+ * BFQ: I/O context handling.
1112 -+ *
1113 -+ * Based on ideas and code from CFQ:
1114 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1115 -+ *
1116 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1117 -+ * Paolo Valente <paolo.valente@×××××××.it>
1118 -+ *
1119 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1120 -+ */
1121 -+
1122 -+/**
1123 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1124 -+ * @icq: the iocontext queue.
1125 -+ */
1126 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1127 -+{
1128 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
1129 -+ return container_of(icq, struct bfq_io_cq, icq);
1130 -+}
1131 -+
1132 -+/**
1133 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1134 -+ * @bfqd: the lookup key.
1135 -+ * @ioc: the io_context of the process doing I/O.
1136 -+ *
1137 -+ * Queue lock must be held.
1138 -+ */
1139 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1140 -+ struct io_context *ioc)
1141 -+{
1142 -+ if(ioc)
1143 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1144 -+ return NULL;
1145 -+}
1146 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1147 -new file mode 100644
1148 -index 0000000..b230927
1149 ---- /dev/null
1150 -+++ b/block/bfq-iosched.c
1151 -@@ -0,0 +1,3070 @@
1152 -+/*
1153 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
1154 -+ *
1155 -+ * Based on ideas and code from CFQ:
1156 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1157 -+ *
1158 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1159 -+ * Paolo Valente <paolo.valente@×××××××.it>
1160 -+ *
1161 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1162 -+ *
1163 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1164 -+ *
1165 -+ * BFQ is a proportional share disk scheduling algorithm based on the
1166 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
1167 -+ * measured in number of sectors, to tasks instead of time slices.
1168 -+ * The disk is not granted to the active task for a given time slice,
1169 -+ * but until it has exhausted its assigned budget. This change from
1170 -+ * the time to the service domain allows BFQ to distribute the disk
1171 -+ * bandwidth among tasks as desired, without any distortion due to
1172 -+ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
1173 -+ * internal scheduler, called B-WF2Q+, to schedule tasks according to
1174 -+ * their budgets. Thanks to this accurate scheduler, BFQ can afford
1175 -+ * to assign high budgets to disk-bound non-seeky tasks (to boost the
1176 -+ * throughput), and yet guarantee low latencies to interactive and
1177 -+ * soft real-time applications.
1178 -+ *
1179 -+ * BFQ has been introduced in [1], where the interested reader can
1180 -+ * find an accurate description of the algorithm, the bandwidth
1181 -+ * distribution and latency guarantees it provides, plus formal proofs
1182 -+ * of all the properties. With respect to the algorithm presented in
1183 -+ * the paper, this implementation adds several little heuristics, and
1184 -+ * a hierarchical extension, based on H-WF2Q+.
1185 -+ *
1186 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1187 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1188 -+ * complexity derives from the one introduced with EEVDF in [3].
1189 -+ *
1190 -+ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
1191 -+ * with Deterministic Guarantees on Bandwidth Distribution,'',
1192 -+ * IEEE Transactions on Computers, May 2010.
1193 -+ *
1194 -+ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
1195 -+ *
1196 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1197 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1198 -+ * Oct 1997.
1199 -+ *
1200 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1201 -+ *
1202 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1203 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
1204 -+ * Resource Allocation,'' technical report.
1205 -+ *
1206 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1207 -+ */
1208 -+#include <linux/module.h>
1209 -+#include <linux/slab.h>
1210 -+#include <linux/blkdev.h>
1211 -+#include <linux/cgroup.h>
1212 -+#include <linux/elevator.h>
1213 -+#include <linux/jiffies.h>
1214 -+#include <linux/rbtree.h>
1215 -+#include <linux/ioprio.h>
1216 -+#include "bfq.h"
1217 -+#include "blk.h"
1218 -+
1219 -+/* Max number of dispatches in one round of service. */
1220 -+static const int bfq_quantum = 4;
1221 -+
1222 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1223 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1224 -+
1225 -+/* Maximum backwards seek, in KiB. */
1226 -+static const int bfq_back_max = 16 * 1024;
1227 -+
1228 -+/* Penalty of a backwards seek, in number of sectors. */
1229 -+static const int bfq_back_penalty = 2;
1230 -+
1231 -+/* Idling period duration, in jiffies. */
1232 -+static int bfq_slice_idle = HZ / 125;
1233 -+
1234 -+/* Default maximum budget values, in sectors and number of requests. */
1235 -+static const int bfq_default_max_budget = 16 * 1024;
1236 -+static const int bfq_max_budget_async_rq = 4;
1237 -+
1238 -+/*
1239 -+ * Async to sync throughput distribution is controlled as follows:
1240 -+ * when an async request is served, the entity is charged the number
1241 -+ * of sectors of the request, multiplied by the factor below
1242 -+ */
1243 -+static const int bfq_async_charge_factor = 10;
1244 -+
1245 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
1246 -+static const int bfq_timeout_sync = HZ / 8;
1247 -+static int bfq_timeout_async = HZ / 25;
1248 -+
1249 -+struct kmem_cache *bfq_pool;
1250 -+
1251 -+/* Below this threshold (in ms), we consider thinktime immediate. */
1252 -+#define BFQ_MIN_TT 2
1253 -+
1254 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
1255 -+#define BFQ_HW_QUEUE_THRESHOLD 4
1256 -+#define BFQ_HW_QUEUE_SAMPLES 32
1257 -+
1258 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1259 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1260 -+
1261 -+/* Min samples used for peak rate estimation (for autotuning). */
1262 -+#define BFQ_PEAK_RATE_SAMPLES 32
1263 -+
1264 -+/* Shift used for peak rate fixed precision calculations. */
1265 -+#define BFQ_RATE_SHIFT 16
1266 -+
1267 -+/*
1268 -+ * The duration of the weight raising for interactive applications is
1269 -+ * computed automatically (as default behaviour), using the following
1270 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
1271 -+ * disk, and R and T are two reference parameters. In particular, R is
1272 -+ * the peak rate of a reference disk, and T is about the maximum time
1273 -+ * for starting popular large applications on that disk, under BFQ and
1274 -+ * while reading two files in parallel. Finally, BFQ uses two
1275 -+ * different pairs (R, T) depending on whether the disk is rotational
1276 -+ * or non-rotational.
1277 -+ */
1278 -+#define T_rot (msecs_to_jiffies(5500))
1279 -+#define T_nonrot (msecs_to_jiffies(2000))
1280 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1281 -+#define R_rot 17415
1282 -+#define R_nonrot 34791
1283 -+
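As a worked instance of the duration formula in the comment above, using the reference constants just defined: for a rotational disk whose measured peak rate r is half the reference rate R_rot,

    duration = (R_rot / r) * T_rot = 2 * 5500 ms = 11000 ms

i.e. roughly eleven seconds of weight raising. bfq_wrais_duration() further below computes exactly this quantity as RT_prod / peak_rate.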
1284 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1285 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1286 -+
1287 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1288 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1289 -+
1290 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1291 -+
1292 -+#include "bfq-ioc.c"
1293 -+#include "bfq-sched.c"
1294 -+#include "bfq-cgroup.c"
1295 -+
1296 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1297 -+ IOPRIO_CLASS_IDLE)
1298 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1299 -+ IOPRIO_CLASS_RT)
1300 -+
1301 -+#define bfq_sample_valid(samples) ((samples) > 80)
1302 -+
1303 -+/*
1304 -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1305 -+ * set (in which case it could also be a direct WRITE).
1306 -+ */
1307 -+static inline int bfq_bio_sync(struct bio *bio)
1308 -+{
1309 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1310 -+ return 1;
1311 -+
1312 -+ return 0;
1313 -+}
1314 -+
1315 -+/*
1316 -+ * Scheduler run of queue, if there are requests pending and no one in the
1317 -+ * driver that will restart queueing.
1318 -+ */
1319 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1320 -+{
1321 -+ if (bfqd->queued != 0) {
1322 -+ bfq_log(bfqd, "schedule dispatch");
1323 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1324 -+ }
1325 -+}
1326 -+
1327 -+/*
1328 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1329 -+ * We choose the request that is closest to the head right now. Distance
1330 -+ * behind the head is penalized and only allowed to a certain extent.
1331 -+ */
1332 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
1333 -+ struct request *rq1,
1334 -+ struct request *rq2,
1335 -+ sector_t last)
1336 -+{
1337 -+ sector_t s1, s2, d1 = 0, d2 = 0;
1338 -+ unsigned long back_max;
1339 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1340 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1341 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1342 -+
1343 -+ if (rq1 == NULL || rq1 == rq2)
1344 -+ return rq2;
1345 -+ if (rq2 == NULL)
1346 -+ return rq1;
1347 -+
1348 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1349 -+ return rq1;
1350 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1351 -+ return rq2;
1352 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1353 -+ return rq1;
1354 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1355 -+ return rq2;
1356 -+
1357 -+ s1 = blk_rq_pos(rq1);
1358 -+ s2 = blk_rq_pos(rq2);
1359 -+
1360 -+ /*
1361 -+ * By definition, 1KiB is 2 sectors.
1362 -+ */
1363 -+ back_max = bfqd->bfq_back_max * 2;
1364 -+
1365 -+ /*
1366 -+ * Strict one way elevator _except_ in the case where we allow
1367 -+ * short backward seeks which are biased as twice the cost of a
1368 -+ * similar forward seek.
1369 -+ */
1370 -+ if (s1 >= last)
1371 -+ d1 = s1 - last;
1372 -+ else if (s1 + back_max >= last)
1373 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
1374 -+ else
1375 -+ wrap |= BFQ_RQ1_WRAP;
1376 -+
1377 -+ if (s2 >= last)
1378 -+ d2 = s2 - last;
1379 -+ else if (s2 + back_max >= last)
1380 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
1381 -+ else
1382 -+ wrap |= BFQ_RQ2_WRAP;
1383 -+
1384 -+ /* Found required data */
1385 -+
1386 -+ /*
1387 -+ * By doing switch() on the bit mask "wrap" we avoid having to
1388 -+ * check two variables for all permutations: --> faster!
1389 -+ */
1390 -+ switch (wrap) {
1391 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1392 -+ if (d1 < d2)
1393 -+ return rq1;
1394 -+ else if (d2 < d1)
1395 -+ return rq2;
1396 -+ else {
1397 -+ if (s1 >= s2)
1398 -+ return rq1;
1399 -+ else
1400 -+ return rq2;
1401 -+ }
1402 -+
1403 -+ case BFQ_RQ2_WRAP:
1404 -+ return rq1;
1405 -+ case BFQ_RQ1_WRAP:
1406 -+ return rq2;
1407 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1408 -+ default:
1409 -+ /*
1410 -+ * Since both rqs are wrapped,
1411 -+ * start with the one that's further behind head
1412 -+ * (--> only *one* back seek required),
1413 -+ * since back seek takes more time than forward.
1414 -+ */
1415 -+ if (s1 <= s2)
1416 -+ return rq1;
1417 -+ else
1418 -+ return rq2;
1419 -+ }
1420 -+}
1421 -+
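To make the backward-seek penalty in bfq_choose_req() above concrete, a hypothetical worked case with the default bfq_back_penalty of 2: with the head at sector 10000, a forward request at sector 10400 and a backward request at sector 9800 (well within back_max) end up with equal distances,

    d_forward  = 10400 - 10000      = 400 sectors
    d_backward = (10000 - 9800) * 2 = 400 sectors

and the tie-break in the wrap == 0 branch then picks whichever request has the larger sector number, i.e. the forward one.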
1422 -+static struct bfq_queue *
1423 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1424 -+ sector_t sector, struct rb_node **ret_parent,
1425 -+ struct rb_node ***rb_link)
1426 -+{
1427 -+ struct rb_node **p, *parent;
1428 -+ struct bfq_queue *bfqq = NULL;
1429 -+
1430 -+ parent = NULL;
1431 -+ p = &root->rb_node;
1432 -+ while (*p) {
1433 -+ struct rb_node **n;
1434 -+
1435 -+ parent = *p;
1436 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1437 -+
1438 -+ /*
1439 -+ * Sort strictly based on sector. Smallest to the left,
1440 -+ * largest to the right.
1441 -+ */
1442 -+ if (sector > blk_rq_pos(bfqq->next_rq))
1443 -+ n = &(*p)->rb_right;
1444 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
1445 -+ n = &(*p)->rb_left;
1446 -+ else
1447 -+ break;
1448 -+ p = n;
1449 -+ bfqq = NULL;
1450 -+ }
1451 -+
1452 -+ *ret_parent = parent;
1453 -+ if (rb_link)
1454 -+ *rb_link = p;
1455 -+
1456 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1457 -+ (long long unsigned)sector,
1458 -+ bfqq != NULL ? bfqq->pid : 0);
1459 -+
1460 -+ return bfqq;
1461 -+}
1462 -+
1463 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1464 -+{
1465 -+ struct rb_node **p, *parent;
1466 -+ struct bfq_queue *__bfqq;
1467 -+
1468 -+ if (bfqq->pos_root != NULL) {
1469 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1470 -+ bfqq->pos_root = NULL;
1471 -+ }
1472 -+
1473 -+ if (bfq_class_idle(bfqq))
1474 -+ return;
1475 -+ if (!bfqq->next_rq)
1476 -+ return;
1477 -+
1478 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
1479 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1480 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
1481 -+ if (__bfqq == NULL) {
1482 -+ rb_link_node(&bfqq->pos_node, parent, p);
1483 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1484 -+ } else
1485 -+ bfqq->pos_root = NULL;
1486 -+}
1487 -+
1488 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1489 -+ struct bfq_queue *bfqq,
1490 -+ struct request *last)
1491 -+{
1492 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
1493 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
1494 -+ struct request *next = NULL, *prev = NULL;
1495 -+
1496 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1497 -+
1498 -+ if (rbprev != NULL)
1499 -+ prev = rb_entry_rq(rbprev);
1500 -+
1501 -+ if (rbnext != NULL)
1502 -+ next = rb_entry_rq(rbnext);
1503 -+ else {
1504 -+ rbnext = rb_first(&bfqq->sort_list);
1505 -+ if (rbnext && rbnext != &last->rb_node)
1506 -+ next = rb_entry_rq(rbnext);
1507 -+ }
1508 -+
1509 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1510 -+}
1511 -+
1512 -+static void bfq_del_rq_rb(struct request *rq)
1513 -+{
1514 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1515 -+ struct bfq_data *bfqd = bfqq->bfqd;
1516 -+ const int sync = rq_is_sync(rq);
1517 -+
1518 -+ BUG_ON(bfqq->queued[sync] == 0);
1519 -+ bfqq->queued[sync]--;
1520 -+ bfqd->queued--;
1521 -+
1522 -+ elv_rb_del(&bfqq->sort_list, rq);
1523 -+
1524 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1525 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
1526 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1527 -+ /*
1528 -+ * Remove queue from request-position tree as it is empty.
1529 -+ */
1530 -+ if (bfqq->pos_root != NULL) {
1531 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1532 -+ bfqq->pos_root = NULL;
1533 -+ }
1534 -+ }
1535 -+}
1536 -+
1537 -+/* see the definition of bfq_async_charge_factor for details */
1538 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
1539 -+ struct bfq_queue *bfqq)
1540 -+{
1541 -+ return blk_rq_sectors(rq) *
1542 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1543 -+ bfq_async_charge_factor));
1544 -+}
1545 -+
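A quick numeric reading of the charging rule above, with the bfq_async_charge_factor of 10 defined earlier: an 8-sector request from an async, non-weight-raised queue (raising_coeff == 1) is charged

    8 * (1 + 1 * 1 * 10) = 88 sectors

of budget, while the same request from a sync queue, or from an async queue currently being weight-raised, is charged only its plain 8 sectors.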
1546 -+/**
1547 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
1548 -+ * @bfqd: the device data the queue belongs to.
1549 -+ * @bfqq: the queue to update.
1550 -+ *
1551 -+ * If the first request of a queue changes we make sure that the queue
1552 -+ * has enough budget to serve at least its first request (if the
1553 -+ * request has grown). We do this because if the queue does not have enough
1554 -+ * budget for its first request, it has to go through two dispatch
1555 -+ * rounds to actually get it dispatched.
1556 -+ */
1557 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
1558 -+ struct bfq_queue *bfqq)
1559 -+{
1560 -+ struct bfq_entity *entity = &bfqq->entity;
1561 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1562 -+ struct request *next_rq = bfqq->next_rq;
1563 -+ unsigned long new_budget;
1564 -+
1565 -+ if (next_rq == NULL)
1566 -+ return;
1567 -+
1568 -+ if (bfqq == bfqd->active_queue)
1569 -+ /*
1570 -+ * In order not to break guarantees, budgets cannot be
1571 -+ * changed after an entity has been selected.
1572 -+ */
1573 -+ return;
1574 -+
1575 -+ BUG_ON(entity->tree != &st->active);
1576 -+ BUG_ON(entity == entity->sched_data->active_entity);
1577 -+
1578 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
1579 -+ bfq_serv_to_charge(next_rq, bfqq));
1580 -+ entity->budget = new_budget;
1581 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1582 -+ bfq_activate_bfqq(bfqd, bfqq);
1583 -+}
1584 -+
1585 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1586 -+{
1587 -+ u64 dur;
1588 -+
1589 -+ if (bfqd->bfq_raising_max_time > 0)
1590 -+ return bfqd->bfq_raising_max_time;
1591 -+
1592 -+ dur = bfqd->RT_prod;
1593 -+ do_div(dur, bfqd->peak_rate);
1594 -+
1595 -+ return dur;
1596 -+}
1597 -+
1598 -+static void bfq_add_rq_rb(struct request *rq)
1599 -+{
1600 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1601 -+ struct bfq_entity *entity = &bfqq->entity;
1602 -+ struct bfq_data *bfqd = bfqq->bfqd;
1603 -+ struct request *next_rq, *prev;
1604 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
1605 -+ int idle_for_long_time = bfqq->budget_timeout +
1606 -+ bfqd->bfq_raising_min_idle_time < jiffies;
1607 -+
1608 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1609 -+ bfqq->queued[rq_is_sync(rq)]++;
1610 -+ bfqd->queued++;
1611 -+
1612 -+ elv_rb_add(&bfqq->sort_list, rq);
1613 -+
1614 -+ /*
1615 -+ * Check if this request is a better next-serve candidate.
1616 -+ */
1617 -+ prev = bfqq->next_rq;
1618 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1619 -+ BUG_ON(next_rq == NULL);
1620 -+ bfqq->next_rq = next_rq;
1621 -+
1622 -+ /*
1623 -+ * Adjust priority tree position, if next_rq changes.
1624 -+ */
1625 -+ if (prev != bfqq->next_rq)
1626 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
1627 -+
1628 -+ if (!bfq_bfqq_busy(bfqq)) {
1629 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1630 -+ bfqq->soft_rt_next_start < jiffies;
1631 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
1632 -+ bfq_serv_to_charge(next_rq, bfqq));
1633 -+
1634 -+ if (!bfqd->low_latency)
1635 -+ goto add_bfqq_busy;
1636 -+
1637 -+ /*
1638 -+ * If the queue is not being boosted and has been idle
1639 -+ * for enough time, start a weight-raising period
1640 -+ */
1641 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
1642 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1643 -+ if (idle_for_long_time)
1644 -+ bfqq->raising_cur_max_time =
1645 -+ bfq_wrais_duration(bfqd);
1646 -+ else
1647 -+ bfqq->raising_cur_max_time =
1648 -+ bfqd->bfq_raising_rt_max_time;
1649 -+ bfq_log_bfqq(bfqd, bfqq,
1650 -+ "wrais starting at %llu msec,"
1651 -+ "rais_max_time %u",
1652 -+ bfqq->last_rais_start_finish,
1653 -+ jiffies_to_msecs(bfqq->
1654 -+ raising_cur_max_time));
1655 -+ } else if (old_raising_coeff > 1) {
1656 -+ if (idle_for_long_time)
1657 -+ bfqq->raising_cur_max_time =
1658 -+ bfq_wrais_duration(bfqd);
1659 -+ else if (bfqq->raising_cur_max_time ==
1660 -+ bfqd->bfq_raising_rt_max_time &&
1661 -+ !soft_rt) {
1662 -+ bfqq->raising_coeff = 1;
1663 -+ bfq_log_bfqq(bfqd, bfqq,
1664 -+ "wrais ending at %llu msec,"
1665 -+ "rais_max_time %u",
1666 -+ bfqq->last_rais_start_finish,
1667 -+ jiffies_to_msecs(bfqq->
1668 -+ raising_cur_max_time));
1669 -+ }
1670 -+ }
1671 -+ if (old_raising_coeff != bfqq->raising_coeff)
1672 -+ entity->ioprio_changed = 1;
1673 -+add_bfqq_busy:
1674 -+ bfq_add_bfqq_busy(bfqd, bfqq);
1675 -+ } else {
1676 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
1677 -+ !rq_is_sync(rq) &&
1678 -+ bfqq->last_rais_start_finish +
1679 -+ bfqd->bfq_raising_min_inter_arr_async < jiffies) {
1680 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1681 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1682 -+
1683 -+ entity->ioprio_changed = 1;
1684 -+ bfq_log_bfqq(bfqd, bfqq,
1685 -+ "non-idle wrais starting at %llu msec,"
1686 -+ "rais_max_time %u",
1687 -+ bfqq->last_rais_start_finish,
1688 -+ jiffies_to_msecs(bfqq->
1689 -+ raising_cur_max_time));
1690 -+ }
1691 -+ bfq_updated_next_req(bfqd, bfqq);
1692 -+ }
1693 -+
1694 -+ if (bfqd->low_latency &&
1695 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1696 -+ idle_for_long_time))
1697 -+ bfqq->last_rais_start_finish = jiffies;
1698 -+}
1699 -+
1700 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1701 -+{
1702 -+ elv_rb_del(&bfqq->sort_list, rq);
1703 -+ bfqq->queued[rq_is_sync(rq)]--;
1704 -+ bfqq->bfqd->queued--;
1705 -+ bfq_add_rq_rb(rq);
1706 -+}
1707 -+
1708 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1709 -+ struct bio *bio)
1710 -+{
1711 -+ struct task_struct *tsk = current;
1712 -+ struct bfq_io_cq *bic;
1713 -+ struct bfq_queue *bfqq;
1714 -+
1715 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1716 -+ if (bic == NULL)
1717 -+ return NULL;
1718 -+
1719 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1720 -+ if (bfqq != NULL) {
1721 -+ sector_t sector = bio->bi_sector + bio_sectors(bio);
1722 -+
1723 -+ return elv_rb_find(&bfqq->sort_list, sector);
1724 -+ }
1725 -+
1726 -+ return NULL;
1727 -+}
1728 -+
1729 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
1730 -+{
1731 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1732 -+
1733 -+ bfqd->rq_in_driver++;
1734 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1735 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1736 -+ (long long unsigned)bfqd->last_position);
1737 -+}
1738 -+
1739 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1740 -+{
1741 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1742 -+
1743 -+ WARN_ON(bfqd->rq_in_driver == 0);
1744 -+ bfqd->rq_in_driver--;
1745 -+}
1746 -+
1747 -+static void bfq_remove_request(struct request *rq)
1748 -+{
1749 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1750 -+ struct bfq_data *bfqd = bfqq->bfqd;
1751 -+
1752 -+ if (bfqq->next_rq == rq) {
1753 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1754 -+ bfq_updated_next_req(bfqd, bfqq);
1755 -+ }
1756 -+
1757 -+ list_del_init(&rq->queuelist);
1758 -+ bfq_del_rq_rb(rq);
1759 -+
1760 -+ if (rq->cmd_flags & REQ_META) {
1761 -+ WARN_ON(bfqq->meta_pending == 0);
1762 -+ bfqq->meta_pending--;
1763 -+ }
1764 -+}
1765 -+
1766 -+static int bfq_merge(struct request_queue *q, struct request **req,
1767 -+ struct bio *bio)
1768 -+{
1769 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1770 -+ struct request *__rq;
1771 -+
1772 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
1773 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1774 -+ *req = __rq;
1775 -+ return ELEVATOR_FRONT_MERGE;
1776 -+ }
1777 -+
1778 -+ return ELEVATOR_NO_MERGE;
1779 -+}
1780 -+
1781 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
1782 -+ int type)
1783 -+{
1784 -+ if (type == ELEVATOR_FRONT_MERGE) {
1785 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
1786 -+
1787 -+ bfq_reposition_rq_rb(bfqq, req);
1788 -+ }
1789 -+}
1790 -+
1791 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1792 -+ struct request *next)
1793 -+{
1794 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1795 -+
1796 -+ /*
1797 -+ * Reposition in fifo if next is older than rq.
1798 -+ */
1799 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1800 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1801 -+ list_move(&rq->queuelist, &next->queuelist);
1802 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
1803 -+ }
1804 -+
1805 -+ if (bfqq->next_rq == next)
1806 -+ bfqq->next_rq = rq;
1807 -+
1808 -+ bfq_remove_request(next);
1809 -+}
1810 -+
1811 -+/* Must be called with bfqq != NULL */
1812 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
1813 -+{
1814 -+ BUG_ON(bfqq == NULL);
1815 -+ bfqq->raising_coeff = 1;
1816 -+ bfqq->raising_cur_max_time = 0;
1817 -+ /* Trigger a weight change on the next activation of the queue */
1818 -+ bfqq->entity.ioprio_changed = 1;
1819 -+}
1820 -+
1821 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
1822 -+ struct bfq_group *bfqg)
1823 -+{
1824 -+ int i, j;
1825 -+
1826 -+ for (i = 0; i < 2; i++)
1827 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
1828 -+ if (bfqg->async_bfqq[i][j] != NULL)
1829 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
1830 -+ if (bfqg->async_idle_bfqq != NULL)
1831 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
1832 -+}
1833 -+
1834 -+static void bfq_end_raising(struct bfq_data *bfqd)
1835 -+{
1836 -+ struct bfq_queue *bfqq;
1837 -+
1838 -+ spin_lock_irq(bfqd->queue->queue_lock);
1839 -+
1840 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
1841 -+ bfq_bfqq_end_raising(bfqq);
1842 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
1843 -+ bfq_bfqq_end_raising(bfqq);
1844 -+ bfq_end_raising_async(bfqd);
1845 -+
1846 -+ spin_unlock_irq(bfqd->queue->queue_lock);
1847 -+}
1848 -+
1849 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1850 -+ struct bio *bio)
1851 -+{
1852 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1853 -+ struct bfq_io_cq *bic;
1854 -+ struct bfq_queue *bfqq;
1855 -+
1856 -+ /*
1857 -+ * Disallow merge of a sync bio into an async request.
1858 -+ */
1859 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1860 -+ return 0;
1861 -+
1862 -+ /*
1863 -+ * Lookup the bfqq that this bio will be queued with. Allow
1864 -+ * merge only if rq is queued there.
1865 -+ * Queue lock is held here.
1866 -+ */
1867 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
1868 -+ if (bic == NULL)
1869 -+ return 0;
1870 -+
1871 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1872 -+ return bfqq == RQ_BFQQ(rq);
1873 -+}
1874 -+
1875 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
1876 -+ struct bfq_queue *bfqq)
1877 -+{
1878 -+ if (bfqq != NULL) {
1879 -+ bfq_mark_bfqq_must_alloc(bfqq);
1880 -+ bfq_mark_bfqq_budget_new(bfqq);
1881 -+ bfq_clear_bfqq_fifo_expire(bfqq);
1882 -+
1883 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
1884 -+
1885 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
1886 -+ bfqq->entity.budget);
1887 -+ }
1888 -+
1889 -+ bfqd->active_queue = bfqq;
1890 -+}
1891 -+
1892 -+/*
1893 -+ * Get and set a new active queue for service.
1894 -+ */
1895 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
1896 -+ struct bfq_queue *bfqq)
1897 -+{
1898 -+ if (!bfqq)
1899 -+ bfqq = bfq_get_next_queue(bfqd);
1900 -+ else
1901 -+ bfq_get_next_queue_forced(bfqd, bfqq);
1902 -+
1903 -+ __bfq_set_active_queue(bfqd, bfqq);
1904 -+ return bfqq;
1905 -+}
1906 -+
1907 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
1908 -+ struct request *rq)
1909 -+{
1910 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
1911 -+ return blk_rq_pos(rq) - bfqd->last_position;
1912 -+ else
1913 -+ return bfqd->last_position - blk_rq_pos(rq);
1914 -+}
1915 -+
1916 -+/*
1917 -+ * Return true if bfqq has no request pending and rq is close enough to
1918 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
1919 -+ * bfqq->next_rq
1920 -+ */
1921 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
1922 -+{
1923 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
1924 -+}
1925 -+
1926 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
1927 -+{
1928 -+ struct rb_root *root = &bfqd->rq_pos_tree;
1929 -+ struct rb_node *parent, *node;
1930 -+ struct bfq_queue *__bfqq;
1931 -+ sector_t sector = bfqd->last_position;
1932 -+
1933 -+ if (RB_EMPTY_ROOT(root))
1934 -+ return NULL;
1935 -+
1936 -+ /*
1937 -+ * First, if we find a request starting at the end of the last
1938 -+ * request, choose it.
1939 -+ */
1940 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
1941 -+ if (__bfqq != NULL)
1942 -+ return __bfqq;
1943 -+
1944 -+ /*
1945 -+ * If the exact sector wasn't found, the parent of the NULL leaf
1946 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
1947 -+ * position).
1948 -+ */
1949 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1950 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
1951 -+ return __bfqq;
1952 -+
1953 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
1954 -+ node = rb_next(&__bfqq->pos_node);
1955 -+ else
1956 -+ node = rb_prev(&__bfqq->pos_node);
1957 -+ if (node == NULL)
1958 -+ return NULL;
1959 -+
1960 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
1961 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
1962 -+ return __bfqq;
1963 -+
1964 -+ return NULL;
1965 -+}
1966 -+
1967 -+/*
1968 -+ * bfqd - obvious
1969 -+ * cur_bfqq - passed in so that we don't decide that the current queue
1970 -+ * is closely cooperating with itself.
1971 -+ *
1972 -+ * We are assuming that cur_bfqq has dispatched at least one request,
1973 -+ * and that bfqd->last_position reflects a position on the disk associated
1974 -+ * with the I/O issued by cur_bfqq.
1975 -+ */
1976 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
1977 -+ struct bfq_queue *cur_bfqq)
1978 -+{
1979 -+ struct bfq_queue *bfqq;
1980 -+
1981 -+ if (bfq_class_idle(cur_bfqq))
1982 -+ return NULL;
1983 -+ if (!bfq_bfqq_sync(cur_bfqq))
1984 -+ return NULL;
1985 -+ if (BFQQ_SEEKY(cur_bfqq))
1986 -+ return NULL;
1987 -+
1988 -+ /* If device has only one backlogged bfq_queue, don't search. */
1989 -+ if (bfqd->busy_queues == 1)
1990 -+ return NULL;
1991 -+
1992 -+ /*
1993 -+ * We should notice if some of the queues are cooperating, e.g.
1994 -+ * working closely on the same area of the disk. In that case,
1995 -+ * we can group them together and not waste time idling.
1996 -+ */
1997 -+ bfqq = bfqq_close(bfqd);
1998 -+ if (bfqq == NULL || bfqq == cur_bfqq)
1999 -+ return NULL;
2000 -+
2001 -+ /*
2002 -+ * Do not merge queues from different bfq_groups.
2003 -+ */
2004 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2005 -+ return NULL;
2006 -+
2007 -+ /*
2008 -+ * It only makes sense to merge sync queues.
2009 -+ */
2010 -+ if (!bfq_bfqq_sync(bfqq))
2011 -+ return NULL;
2012 -+ if (BFQQ_SEEKY(bfqq))
2013 -+ return NULL;
2014 -+
2015 -+ /*
2016 -+ * Do not merge queues of different priority classes.
2017 -+ */
2018 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2019 -+ return NULL;
2020 -+
2021 -+ return bfqq;
2022 -+}
2023 -+
2024 -+/*
2025 -+ * If enough samples have been computed, return the current max budget
2026 -+ * stored in bfqd, which is dynamically updated according to the
2027 -+ * estimated disk peak rate; otherwise return the default max budget
2028 -+ */
2029 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2030 -+{
2031 -+ if (bfqd->budgets_assigned < 194)
2032 -+ return bfq_default_max_budget;
2033 -+ else
2034 -+ return bfqd->bfq_max_budget;
2035 -+}
2036 -+
2037 -+/*
2038 -+ * Return min budget, which is a fraction of the current or default
2039 -+ * max budget (trying with 1/32)
2040 -+ */
2041 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2042 -+{
2043 -+ if (bfqd->budgets_assigned < 194)
2044 -+ return bfq_default_max_budget / 32;
2045 -+ else
2046 -+ return bfqd->bfq_max_budget / 32;
2047 -+}
2048 -+
2049 -+/*
2050 -+ * Decides whether idling should be done for given device and
2051 -+ * given active queue.
2052 -+ */
2053 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2054 -+ struct bfq_queue *active_bfqq)
2055 -+{
2056 -+ if (active_bfqq == NULL)
2057 -+ return false;
2058 -+ /*
2059 -+ * If device is SSD it has no seek penalty, disable idling; but
2060 -+ * do so only if:
2061 -+ * - device does not support queuing, otherwise we still have
2062 -+ * a problem with sync vs async workloads;
2063 -+ * - the queue is not weight-raised, to preserve guarantees.
2064 -+ */
2065 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2066 -+ active_bfqq->raising_coeff == 1);
2067 -+}
2068 -+
2069 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2070 -+{
2071 -+ struct bfq_queue *bfqq = bfqd->active_queue;
2072 -+ struct bfq_io_cq *bic;
2073 -+ unsigned long sl;
2074 -+
2075 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2076 -+
2077 -+ /* Tasks have exited, don't wait. */
2078 -+ bic = bfqd->active_bic;
2079 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2080 -+ return;
2081 -+
2082 -+ bfq_mark_bfqq_wait_request(bfqq);
2083 -+
2084 -+ /*
2085 -+ * We don't want to idle for seeks, but we do want to allow
2086 -+ * fair distribution of slice time for a process doing back-to-back
2087 -+ * seeks. So allow a little bit of time for it to submit a new rq.
2088 -+ *
2089 -+ * To prevent processes with (partly) seeky workloads from
2090 -+ * being too ill-treated, grant them a small fraction of the
2091 -+ * assigned budget before reducing the waiting time to
2092 -+ * BFQ_MIN_TT. This happened to help reduce latency.
2093 -+ */
2094 -+ sl = bfqd->bfq_slice_idle;
2095 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2096 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2097 -+ bfqq->raising_coeff == 1)
2098 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2099 -+ else if (bfqq->raising_coeff > 1)
2100 -+ sl = sl * 3;
2101 -+ bfqd->last_idling_start = ktime_get();
2102 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2103 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
2104 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2105 -+}
2106 -+
2107 -+/*
2108 -+ * Set the maximum time for the active queue to consume its
2109 -+ * budget. This prevents seeky processes from lowering the disk
2110 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
2111 -+ */
2112 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2113 -+{
2114 -+ struct bfq_queue *bfqq = bfqd->active_queue;
2115 -+ unsigned int timeout_coeff;
2116 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2117 -+ timeout_coeff = 1;
2118 -+ else
2119 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2120 -+
2121 -+ bfqd->last_budget_start = ktime_get();
2122 -+
2123 -+ bfq_clear_bfqq_budget_new(bfqq);
2124 -+ bfqq->budget_timeout = jiffies +
2125 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2126 -+
2127 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2128 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2129 -+ timeout_coeff));
2130 -+}
2131 -+
2132 -+/*
2133 -+ * Move request from internal lists to the request queue dispatch list.
2134 -+ */
2135 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2136 -+{
2137 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2138 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2139 -+
2140 -+ bfq_remove_request(rq);
2141 -+ bfqq->dispatched++;
2142 -+ elv_dispatch_sort(q, rq);
2143 -+
2144 -+ if (bfq_bfqq_sync(bfqq))
2145 -+ bfqd->sync_flight++;
2146 -+}
2147 -+
2148 -+/*
2149 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
2150 -+ */
2151 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2152 -+{
2153 -+ struct request *rq = NULL;
2154 -+
2155 -+ if (bfq_bfqq_fifo_expire(bfqq))
2156 -+ return NULL;
2157 -+
2158 -+ bfq_mark_bfqq_fifo_expire(bfqq);
2159 -+
2160 -+ if (list_empty(&bfqq->fifo))
2161 -+ return NULL;
2162 -+
2163 -+ rq = rq_entry_fifo(bfqq->fifo.next);
2164 -+
2165 -+ if (time_before(jiffies, rq_fifo_time(rq)))
2166 -+ return NULL;
2167 -+
2168 -+ return rq;
2169 -+}
2170 -+
2171 -+/*
2172 -+ * Must be called with the queue_lock held.
2173 -+ */
2174 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
2175 -+{
2176 -+ int process_refs, io_refs;
2177 -+
2178 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2179 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2180 -+ BUG_ON(process_refs < 0);
2181 -+ return process_refs;
2182 -+}
2183 -+
2184 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2185 -+{
2186 -+ int process_refs, new_process_refs;
2187 -+ struct bfq_queue *__bfqq;
2188 -+
2189 -+ /*
2190 -+ * If there are no process references on the new_bfqq, then it is
2191 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2192 -+ * may have dropped their last reference (not just their last process
2193 -+ * reference).
2194 -+ */
2195 -+ if (!bfqq_process_refs(new_bfqq))
2196 -+ return;
2197 -+
2198 -+ /* Avoid a circular list and skip interim queue merges. */
2199 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
2200 -+ if (__bfqq == bfqq)
2201 -+ return;
2202 -+ new_bfqq = __bfqq;
2203 -+ }
2204 -+
2205 -+ process_refs = bfqq_process_refs(bfqq);
2206 -+ new_process_refs = bfqq_process_refs(new_bfqq);
2207 -+ /*
2208 -+ * If the process for the bfqq has gone away, there is no
2209 -+ * sense in merging the queues.
2210 -+ */
2211 -+ if (process_refs == 0 || new_process_refs == 0)
2212 -+ return;
2213 -+
2214 -+ /*
2215 -+ * Merge in the direction of the lesser amount of work.
2216 -+ */
2217 -+ if (new_process_refs >= process_refs) {
2218 -+ bfqq->new_bfqq = new_bfqq;
2219 -+ atomic_add(process_refs, &new_bfqq->ref);
2220 -+ } else {
2221 -+ new_bfqq->new_bfqq = bfqq;
2222 -+ atomic_add(new_process_refs, &bfqq->ref);
2223 -+ }
2224 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2225 -+ new_bfqq->pid);
2226 -+}
2227 -+
2228 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2229 -+{
2230 -+ struct bfq_entity *entity = &bfqq->entity;
2231 -+ return entity->budget - entity->service;
2232 -+}
2233 -+
2234 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2235 -+{
2236 -+ BUG_ON(bfqq != bfqd->active_queue);
2237 -+
2238 -+ __bfq_bfqd_reset_active(bfqd);
2239 -+
2240 -+ /*
2241 -+ * If this bfqq is shared between multiple processes, check
2242 -+ * to make sure that those processes are still issuing I/Os
2243 -+ * within the mean seek distance. If not, it may be time to
2244 -+ * break the queues apart again.
2245 -+ */
2246 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2247 -+ bfq_mark_bfqq_split_coop(bfqq);
2248 -+
2249 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2250 -+ /*
2251 -+ * Overload the budget_timeout field to store the time at
2252 -+ * which the queue was left with no backlog; it is used by
2253 -+ * the weight-raising mechanism.
2254 -+ */
2255 -+ bfqq->budget_timeout = jiffies;
2256 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2257 -+ } else {
2258 -+ bfq_activate_bfqq(bfqd, bfqq);
2259 -+ /*
2260 -+ * Resort priority tree of potential close cooperators.
2261 -+ */
2262 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
2263 -+ }
2264 -+}
2265 -+
2266 -+/**
2267 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2268 -+ * @bfqd: device data.
2269 -+ * @bfqq: queue to update.
2270 -+ * @reason: reason for expiration.
2271 -+ *
2272 -+ * Handle the feedback on @bfqq budget. See the body for detailed
2273 -+ * comments.
2274 -+ */
2275 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2276 -+ struct bfq_queue *bfqq,
2277 -+ enum bfqq_expiration reason)
2278 -+{
2279 -+ struct request *next_rq;
2280 -+ unsigned long budget, min_budget;
2281 -+
2282 -+ budget = bfqq->max_budget;
2283 -+ min_budget = bfq_min_budget(bfqd);
2284 -+
2285 -+ BUG_ON(bfqq != bfqd->active_queue);
2286 -+
2287 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2288 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2289 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2290 -+ budget, bfq_min_budget(bfqd));
2291 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2292 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
2293 -+
2294 -+ if (bfq_bfqq_sync(bfqq)) {
2295 -+ switch (reason) {
2296 -+ /*
2297 -+ * Caveat: in all the following cases we trade latency
2298 -+ * for throughput.
2299 -+ */
2300 -+ case BFQ_BFQQ_TOO_IDLE:
2301 -+ /*
2302 -+ * This is the only case where we may reduce
2303 -+ * the budget: if there are no requests of the
2304 -+ * process still waiting for completion, then
2305 -+ * we assume (tentatively) that the timer has
2306 -+ * expired because the batch of requests of
2307 -+ * the process could have been served with a
2308 -+ * smaller budget. Hence, betting that
2309 -+ * process will behave in the same way when it
2310 -+ * becomes backlogged again, we reduce its
2311 -+ * next budget. As long as we guess right,
2312 -+ * this budget cut reduces the latency
2313 -+ * experienced by the process.
2314 -+ *
2315 -+ * However, if there are still outstanding
2316 -+ * requests, then the process may have not yet
2317 -+ * issued its next request just because it is
2318 -+ * still waiting for the completion of some of
2319 -+ * the still outstanding ones. So in this
2320 -+ * subcase we do not reduce its budget, on the
2321 -+ * contrary we increase it to possibly boost
2322 -+ * the throughput, as discussed in the
2323 -+ * comments to the BUDGET_TIMEOUT case.
2324 -+ */
2325 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
2326 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2327 -+ else {
2328 -+ if (budget > 5 * min_budget)
2329 -+ budget -= 4 * min_budget;
2330 -+ else
2331 -+ budget = min_budget;
2332 -+ }
2333 -+ break;
2334 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
2335 -+ /*
2336 -+ * We double the budget here because: 1) it
2337 -+ * gives the chance to boost the throughput if
2338 -+ * this is not a seeky process (which may have
2339 -+ * bumped into this timeout because of, e.g.,
2340 -+ * ZBR), 2) together with charge_full_budget
2341 -+ * it helps give seeky processes higher
2342 -+ * timestamps, so that they are served less
2343 -+ * frequently.
2344 -+ */
2345 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2346 -+ break;
2347 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2348 -+ /*
2349 -+ * The process still has backlog, and did not
2350 -+ * let either the budget timeout or the disk
2351 -+ * idling timeout expire. Hence it is not
2352 -+ * seeky, has a short thinktime and may be
2353 -+ * happy with a higher budget too. So
2354 -+ * definitely increase the budget of this good
2355 -+ * candidate to boost the disk throughput.
2356 -+ */
2357 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
2358 -+ break;
2359 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
2360 -+ /*
2361 -+ * Leave the budget unchanged.
2362 -+ */
2363 -+ default:
2364 -+ return;
2365 -+ }
2366 -+ } else /* async queue */
2367 -+ /* async queues always get the maximum possible budget
2368 -+ * (their ability to dispatch is limited by
2369 -+ * @bfqd->bfq_max_budget_async_rq).
2370 -+ */
2371 -+ budget = bfqd->bfq_max_budget;
2372 -+
2373 -+ bfqq->max_budget = budget;
2374 -+
2375 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2376 -+ bfqq->max_budget > bfqd->bfq_max_budget)
2377 -+ bfqq->max_budget = bfqd->bfq_max_budget;
2378 -+
2379 -+ /*
2380 -+ * Make sure that we have enough budget for the next request.
2381 -+ * Since the finish time of the bfqq must be kept in sync with
2382 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
2383 -+ * update.
2384 -+ */
2385 -+ next_rq = bfqq->next_rq;
2386 -+ if (next_rq != NULL)
2387 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2388 -+ bfq_serv_to_charge(next_rq, bfqq));
2389 -+ else
2390 -+ bfqq->entity.budget = bfqq->max_budget;
2391 -+
2392 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2393 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2394 -+ bfqq->entity.budget);
2395 -+}
2396 -+
2397 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2398 -+{
2399 -+ unsigned long max_budget;
2400 -+
2401 -+ /*
2402 -+ * The max_budget calculated when autotuning is equal to the
2403 -+ * number of sectors transferred in timeout_sync at the
2404 -+ * estimated peak rate.
2405 -+ */
2406 -+ max_budget = (unsigned long)(peak_rate * 1000 *
2407 -+ timeout >> BFQ_RATE_SHIFT);
2408 -+
2409 -+ return max_budget;
2410 -+}
2411 -+
2412 -+/*
2413 -+ * In addition to updating the peak rate, checks whether the process
2414 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
2415 -+ * to the budget timeout, to reduce the amount of service provided to
2416 -+ * seeky processes, and hence reduce their chances to lower the
2417 -+ * throughput. See the code for more details.
2418 -+ */
2419 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2420 -+ int compensate, enum bfqq_expiration reason)
2421 -+{
2422 -+ u64 bw, usecs, expected, timeout;
2423 -+ ktime_t delta;
2424 -+ int update = 0;
2425 -+
2426 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2427 -+ return 0;
2428 -+
2429 -+ if (compensate)
2430 -+ delta = bfqd->last_idling_start;
2431 -+ else
2432 -+ delta = ktime_get();
2433 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
2434 -+ usecs = ktime_to_us(delta);
2435 -+
2436 -+ /* Don't trust short/unrealistic values. */
2437 -+ if (usecs < 100 || usecs >= LONG_MAX)
2438 -+ return 0;
2439 -+
2440 -+ /*
2441 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
2442 -+ * value to store the peak rate, in sectors per usec in fixed
2443 -+ * point math. We do so to have enough precision in the estimate
2444 -+ * and to avoid overflows.
2445 -+ */
2446 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2447 -+ do_div(bw, (unsigned long)usecs);
2448 -+
2449 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2450 -+
2451 -+ /*
2452 -+ * Use only long (> 20ms) intervals to filter out spikes for
2453 -+ * the peak rate estimation.
2454 -+ */
2455 -+ if (usecs > 20000) {
2456 -+ if (bw > bfqd->peak_rate ||
2457 -+ (!BFQQ_SEEKY(bfqq) &&
2458 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2459 -+ bfq_log(bfqd, "measured bw =%llu", bw);
2460 -+ /*
2461 -+ * To smooth oscillations use a low-pass filter with
2462 -+ * alpha=7/8, i.e.,
2463 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
2464 -+ */
2465 -+ do_div(bw, 8);
2466 -+ if (bw == 0)
2467 -+ return 0;
2468 -+ bfqd->peak_rate *= 7;
2469 -+ do_div(bfqd->peak_rate, 8);
2470 -+ bfqd->peak_rate += bw;
2471 -+ update = 1;
2472 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2473 -+ }
2474 -+
2475 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2476 -+
2477 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2478 -+ bfqd->peak_rate_samples++;
2479 -+
2480 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2481 -+ update && bfqd->bfq_user_max_budget == 0) {
2482 -+ bfqd->bfq_max_budget =
2483 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2484 -+ bfq_log(bfqd, "new max_budget=%lu",
2485 -+ bfqd->bfq_max_budget);
2486 -+ }
2487 -+ }
2488 -+
2489 -+ /*
2490 -+ * If the process has been served for too short a time
2491 -+ * interval to let its possible sequential accesses prevail over
2492 -+ * the initial seek time needed to move the disk head to the
2493 -+ * first sector it requested, then give the process a chance
2494 -+ * and for the moment return false.
2495 -+ */
2496 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2497 -+ return 0;
2498 -+
2499 -+ /*
2500 -+ * A process is considered ``slow'' (i.e., seeky, so that we
2501 -+ * cannot treat it fairly in the service domain, as it would
2502 -+ * slow down the other processes too much) if, when a slice
2503 -+ * ends for whatever reason, it has received service at a
2504 -+ * rate that would not be high enough to complete the budget
2505 -+ * before the budget timeout expiration.
2506 -+ */
2507 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2508 -+
2509 -+ /*
2510 -+ * Caveat: processes doing IO in the slower disk zones will
2511 -+ * tend to be slow(er) even if not seeky. And the estimated
2512 -+ * peak rate will actually be an average over the disk
2513 -+ * surface. Hence, to not be too harsh with unlucky processes,
2514 -+ * we keep a budget/3 margin of safety before declaring a
2515 -+ * process slow.
2516 -+ */
2517 -+ return expected > (4 * bfqq->entity.budget) / 3;
2518 -+}
2519 -+
2520 -+/**
2521 -+ * bfq_bfqq_expire - expire a queue.
2522 -+ * @bfqd: device owning the queue.
2523 -+ * @bfqq: the queue to expire.
2524 -+ * @compensate: if true, compensate for the time spent idling.
2525 -+ * @reason: the reason causing the expiration.
2526 -+ *
2527 -+ *
2528 -+ * If the process associated to the queue is slow (i.e., seeky), or in
2529 -+ * case of budget timeout, or, finally, if it is async, we
2530 -+ * artificially charge it an entire budget (independently of the
2531 -+ * actual service it received). As a consequence, the queue will get
2532 -+ * higher timestamps than the correct ones upon reactivation, and
2533 -+ * hence it will be rescheduled as if it had received more service
2534 -+ * than what it actually received. In the end, this class of processes
2535 -+ * will receive less service in proportion to how slowly they consume
2536 -+ * their budgets (and hence how seriously they tend to lower the
2537 -+ * throughput).
2538 -+ *
2539 -+ * In contrast, when a queue expires because it has been idling for
2540 -+ * too much or because it exhausted its budget, we do not touch the
2541 -+ * amount of service it has received. Hence when the queue will be
2542 -+ * reactivated and its timestamps updated, the latter will be in sync
2543 -+ * with the actual service received by the queue until expiration.
2544 -+ *
2545 -+ * Charging a full budget to the first type of queues and the exact
2546 -+ * service to the others has the effect of using the WF2Q+ policy to
2547 -+ * schedule the former on a timeslice basis, without violating the
2548 -+ * service domain guarantees of the latter.
2549 -+ */
2550 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
2551 -+ struct bfq_queue *bfqq,
2552 -+ int compensate,
2553 -+ enum bfqq_expiration reason)
2554 -+{
2555 -+ int slow;
2556 -+ BUG_ON(bfqq != bfqd->active_queue);
2557 -+
2558 -+ /* Update disk peak rate for autotuning and check whether the
2559 -+ * process is slow (see bfq_update_peak_rate).
2560 -+ */
2561 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2562 -+
2563 -+ /*
2564 -+ * As explained above, 'punish' slow (i.e., seeky), timed-out
2565 -+ * and async queues, to favor sequential sync workloads.
2566 -+ *
2567 -+ * Processes doing IO in the slower disk zones will tend to be
2568 -+ * slow(er) even if not seeky. Hence, since the estimated peak
2569 -+ * rate is actually an average over the disk surface, these
2570 -+ * processes may timeout just for bad luck. To avoid punishing
2571 -+ * them we do not charge a full budget to a process that
2572 -+ * succeeded in consuming at least 2/3 of its budget.
2573 -+ */
2574 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2575 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2576 -+ bfq_bfqq_charge_full_budget(bfqq);
2577 -+
2578 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2579 -+ bfqq->last_rais_start_finish = jiffies;
2580 -+
2581 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2582 -+ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
2583 -+ bfqq->soft_rt_next_start =
2584 -+ jiffies +
2585 -+ HZ * bfqq->entity.service /
2586 -+ bfqd->bfq_raising_max_softrt_rate;
2587 -+ else
2588 -+ bfqq->soft_rt_next_start = -1; /* infinity */
2589 -+ }
2590 -+ bfq_log_bfqq(bfqd, bfqq,
2591 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2592 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2593 -+
2594 -+ /* Increase, decrease or leave budget unchanged according to reason */
2595 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2596 -+ __bfq_bfqq_expire(bfqd, bfqq);
2597 -+}
2598 -+
2599 -+/*
2600 -+ * Budget timeout is not implemented through a dedicated timer, but
2601 -+ * just checked on request arrivals and completions, as well as on
2602 -+ * idle timer expirations.
2603 -+ */
2604 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2605 -+{
2606 -+ if (bfq_bfqq_budget_new(bfqq))
2607 -+ return 0;
2608 -+
2609 -+ if (time_before(jiffies, bfqq->budget_timeout))
2610 -+ return 0;
2611 -+
2612 -+ return 1;
2613 -+}
2614 -+
2615 -+/*
2616 -+ * If we expire a queue that is waiting for the arrival of a new
2617 -+ * request, we may prevent the fictitious timestamp backshifting that
2618 -+ * allows the guarantees of the queue to be preserved (see [1] for
2619 -+ * this tricky aspect). Hence we return true only if this condition
2620 -+ * does not hold, or if the queue is slow enough to deserve only to be
2621 -+ * kicked off for preserving a high throughput.
2622 -+*/
2623 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2624 -+{
2625 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
2626 -+ "may_budget_timeout: wr %d left %d timeout %d",
2627 -+ bfq_bfqq_wait_request(bfqq),
2628 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2629 -+ bfq_bfqq_budget_timeout(bfqq));
2630 -+
2631 -+ return (!bfq_bfqq_wait_request(bfqq) ||
2632 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2633 -+ &&
2634 -+ bfq_bfqq_budget_timeout(bfqq);
2635 -+}
2636 -+
2637 -+/*
2638 -+ * If the active queue is empty, but it is sync and either of the following
2639 -+ * conditions holds, then: 1) the queue must remain active and cannot be
2640 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
2641 -+ * of a new request for the queue. The conditions are:
2642 -+ * - the device is rotational and not performing NCQ, and the queue has its
2643 -+ * idle window set (in this case, waiting for a new request for the queue
2644 -+ * is likely to boost the disk throughput);
2645 -+ * - the queue is weight-raised (waiting for the request is necessary for
2646 -+ * providing the queue with fairness and latency guarantees).
2647 -+ */
2648 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
2649 -+ int budg_timeout)
2650 -+{
2651 -+ struct bfq_data *bfqd = bfqq->bfqd;
2652 -+
2653 -+ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
2654 -+ bfqd->bfq_slice_idle != 0 &&
2655 -+ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
2656 -+ !blk_queue_nonrot(bfqd->queue))
2657 -+ || bfqq->raising_coeff > 1) &&
2658 -+ (bfqd->rq_in_driver == 0 ||
2659 -+ budg_timeout ||
2660 -+ bfqq->raising_coeff > 1) &&
2661 -+ !bfq_close_cooperator(bfqd, bfqq) &&
2662 -+ (!bfq_bfqq_coop(bfqq) ||
2663 -+ !bfq_bfqq_some_coop_idle(bfqq)) &&
2664 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
2665 -+}
2666 -+
2667 -+/*
2668 -+ * Select a queue for service. If we have a current active queue,
2669 -+ * check whether to continue servicing it, or retrieve and set a new one.
2670 -+ */
2671 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2672 -+{
2673 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
2674 -+ struct request *next_rq;
2675 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2676 -+ int budg_timeout;
2677 -+
2678 -+ bfqq = bfqd->active_queue;
2679 -+ if (bfqq == NULL)
2680 -+ goto new_queue;
2681 -+
2682 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
2683 -+
2684 -+ /*
2685 -+ * If another queue has a request waiting within our mean seek
2686 -+ * distance, let it run. The expire code will check for close
2687 -+ * cooperators and put the close queue at the front of the
2688 -+ * service tree. If possible, merge the expiring queue with the
2689 -+ * new bfqq.
2690 -+ */
2691 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
2692 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
2693 -+ bfq_setup_merge(bfqq, new_bfqq);
2694 -+
2695 -+ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
2696 -+ if (budg_timeout &&
2697 -+ !bfq_bfqq_must_idle(bfqq, budg_timeout))
2698 -+ goto expire;
2699 -+
2700 -+ next_rq = bfqq->next_rq;
2701 -+ /*
2702 -+ * If bfqq has requests queued and it has enough budget left to
2703 -+ * serve them, keep the queue, otherwise expire it.
2704 -+ */
2705 -+ if (next_rq != NULL) {
2706 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
2707 -+ bfq_bfqq_budget_left(bfqq)) {
2708 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2709 -+ goto expire;
2710 -+ } else {
2711 -+ /*
2712 -+ * The idle timer may be pending because we may not
2713 -+ * disable disk idling even when a new request arrives
2714 -+ */
2715 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
2716 -+ /*
2717 -+ * If we get here: 1) at least a new request
2718 -+ * has arrived but we have not disabled the
2719 -+ * timer because the request was too small,
2720 -+ * 2) then the block layer has unplugged the
2721 -+ * device, causing the dispatch to be invoked.
2722 -+ *
2723 -+ * Since the device is unplugged, now the
2724 -+ * requests are probably large enough to
2725 -+ * provide a reasonable throughput.
2726 -+ * So we disable idling.
2727 -+ */
2728 -+ bfq_clear_bfqq_wait_request(bfqq);
2729 -+ del_timer(&bfqd->idle_slice_timer);
2730 -+ }
2731 -+ if (new_bfqq == NULL)
2732 -+ goto keep_queue;
2733 -+ else
2734 -+ goto expire;
2735 -+ }
2736 -+ }
2737 -+
2738 -+ /*
2739 -+ * No requests pending. If there is no cooperator, and the active
2740 -+ * queue still has requests in flight or is idling for a new request,
2741 -+ * then keep it.
2742 -+ */
2743 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
2744 -+ (bfqq->dispatched != 0 &&
2745 -+ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
2746 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
2747 -+ bfqq = NULL;
2748 -+ goto keep_queue;
2749 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
2750 -+ /*
2751 -+ * Expiring the queue because there is a close cooperator,
2752 -+ * cancel timer.
2753 -+ */
2754 -+ bfq_clear_bfqq_wait_request(bfqq);
2755 -+ del_timer(&bfqd->idle_slice_timer);
2756 -+ }
2757 -+
2758 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2759 -+expire:
2760 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2761 -+new_queue:
2762 -+ bfqq = bfq_set_active_queue(bfqd, new_bfqq);
2763 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
2764 -+ bfqq != NULL ? bfqq->pid : 0);
2765 -+keep_queue:
2766 -+ return bfqq;
2767 -+}
2768 -+
2769 -+static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2770 -+{
2771 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
2772 -+ struct bfq_entity *entity = &bfqq->entity;
2773 -+
2774 -+ bfq_log_bfqq(bfqd, bfqq,
2775 -+ "raising period dur %u/%u msec, "
2776 -+ "old raising coeff %u, w %d(%d)",
2777 -+ jiffies_to_msecs(jiffies -
2778 -+ bfqq->last_rais_start_finish),
2779 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
2780 -+ bfqq->raising_coeff,
2781 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
2782 -+
2783 -+ BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
2784 -+ entity->orig_weight * bfqq->raising_coeff);
2785 -+ if (entity->ioprio_changed)
2786 -+ bfq_log_bfqq(bfqd, bfqq,
2787 -+ "WARN: pending prio change");
2788 -+ /*
2789 -+ * If too much time has elapsed from the beginning
2790 -+ * of this weight-raising period and the process is not soft
2791 -+ * real-time, stop it.
2792 -+ */
2793 -+ if (jiffies - bfqq->last_rais_start_finish >
2794 -+ bfqq->raising_cur_max_time) {
2795 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
2796 -+ bfqq->soft_rt_next_start < jiffies;
2797 -+
2798 -+ bfqq->last_rais_start_finish = jiffies;
2799 -+ if (soft_rt)
2800 -+ bfqq->raising_cur_max_time =
2801 -+ bfqd->bfq_raising_rt_max_time;
2802 -+ else {
2803 -+ bfq_log_bfqq(bfqd, bfqq,
2804 -+ "wrais ending at %llu msec,"
2805 -+ "rais_max_time %u",
2806 -+ bfqq->last_rais_start_finish,
2807 -+ jiffies_to_msecs(bfqq->
2808 -+ raising_cur_max_time));
2809 -+ bfq_bfqq_end_raising(bfqq);
2810 -+ __bfq_entity_update_weight_prio(
2811 -+ bfq_entity_service_tree(entity),
2812 -+ entity);
2813 -+ }
2814 -+ }
2815 -+ }
2816 -+}
2817 -+
2818 -+/*
2819 -+ * Dispatch one request from bfqq, moving it to the request queue
2820 -+ * dispatch list.
2821 -+ */
2822 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
2823 -+ struct bfq_queue *bfqq)
2824 -+{
2825 -+ int dispatched = 0;
2826 -+ struct request *rq;
2827 -+ unsigned long service_to_charge;
2828 -+
2829 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2830 -+
2831 -+ /* Follow expired path, else get first next available. */
2832 -+ rq = bfq_check_fifo(bfqq);
2833 -+ if (rq == NULL)
2834 -+ rq = bfqq->next_rq;
2835 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
2836 -+
2837 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
2838 -+ /*
2839 -+ * This may happen if the next rq is chosen
2840 -+ * in fifo order instead of sector order.
2841 -+ * The budget is properly dimensioned
2842 -+ * to be always sufficient to serve the next request
2843 -+ * only if it is chosen in sector order. The reason is
2844 -+ * that it would be quite inefficient and of little use
2845 -+ * to always make sure that the budget is large enough
2846 -+ * to serve even the possible next rq in fifo order.
2847 -+ * In fact, requests are seldom served in fifo order.
2848 -+ *
2849 -+ * Expire the queue for budget exhaustion, and
2850 -+ * make sure that the next act_budget is enough
2851 -+ * to serve the next request, even if it comes
2852 -+ * from the fifo expired path.
2853 -+ */
2854 -+ bfqq->next_rq = rq;
2855 -+ /*
2856 -+ * Since this dispatch failed, make sure that
2857 -+ * a new one will be performed.
2858 -+ */
2859 -+ if (!bfqd->rq_in_driver)
2860 -+ bfq_schedule_dispatch(bfqd);
2861 -+ goto expire;
2862 -+ }
2863 -+
2864 -+ /* Finally, insert request into driver dispatch list. */
2865 -+ bfq_bfqq_served(bfqq, service_to_charge);
2866 -+ bfq_dispatch_insert(bfqd->queue, rq);
2867 -+
2868 -+ update_raising_data(bfqd, bfqq);
2869 -+
2870 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
2871 -+ "budg left %lu",
2872 -+ blk_rq_sectors(rq),
2873 -+ (long long unsigned)blk_rq_pos(rq),
2874 -+ bfq_bfqq_budget_left(bfqq));
2875 -+
2876 -+ dispatched++;
2877 -+
2878 -+ if (bfqd->active_bic == NULL) {
2879 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
2880 -+ bfqd->active_bic = RQ_BIC(rq);
2881 -+ }
2882 -+
2883 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
2884 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
2885 -+ bfq_class_idle(bfqq)))
2886 -+ goto expire;
2887 -+
2888 -+ return dispatched;
2889 -+
2890 -+expire:
2891 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
2892 -+ return dispatched;
2893 -+}
2894 -+
2895 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
2896 -+{
2897 -+ int dispatched = 0;
2898 -+
2899 -+ while (bfqq->next_rq != NULL) {
2900 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
2901 -+ dispatched++;
2902 -+ }
2903 -+
2904 -+ BUG_ON(!list_empty(&bfqq->fifo));
2905 -+ return dispatched;
2906 -+}
2907 -+
2908 -+/*
2909 -+ * Drain our current requests. Used for barriers and when switching
2910 -+ * io schedulers on-the-fly.
2911 -+ */
2912 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
2913 -+{
2914 -+ struct bfq_queue *bfqq, *n;
2915 -+ struct bfq_service_tree *st;
2916 -+ int dispatched = 0;
2917 -+
2918 -+ bfqq = bfqd->active_queue;
2919 -+ if (bfqq != NULL)
2920 -+ __bfq_bfqq_expire(bfqd, bfqq);
2921 -+
2922 -+ /*
2923 -+ * Loop through classes, and be careful to leave the scheduler
2924 -+ * in a consistent state, as feedback mechanisms and vtime
2925 -+ * updates cannot be disabled during the process.
2926 -+ */
2927 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
2928 -+ st = bfq_entity_service_tree(&bfqq->entity);
2929 -+
2930 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
2931 -+ bfqq->max_budget = bfq_max_budget(bfqd);
2932 -+
2933 -+ bfq_forget_idle(st);
2934 -+ }
2935 -+
2936 -+ BUG_ON(bfqd->busy_queues != 0);
2937 -+
2938 -+ return dispatched;
2939 -+}
2940 -+
2941 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
2942 -+{
2943 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2944 -+ struct bfq_queue *bfqq;
2945 -+ int max_dispatch;
2946 -+
2947 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
2948 -+ if (bfqd->busy_queues == 0)
2949 -+ return 0;
2950 -+
2951 -+ if (unlikely(force))
2952 -+ return bfq_forced_dispatch(bfqd);
2953 -+
2954 -+ if ((bfqq = bfq_select_queue(bfqd)) == NULL)
2955 -+ return 0;
2956 -+
2957 -+ max_dispatch = bfqd->bfq_quantum;
2958 -+ if (bfq_class_idle(bfqq))
2959 -+ max_dispatch = 1;
2960 -+
2961 -+ if (!bfq_bfqq_sync(bfqq))
2962 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
2963 -+
2964 -+ if (bfqq->dispatched >= max_dispatch) {
2965 -+ if (bfqd->busy_queues > 1)
2966 -+ return 0;
2967 -+ if (bfqq->dispatched >= 4 * max_dispatch)
2968 -+ return 0;
2969 -+ }
2970 -+
2971 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
2972 -+ return 0;
2973 -+
2974 -+ bfq_clear_bfqq_wait_request(bfqq);
2975 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2976 -+
2977 -+ if (!bfq_dispatch_request(bfqd, bfqq))
2978 -+ return 0;
2979 -+
2980 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
2981 -+ "(max_disp %d)", bfqq->pid, max_dispatch);
2982 -+
2983 -+ return 1;
2984 -+}
2985 -+
2986 -+/*
2987 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
2988 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
2989 -+ *
2990 -+ * Queue lock must be held here.
2991 -+ */
2992 -+static void bfq_put_queue(struct bfq_queue *bfqq)
2993 -+{
2994 -+ struct bfq_data *bfqd = bfqq->bfqd;
2995 -+
2996 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
2997 -+
2998 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
2999 -+ atomic_read(&bfqq->ref));
3000 -+ if (!atomic_dec_and_test(&bfqq->ref))
3001 -+ return;
3002 -+
3003 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3004 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3005 -+ BUG_ON(bfqq->entity.tree != NULL);
3006 -+ BUG_ON(bfq_bfqq_busy(bfqq));
3007 -+ BUG_ON(bfqd->active_queue == bfqq);
3008 -+
3009 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3010 -+
3011 -+ kmem_cache_free(bfq_pool, bfqq);
3012 -+}
3013 -+
3014 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
3015 -+{
3016 -+ struct bfq_queue *__bfqq, *next;
3017 -+
3018 -+ /*
3019 -+ * If this queue was scheduled to merge with another queue, be
3020 -+ * sure to drop the reference taken on that queue (and others in
3021 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3022 -+ */
3023 -+ __bfqq = bfqq->new_bfqq;
3024 -+ while (__bfqq) {
3025 -+ if (__bfqq == bfqq) {
3026 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
3027 -+ break;
3028 -+ }
3029 -+ next = __bfqq->new_bfqq;
3030 -+ bfq_put_queue(__bfqq);
3031 -+ __bfqq = next;
3032 -+ }
3033 -+}
3034 -+
3035 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3036 -+{
3037 -+ if (bfqq == bfqd->active_queue) {
3038 -+ __bfq_bfqq_expire(bfqd, bfqq);
3039 -+ bfq_schedule_dispatch(bfqd);
3040 -+ }
3041 -+
3042 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3043 -+ atomic_read(&bfqq->ref));
3044 -+
3045 -+ bfq_put_cooperator(bfqq);
3046 -+
3047 -+ bfq_put_queue(bfqq);
3048 -+}
3049 -+
3050 -+static void bfq_init_icq(struct io_cq *icq)
3051 -+{
3052 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3053 -+
3054 -+ bic->ttime.last_end_request = jiffies;
3055 -+}
3056 -+
3057 -+static void bfq_exit_icq(struct io_cq *icq)
3058 -+{
3059 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3060 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
3061 -+
3062 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
3063 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3064 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
3065 -+ }
3066 -+
3067 -+ if (bic->bfqq[BLK_RW_SYNC]) {
3068 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3069 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
3070 -+ }
3071 -+}
3072 -+
3073 -+/*
3074 -+ * Update the entity prio values; note that the new values will not
3075 -+ * be used until the next (re)activation.
3076 -+ */
3077 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3078 -+{
3079 -+ struct task_struct *tsk = current;
3080 -+ int ioprio_class;
3081 -+
3082 -+ if (!bfq_bfqq_prio_changed(bfqq))
3083 -+ return;
3084 -+
3085 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3086 -+ switch (ioprio_class) {
3087 -+ default:
3088 -+ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
3089 -+ case IOPRIO_CLASS_NONE:
3090 -+ /*
3091 -+ * No prio set, inherit CPU scheduling settings.
3092 -+ */
3093 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3094 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3095 -+ break;
3096 -+ case IOPRIO_CLASS_RT:
3097 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3098 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3099 -+ break;
3100 -+ case IOPRIO_CLASS_BE:
3101 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3102 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3103 -+ break;
3104 -+ case IOPRIO_CLASS_IDLE:
3105 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3106 -+ bfqq->entity.new_ioprio = 7;
3107 -+ bfq_clear_bfqq_idle_window(bfqq);
3108 -+ break;
3109 -+ }
3110 -+
3111 -+ bfqq->entity.ioprio_changed = 1;
3112 -+
3113 -+ /*
3114 -+ * Keep track of original prio settings in case we have to temporarily
3115 -+ * elevate the priority of this queue.
3116 -+ */
3117 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3118 -+ bfq_clear_bfqq_prio_changed(bfqq);
3119 -+}
3120 -+
3121 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3122 -+{
3123 -+ struct bfq_data *bfqd;
3124 -+ struct bfq_queue *bfqq, *new_bfqq;
3125 -+ struct bfq_group *bfqg;
3126 -+ unsigned long uninitialized_var(flags);
3127 -+ int ioprio = bic->icq.ioc->ioprio;
3128 -+
3129 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
3130 -+ /*
3131 -+ * This condition may trigger on a newly created bic; be sure to drop the
3132 -+ * lock before returning.
3133 -+ */
3134 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3135 -+ goto out;
3136 -+
3137 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
3138 -+ if (bfqq != NULL) {
3139 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3140 -+ sched_data);
3141 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3142 -+ GFP_ATOMIC);
3143 -+ if (new_bfqq != NULL) {
3144 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3145 -+ bfq_log_bfqq(bfqd, bfqq,
3146 -+ "changed_ioprio: bfqq %p %d",
3147 -+ bfqq, atomic_read(&bfqq->ref));
3148 -+ bfq_put_queue(bfqq);
3149 -+ }
3150 -+ }
3151 -+
3152 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
3153 -+ if (bfqq != NULL)
3154 -+ bfq_mark_bfqq_prio_changed(bfqq);
3155 -+
3156 -+ bic->ioprio = ioprio;
3157 -+
3158 -+out:
3159 -+ bfq_put_bfqd_unlock(bfqd, &flags);
3160 -+}
3161 -+
3162 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3163 -+ pid_t pid, int is_sync)
3164 -+{
3165 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3166 -+ INIT_LIST_HEAD(&bfqq->fifo);
3167 -+
3168 -+ atomic_set(&bfqq->ref, 0);
3169 -+ bfqq->bfqd = bfqd;
3170 -+
3171 -+ bfq_mark_bfqq_prio_changed(bfqq);
3172 -+
3173 -+ if (is_sync) {
3174 -+ if (!bfq_class_idle(bfqq))
3175 -+ bfq_mark_bfqq_idle_window(bfqq);
3176 -+ bfq_mark_bfqq_sync(bfqq);
3177 -+ }
3178 -+
3179 -+ /* Tentative initial value to trade off between thr and lat */
3180 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3181 -+ bfqq->pid = pid;
3182 -+
3183 -+ bfqq->raising_coeff = 1;
3184 -+ bfqq->last_rais_start_finish = 0;
3185 -+ bfqq->soft_rt_next_start = -1;
3186 -+}
3187 -+
3188 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3189 -+ struct bfq_group *bfqg,
3190 -+ int is_sync,
3191 -+ struct bfq_io_cq *bic,
3192 -+ gfp_t gfp_mask)
3193 -+{
3194 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3195 -+
3196 -+retry:
3197 -+ /* bic always exists here */
3198 -+ bfqq = bic_to_bfqq(bic, is_sync);
3199 -+
3200 -+ /*
3201 -+ * Always try a new alloc if we fall back to the OOM bfqq
3202 -+ * originally, since it should just be a temporary situation.
3203 -+ */
3204 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3205 -+ bfqq = NULL;
3206 -+ if (new_bfqq != NULL) {
3207 -+ bfqq = new_bfqq;
3208 -+ new_bfqq = NULL;
3209 -+ } else if (gfp_mask & __GFP_WAIT) {
3210 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3211 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3212 -+ gfp_mask | __GFP_ZERO,
3213 -+ bfqd->queue->node);
3214 -+ spin_lock_irq(bfqd->queue->queue_lock);
3215 -+ if (new_bfqq != NULL)
3216 -+ goto retry;
3217 -+ } else {
3218 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
3219 -+ gfp_mask | __GFP_ZERO,
3220 -+ bfqd->queue->node);
3221 -+ }
3222 -+
3223 -+ if (bfqq != NULL) {
3224 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3225 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
3226 -+ } else {
3227 -+ bfqq = &bfqd->oom_bfqq;
3228 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3229 -+ }
3230 -+
3231 -+ bfq_init_prio_data(bfqq, bic);
3232 -+ bfq_init_entity(&bfqq->entity, bfqg);
3233 -+ }
3234 -+
3235 -+ if (new_bfqq != NULL)
3236 -+ kmem_cache_free(bfq_pool, new_bfqq);
3237 -+
3238 -+ return bfqq;
3239 -+}
3240 -+
3241 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3242 -+ struct bfq_group *bfqg,
3243 -+ int ioprio_class, int ioprio)
3244 -+{
3245 -+ switch (ioprio_class) {
3246 -+ case IOPRIO_CLASS_RT:
3247 -+ return &bfqg->async_bfqq[0][ioprio];
3248 -+ case IOPRIO_CLASS_NONE:
3249 -+ ioprio = IOPRIO_NORM;
3250 -+ /* fall through */
3251 -+ case IOPRIO_CLASS_BE:
3252 -+ return &bfqg->async_bfqq[1][ioprio];
3253 -+ case IOPRIO_CLASS_IDLE:
3254 -+ return &bfqg->async_idle_bfqq;
3255 -+ default:
3256 -+ BUG();
3257 -+ }
3258 -+}
3259 -+
3260 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3261 -+ struct bfq_group *bfqg, int is_sync,
3262 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
3263 -+{
3264 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3265 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3266 -+ struct bfq_queue **async_bfqq = NULL;
3267 -+ struct bfq_queue *bfqq = NULL;
3268 -+
3269 -+ if (!is_sync) {
3270 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3271 -+ ioprio);
3272 -+ bfqq = *async_bfqq;
3273 -+ }
3274 -+
3275 -+ if (bfqq == NULL)
3276 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3277 -+
3278 -+ /*
3279 -+ * Pin the queue now that it's allocated; scheduler exit will prune it.
3280 -+ */
3281 -+ if (!is_sync && *async_bfqq == NULL) {
3282 -+ atomic_inc(&bfqq->ref);
3283 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3284 -+ bfqq, atomic_read(&bfqq->ref));
3285 -+ *async_bfqq = bfqq;
3286 -+ }
3287 -+
3288 -+ atomic_inc(&bfqq->ref);
3289 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3290 -+ atomic_read(&bfqq->ref));
3291 -+ return bfqq;
3292 -+}
3293 -+
3294 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3295 -+ struct bfq_io_cq *bic)
3296 -+{
3297 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3298 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3299 -+
3300 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3301 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3302 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
3303 -+}
3304 -+
3305 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
3306 -+ struct bfq_queue *bfqq,
3307 -+ struct request *rq)
3308 -+{
3309 -+ sector_t sdist;
3310 -+ u64 total;
3311 -+
3312 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
3313 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3314 -+ else
3315 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3316 -+
3317 -+ /*
3318 -+ * Don't allow the seek distance to get too large from the
3319 -+ * odd fragment, pagein, etc.
3320 -+ */
3321 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3322 -+ sdist = 0;
3323 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
3324 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3325 -+ else
3326 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3327 -+
3328 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3329 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3330 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
3331 -+ do_div(total, bfqq->seek_samples);
3332 -+ if (bfq_bfqq_coop(bfqq)) {
3333 -+ /*
3334 -+ * If the mean seektime increases for a (non-seeky) shared
3335 -+ * queue, some cooperator is likely to be idling too much.
3336 -+ * Conversely, if it decreases, some cooperator has
3337 -+ * probably woken up.
3338 -+ *
3339 -+ */
3340 -+ if ((sector_t)total < bfqq->seek_mean)
3341 -+ bfq_mark_bfqq_some_coop_idle(bfqq);
3342 -+ else if ((sector_t)total > bfqq->seek_mean)
3343 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
3344 -+ }
3345 -+ bfqq->seek_mean = (sector_t)total;
3346 -+
3347 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3348 -+ (u64)bfqq->seek_mean);
3349 -+}
3350 -+
3351 -+/*
3352 -+ * Disable idle window if the process thinks too long or seeks so much that
3353 -+ * it doesn't matter.
3354 -+ */
3355 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
3356 -+ struct bfq_queue *bfqq,
3357 -+ struct bfq_io_cq *bic)
3358 -+{
3359 -+ int enable_idle;
3360 -+
3361 -+ /* Don't idle for async or idle io prio class. */
3362 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3363 -+ return;
3364 -+
3365 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
3366 -+
3367 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3368 -+ bfqd->bfq_slice_idle == 0 ||
3369 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3370 -+ bfqq->raising_coeff == 1))
3371 -+ enable_idle = 0;
3372 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3373 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3374 -+ bfqq->raising_coeff == 1)
3375 -+ enable_idle = 0;
3376 -+ else
3377 -+ enable_idle = 1;
3378 -+ }
3379 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3380 -+ enable_idle);
3381 -+
3382 -+ if (enable_idle)
3383 -+ bfq_mark_bfqq_idle_window(bfqq);
3384 -+ else
3385 -+ bfq_clear_bfqq_idle_window(bfqq);
3386 -+}
3387 -+
3388 -+/*
3389 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
3390 -+ * something we should do about it.
3391 -+ */
3392 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3393 -+ struct request *rq)
3394 -+{
3395 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
3396 -+
3397 -+ if (rq->cmd_flags & REQ_META)
3398 -+ bfqq->meta_pending++;
3399 -+
3400 -+ bfq_update_io_thinktime(bfqd, bic);
3401 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
3402 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3403 -+ !BFQQ_SEEKY(bfqq))
3404 -+ bfq_update_idle_window(bfqd, bfqq, bic);
3405 -+
3406 -+ bfq_log_bfqq(bfqd, bfqq,
3407 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3408 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3409 -+ (long long unsigned)bfqq->seek_mean);
3410 -+
3411 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3412 -+
3413 -+ if (bfqq == bfqd->active_queue) {
3414 -+ /*
3415 -+ * If there is just this request queued and the request
3416 -+ * is small, just exit.
3417 -+ * In this way, if the disk is being idled to wait for a new
3418 -+ * request from the active queue, we avoid unplugging the
3419 -+ * device now.
3420 -+ *
3421 -+ * By doing so, we avoid committing the disk to serving
3422 -+ * just a small request. Instead, we wait for
3423 -+ * the block layer to decide when to unplug the device:
3424 -+ * hopefully, new requests will be merged to this
3425 -+ * one quickly, then the device will be unplugged
3426 -+ * and larger requests will be dispatched.
3427 -+ */
3428 -+ if (bfqq->queued[rq_is_sync(rq)] == 1 &&
3429 -+ blk_rq_sectors(rq) < 32) {
3430 -+ return;
3431 -+ }
3432 -+ if (bfq_bfqq_wait_request(bfqq)) {
3433 -+ /*
3434 -+ * If we are waiting for a request for this queue, let
3435 -+ * it rip immediately and flag that we must not expire
3436 -+ * this queue just now.
3437 -+ */
3438 -+ bfq_clear_bfqq_wait_request(bfqq);
3439 -+ del_timer(&bfqd->idle_slice_timer);
3440 -+ /*
3441 -+ * Here we can safely expire the queue, in
3442 -+ * case of budget timeout, without wasting
3443 -+ * guarantees
3444 -+ */
3445 -+ if (bfq_bfqq_budget_timeout(bfqq))
3446 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
3447 -+ BFQ_BFQQ_BUDGET_TIMEOUT);
3448 -+ __blk_run_queue(bfqd->queue);
3449 -+ }
3450 -+ }
3451 -+}
3452 -+
3453 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
3454 -+{
3455 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3456 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3457 -+
3458 -+ assert_spin_locked(bfqd->queue->queue_lock);
3459 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3460 -+
3461 -+ bfq_add_rq_rb(rq);
3462 -+
3463 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3464 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
3465 -+
3466 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
3467 -+}
3468 -+
3469 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
3470 -+{
3471 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3472 -+ bfqd->rq_in_driver);
3473 -+
3474 -+ if (bfqd->hw_tag == 1)
3475 -+ return;
3476 -+
3477 -+ /*
3478 -+ * This sample is valid if the number of outstanding requests
3479 -+ * is large enough to allow a queueing behavior. Note that the
3480 -+ * sum is not exact, as it's not taking into account deactivated
3481 -+ * requests.
3482 -+ */
3483 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3484 -+ return;
3485 -+
3486 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3487 -+ return;
3488 -+
3489 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3490 -+ bfqd->max_rq_in_driver = 0;
3491 -+ bfqd->hw_tag_samples = 0;
3492 -+}
3493 -+
3494 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
3495 -+{
3496 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3497 -+ struct bfq_data *bfqd = bfqq->bfqd;
3498 -+ const int sync = rq_is_sync(rq);
3499 -+
3500 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3501 -+ blk_rq_sectors(rq), sync);
3502 -+
3503 -+ bfq_update_hw_tag(bfqd);
3504 -+
3505 -+ WARN_ON(!bfqd->rq_in_driver);
3506 -+ WARN_ON(!bfqq->dispatched);
3507 -+ bfqd->rq_in_driver--;
3508 -+ bfqq->dispatched--;
3509 -+
3510 -+ if (bfq_bfqq_sync(bfqq))
3511 -+ bfqd->sync_flight--;
3512 -+
3513 -+ if (sync)
3514 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3515 -+
3516 -+ /*
3517 -+ * If this is the active queue, check if it needs to be expired,
3518 -+ * or if we want to idle in case it has no pending requests.
3519 -+ */
3520 -+ if (bfqd->active_queue == bfqq) {
3521 -+ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
3522 -+ if (bfq_bfqq_budget_new(bfqq))
3523 -+ bfq_set_budget_timeout(bfqd);
3524 -+
3525 -+ /* Idling is also disabled for cooperation issues:
3526 -+ * 1) there is a close cooperator for the queue, or
3527 -+ * 2) the queue is shared and some cooperator is likely
3528 -+ * to be idle (in this case, by not arming the idle timer,
3529 -+ * we try to slow down the queue, to prevent the zones
3530 -+ * of the disk accessed by the active cooperators from becoming
3531 -+ * too distant from the zone that will be accessed by the
3532 -+ * currently idle cooperators)
3533 -+ */
3534 -+ if (bfq_bfqq_must_idle(bfqq, budg_timeout))
3535 -+ bfq_arm_slice_timer(bfqd);
3536 -+ else if (budg_timeout)
3537 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3538 -+ }
3539 -+
3540 -+ if (!bfqd->rq_in_driver)
3541 -+ bfq_schedule_dispatch(bfqd);
3542 -+}
3543 -+
3544 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3545 -+{
3546 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3547 -+ bfq_clear_bfqq_must_alloc(bfqq);
3548 -+ return ELV_MQUEUE_MUST;
3549 -+ }
3550 -+
3551 -+ return ELV_MQUEUE_MAY;
3552 -+}
3553 -+
3554 -+static int bfq_may_queue(struct request_queue *q, int rw)
3555 -+{
3556 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3557 -+ struct task_struct *tsk = current;
3558 -+ struct bfq_io_cq *bic;
3559 -+ struct bfq_queue *bfqq;
3560 -+
3561 -+ /*
3562 -+ * Don't force setup of a queue from here, as a call to may_queue
3563 -+ * does not necessarily imply that a request actually will be queued.
3564 -+ * So just lookup a possibly existing queue, or return 'may queue'
3565 -+ * if that fails.
3566 -+ */
3567 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3568 -+ if (bic == NULL)
3569 -+ return ELV_MQUEUE_MAY;
3570 -+
3571 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3572 -+ if (bfqq != NULL) {
3573 -+ bfq_init_prio_data(bfqq, bic);
3574 -+
3575 -+ return __bfq_may_queue(bfqq);
3576 -+ }
3577 -+
3578 -+ return ELV_MQUEUE_MAY;
3579 -+}
3580 -+
3581 -+/*
3582 -+ * Queue lock held here.
3583 -+ */
3584 -+static void bfq_put_request(struct request *rq)
3585 -+{
3586 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3587 -+
3588 -+ if (bfqq != NULL) {
3589 -+ const int rw = rq_data_dir(rq);
3590 -+
3591 -+ BUG_ON(!bfqq->allocated[rw]);
3592 -+ bfqq->allocated[rw]--;
3593 -+
3594 -+ rq->elv.priv[0] = NULL;
3595 -+ rq->elv.priv[1] = NULL;
3596 -+
3597 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3598 -+ bfqq, atomic_read(&bfqq->ref));
3599 -+ bfq_put_queue(bfqq);
3600 -+ }
3601 -+}
3602 -+
3603 -+static struct bfq_queue *
3604 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3605 -+ struct bfq_queue *bfqq)
3606 -+{
3607 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3608 -+ (long unsigned)bfqq->new_bfqq->pid);
3609 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3610 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3611 -+ bfq_put_queue(bfqq);
3612 -+ return bic_to_bfqq(bic, 1);
3613 -+}
3614 -+
3615 -+/*
3616 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3617 -+ * was the last process referring to said bfqq.
3618 -+ */
3619 -+static struct bfq_queue *
3620 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3621 -+{
3622 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3623 -+ if (bfqq_process_refs(bfqq) == 1) {
3624 -+ bfqq->pid = current->pid;
3625 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
3626 -+ bfq_clear_bfqq_coop(bfqq);
3627 -+ bfq_clear_bfqq_split_coop(bfqq);
3628 -+ return bfqq;
3629 -+ }
3630 -+
3631 -+ bic_set_bfqq(bic, NULL, 1);
3632 -+
3633 -+ bfq_put_cooperator(bfqq);
3634 -+
3635 -+ bfq_put_queue(bfqq);
3636 -+ return NULL;
3637 -+}
3638 -+
3639 -+/*
3640 -+ * Allocate bfq data structures associated with this request.
3641 -+ */
3642 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
3643 -+ struct bio *bio, gfp_t gfp_mask)
3644 -+{
3645 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3646 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3647 -+ const int rw = rq_data_dir(rq);
3648 -+ const int is_sync = rq_is_sync(rq);
3649 -+ struct bfq_queue *bfqq;
3650 -+ struct bfq_group *bfqg;
3651 -+ unsigned long flags;
3652 -+
3653 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
3654 -+
3655 -+ bfq_changed_ioprio(bic);
3656 -+
3657 -+ spin_lock_irqsave(q->queue_lock, flags);
3658 -+
3659 -+ if (bic == NULL)
3660 -+ goto queue_fail;
3661 -+
3662 -+ bfqg = bfq_bic_update_cgroup(bic);
3663 -+
3664 -+new_queue:
3665 -+ bfqq = bic_to_bfqq(bic, is_sync);
3666 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3667 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3668 -+ bic_set_bfqq(bic, bfqq, is_sync);
3669 -+ } else {
3670 -+ /*
3671 -+ * If the queue was seeky for too long, break it apart.
3672 -+ */
3673 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
3674 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
3675 -+ bfqq = bfq_split_bfqq(bic, bfqq);
3676 -+ if (!bfqq)
3677 -+ goto new_queue;
3678 -+ }
3679 -+
3680 -+ /*
3681 -+ * Check to see if this queue is scheduled to merge with
3682 -+ * another closely cooperating queue. The merging of queues
3683 -+ * happens here as it must be done in process context.
3684 -+ * The reference on new_bfqq was taken in merge_bfqqs.
3685 -+ */
3686 -+ if (bfqq->new_bfqq != NULL)
3687 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
3688 -+ }
3689 -+
3690 -+ bfqq->allocated[rw]++;
3691 -+ atomic_inc(&bfqq->ref);
3692 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
3693 -+ atomic_read(&bfqq->ref));
3694 -+
3695 -+ rq->elv.priv[0] = bic;
3696 -+ rq->elv.priv[1] = bfqq;
3697 -+
3698 -+ spin_unlock_irqrestore(q->queue_lock, flags);
3699 -+
3700 -+ return 0;
3701 -+
3702 -+queue_fail:
3703 -+ bfq_schedule_dispatch(bfqd);
3704 -+ spin_unlock_irqrestore(q->queue_lock, flags);
3705 -+
3706 -+ return 1;
3707 -+}
3708 -+
3709 -+static void bfq_kick_queue(struct work_struct *work)
3710 -+{
3711 -+ struct bfq_data *bfqd =
3712 -+ container_of(work, struct bfq_data, unplug_work);
3713 -+ struct request_queue *q = bfqd->queue;
3714 -+
3715 -+ spin_lock_irq(q->queue_lock);
3716 -+ __blk_run_queue(q);
3717 -+ spin_unlock_irq(q->queue_lock);
3718 -+}
3719 -+
3720 -+/*
3721 -+ * Handler of the expiration of the timer running if the active_queue
3722 -+ * is idling inside its time slice.
3723 -+ */
3724 -+static void bfq_idle_slice_timer(unsigned long data)
3725 -+{
3726 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
3727 -+ struct bfq_queue *bfqq;
3728 -+ unsigned long flags;
3729 -+ enum bfqq_expiration reason;
3730 -+
3731 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
3732 -+
3733 -+ bfqq = bfqd->active_queue;
3734 -+ /*
3735 -+ * Theoretical race here: active_queue can be NULL or different
3736 -+ * from the queue that was idling if the timer handler spins on
3737 -+ * the queue_lock and a new request arrives for the current
3738 -+ * queue and there is a full dispatch cycle that changes the
3739 -+ * active_queue. This hardly ever happens, but in the worst case
3740 -+ * we just expire a queue too early.
3741 -+ */
3742 -+ if (bfqq != NULL) {
3743 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
3744 -+ if (bfq_bfqq_budget_timeout(bfqq))
3745 -+ /*
3746 -+ * Also here the queue can be safely expired
3747 -+ * for budget timeout without wasting
3748 -+ * guarantees
3749 -+ */
3750 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3751 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3752 -+ /*
3753 -+ * The queue may not be empty upon timer expiration,
3754 -+ * because we may not disable the timer when the first
3755 -+ * request of the active queue arrives during
3756 -+ * disk idling
3757 -+ */
3758 -+ reason = BFQ_BFQQ_TOO_IDLE;
3759 -+ else
3760 -+ goto schedule_dispatch;
3761 -+
3762 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
3763 -+ }
3764 -+
3765 -+schedule_dispatch:
3766 -+ bfq_schedule_dispatch(bfqd);
3767 -+
3768 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
3769 -+}
3770 -+
3771 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
3772 -+{
3773 -+ del_timer_sync(&bfqd->idle_slice_timer);
3774 -+ cancel_work_sync(&bfqd->unplug_work);
3775 -+}
3776 -+
3777 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3778 -+ struct bfq_queue **bfqq_ptr)
3779 -+{
3780 -+ struct bfq_group *root_group = bfqd->root_group;
3781 -+ struct bfq_queue *bfqq = *bfqq_ptr;
3782 -+
3783 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3784 -+ if (bfqq != NULL) {
3785 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
3786 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3787 -+ bfqq, atomic_read(&bfqq->ref));
3788 -+ bfq_put_queue(bfqq);
3789 -+ *bfqq_ptr = NULL;
3790 -+ }
3791 -+}
3792 -+
3793 -+/*
3794 -+ * Release all the bfqg references to its async queues. If we are
3795 -+ * deallocating the group these queues may still contain requests, so
3796 -+ * we reparent them to the root cgroup (i.e., the only one that will
3797 -+ * exist for sure until all the requests on a device are gone).
3798 -+ */
3799 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3800 -+{
3801 -+ int i, j;
3802 -+
3803 -+ for (i = 0; i < 2; i++)
3804 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
3805 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3806 -+
3807 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3808 -+}
3809 -+
3810 -+static void bfq_exit_queue(struct elevator_queue *e)
3811 -+{
3812 -+ struct bfq_data *bfqd = e->elevator_data;
3813 -+ struct request_queue *q = bfqd->queue;
3814 -+ struct bfq_queue *bfqq, *n;
3815 -+
3816 -+ bfq_shutdown_timer_wq(bfqd);
3817 -+
3818 -+ spin_lock_irq(q->queue_lock);
3819 -+
3820 -+ BUG_ON(bfqd->active_queue != NULL);
3821 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3822 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
3823 -+
3824 -+ bfq_disconnect_groups(bfqd);
3825 -+ spin_unlock_irq(q->queue_lock);
3826 -+
3827 -+ bfq_shutdown_timer_wq(bfqd);
3828 -+
3829 -+ synchronize_rcu();
3830 -+
3831 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3832 -+
3833 -+ bfq_free_root_group(bfqd);
3834 -+ kfree(bfqd);
3835 -+}
3836 -+
3837 -+static int bfq_init_queue(struct request_queue *q)
3838 -+{
3839 -+ struct bfq_group *bfqg;
3840 -+ struct bfq_data *bfqd;
3841 -+
3842 -+ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3843 -+ if (bfqd == NULL)
3844 -+ return -ENOMEM;
3845 -+
3846 -+ /*
3847 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
3848 -+ * Grab a permanent reference to it, so that the normal code flow
3849 -+ * will not attempt to free it.
3850 -+ */
3851 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
3852 -+ atomic_inc(&bfqd->oom_bfqq.ref);
3853 -+
3854 -+ bfqd->queue = q;
3855 -+ q->elevator->elevator_data = bfqd;
3856 -+
3857 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
3858 -+ if (bfqg == NULL) {
3859 -+ kfree(bfqd);
3860 -+ return -ENOMEM;
3861 -+ }
3862 -+
3863 -+ bfqd->root_group = bfqg;
3864 -+
3865 -+ init_timer(&bfqd->idle_slice_timer);
3866 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
3867 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
3868 -+
3869 -+ bfqd->rq_pos_tree = RB_ROOT;
3870 -+
3871 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
3872 -+
3873 -+ INIT_LIST_HEAD(&bfqd->active_list);
3874 -+ INIT_LIST_HEAD(&bfqd->idle_list);
3875 -+
3876 -+ bfqd->hw_tag = -1;
3877 -+
3878 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
3879 -+
3880 -+ bfqd->bfq_quantum = bfq_quantum;
3881 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
3882 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
3883 -+ bfqd->bfq_back_max = bfq_back_max;
3884 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
3885 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
3886 -+ bfqd->bfq_class_idle_last_service = 0;
3887 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
3888 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
3889 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
3890 -+
3891 -+ bfqd->low_latency = true;
3892 -+
3893 -+ bfqd->bfq_raising_coeff = 20;
3894 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
3895 -+ bfqd->bfq_raising_max_time = 0;
3896 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
3897 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
3898 -+ bfqd->bfq_raising_max_softrt_rate = 7000;
3899 -+
3900 -+ /* Initially estimate the device's peak rate as the reference rate */
3901 -+ if (blk_queue_nonrot(bfqd->queue)) {
3902 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
3903 -+ bfqd->peak_rate = R_nonrot;
3904 -+ } else {
3905 -+ bfqd->RT_prod = R_rot * T_rot;
3906 -+ bfqd->peak_rate = R_rot;
3907 -+ }
3908 -+
3909 -+ return 0;
3910 -+}
3911 -+
3912 -+static void bfq_slab_kill(void)
3913 -+{
3914 -+ if (bfq_pool != NULL)
3915 -+ kmem_cache_destroy(bfq_pool);
3916 -+}
3917 -+
3918 -+static int __init bfq_slab_setup(void)
3919 -+{
3920 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
3921 -+ if (bfq_pool == NULL)
3922 -+ return -ENOMEM;
3923 -+ return 0;
3924 -+}
3925 -+
3926 -+static ssize_t bfq_var_show(unsigned int var, char *page)
3927 -+{
3928 -+ return sprintf(page, "%d\n", var);
3929 -+}
3930 -+
3931 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
3932 -+{
3933 -+ unsigned long new_val;
3934 -+ int ret = strict_strtoul(page, 10, &new_val);
3935 -+
3936 -+ if (ret == 0)
3937 -+ *var = new_val;
3938 -+
3939 -+ return count;
3940 -+}
3941 -+
3942 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
3943 -+{
3944 -+ struct bfq_data *bfqd = e->elevator_data;
3945 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
3946 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
3947 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
3948 -+}
3949 -+
3950 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
3951 -+{
3952 -+ struct bfq_queue *bfqq;
3953 -+ struct bfq_data *bfqd = e->elevator_data;
3954 -+ ssize_t num_char = 0;
3955 -+
3956 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
3957 -+ bfqd->queued);
3958 -+
3959 -+ spin_lock_irq(bfqd->queue->queue_lock);
3960 -+
3961 -+ num_char += sprintf(page + num_char, "Active:\n");
3962 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
3963 -+ num_char += sprintf(page + num_char,
3964 -+ "pid%d: weight %hu, nr_queued %d %d,"
3965 -+ " dur %d/%u\n",
3966 -+ bfqq->pid,
3967 -+ bfqq->entity.weight,
3968 -+ bfqq->queued[0],
3969 -+ bfqq->queued[1],
3970 -+ jiffies_to_msecs(jiffies -
3971 -+ bfqq->last_rais_start_finish),
3972 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
3973 -+ }
3974 -+
3975 -+ num_char += sprintf(page + num_char, "Idle:\n");
3976 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
3977 -+ num_char += sprintf(page + num_char,
3978 -+ "pid%d: weight %hu, dur %d/%u\n",
3979 -+ bfqq->pid,
3980 -+ bfqq->entity.weight,
3981 -+ jiffies_to_msecs(jiffies -
3982 -+ bfqq->last_rais_start_finish),
3983 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
3984 -+ }
3985 -+
3986 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3987 -+
3988 -+ return num_char;
3989 -+}
3990 -+
3991 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
3992 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
3993 -+{ \
3994 -+ struct bfq_data *bfqd = e->elevator_data; \
3995 -+ unsigned int __data = __VAR; \
3996 -+ if (__CONV) \
3997 -+ __data = jiffies_to_msecs(__data); \
3998 -+ return bfq_var_show(__data, (page)); \
3999 -+}
4000 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4001 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4002 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4003 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4004 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4005 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4006 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4007 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4008 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4009 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4010 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4011 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4012 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4013 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4014 -+ 1);
4015 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4016 -+ bfqd->bfq_raising_min_inter_arr_async,
4017 -+ 1);
4018 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4019 -+ bfqd->bfq_raising_max_softrt_rate, 0);
4020 -+#undef SHOW_FUNCTION
4021 -+
4022 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4023 -+static ssize_t \
4024 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4025 -+{ \
4026 -+ struct bfq_data *bfqd = e->elevator_data; \
4027 -+ unsigned long uninitialized_var(__data); \
4028 -+ int ret = bfq_var_store(&__data, (page), count); \
4029 -+ if (__data < (MIN)) \
4030 -+ __data = (MIN); \
4031 -+ else if (__data > (MAX)) \
4032 -+ __data = (MAX); \
4033 -+ if (__CONV) \
4034 -+ *(__PTR) = msecs_to_jiffies(__data); \
4035 -+ else \
4036 -+ *(__PTR) = __data; \
4037 -+ return ret; \
4038 -+}
4039 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4040 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4041 -+ INT_MAX, 1);
4042 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4043 -+ INT_MAX, 1);
4044 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4045 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4046 -+ INT_MAX, 0);
4047 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4048 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4049 -+ 1, INT_MAX, 0);
4050 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4051 -+ INT_MAX, 1);
4052 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4053 -+ INT_MAX, 0);
4054 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4055 -+ INT_MAX, 1);
4056 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4057 -+ INT_MAX, 1);
4058 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
4059 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4060 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4061 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4062 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4063 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4064 -+#undef STORE_FUNCTION
4065 -+
4066 -+/* do nothing for the moment */
4067 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
4068 -+ const char *page, size_t count)
4069 -+{
4070 -+ return count;
4071 -+}
4072 -+
4073 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4074 -+{
4075 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4076 -+
4077 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4078 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4079 -+ else
4080 -+ return bfq_default_max_budget;
4081 -+}
4082 -+
4083 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4084 -+ const char *page, size_t count)
4085 -+{
4086 -+ struct bfq_data *bfqd = e->elevator_data;
4087 -+ unsigned long uninitialized_var(__data);
4088 -+ int ret = bfq_var_store(&__data, (page), count);
4089 -+
4090 -+ if (__data == 0)
4091 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4092 -+ else {
4093 -+ if (__data > INT_MAX)
4094 -+ __data = INT_MAX;
4095 -+ bfqd->bfq_max_budget = __data;
4096 -+ }
4097 -+
4098 -+ bfqd->bfq_user_max_budget = __data;
4099 -+
4100 -+ return ret;
4101 -+}
4102 -+
4103 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4104 -+ const char *page, size_t count)
4105 -+{
4106 -+ struct bfq_data *bfqd = e->elevator_data;
4107 -+ unsigned long uninitialized_var(__data);
4108 -+ int ret = bfq_var_store(&__data, (page), count);
4109 -+
4110 -+ if (__data < 1)
4111 -+ __data = 1;
4112 -+ else if (__data > INT_MAX)
4113 -+ __data = INT_MAX;
4114 -+
4115 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4116 -+ if (bfqd->bfq_user_max_budget == 0)
4117 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4118 -+
4119 -+ return ret;
4120 -+}
4121 -+
4122 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4123 -+ const char *page, size_t count)
4124 -+{
4125 -+ struct bfq_data *bfqd = e->elevator_data;
4126 -+ unsigned long uninitialized_var(__data);
4127 -+ int ret = bfq_var_store(&__data, (page), count);
4128 -+
4129 -+ if (__data > 1)
4130 -+ __data = 1;
4131 -+ if (__data == 0 && bfqd->low_latency != 0)
4132 -+ bfq_end_raising(bfqd);
4133 -+ bfqd->low_latency = __data;
4134 -+
4135 -+ return ret;
4136 -+}
4137 -+
4138 -+#define BFQ_ATTR(name) \
4139 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4140 -+
4141 -+static struct elv_fs_entry bfq_attrs[] = {
4142 -+ BFQ_ATTR(quantum),
4143 -+ BFQ_ATTR(fifo_expire_sync),
4144 -+ BFQ_ATTR(fifo_expire_async),
4145 -+ BFQ_ATTR(back_seek_max),
4146 -+ BFQ_ATTR(back_seek_penalty),
4147 -+ BFQ_ATTR(slice_idle),
4148 -+ BFQ_ATTR(max_budget),
4149 -+ BFQ_ATTR(max_budget_async_rq),
4150 -+ BFQ_ATTR(timeout_sync),
4151 -+ BFQ_ATTR(timeout_async),
4152 -+ BFQ_ATTR(low_latency),
4153 -+ BFQ_ATTR(raising_coeff),
4154 -+ BFQ_ATTR(raising_max_time),
4155 -+ BFQ_ATTR(raising_rt_max_time),
4156 -+ BFQ_ATTR(raising_min_idle_time),
4157 -+ BFQ_ATTR(raising_min_inter_arr_async),
4158 -+ BFQ_ATTR(raising_max_softrt_rate),
4159 -+ BFQ_ATTR(weights),
4160 -+ __ATTR_NULL
4161 -+};
4162 -+
4163 -+static struct elevator_type iosched_bfq = {
4164 -+ .ops = {
4165 -+ .elevator_merge_fn = bfq_merge,
4166 -+ .elevator_merged_fn = bfq_merged_request,
4167 -+ .elevator_merge_req_fn = bfq_merged_requests,
4168 -+ .elevator_allow_merge_fn = bfq_allow_merge,
4169 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
4170 -+ .elevator_add_req_fn = bfq_insert_request,
4171 -+ .elevator_activate_req_fn = bfq_activate_request,
4172 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
4173 -+ .elevator_completed_req_fn = bfq_completed_request,
4174 -+ .elevator_former_req_fn = elv_rb_former_request,
4175 -+ .elevator_latter_req_fn = elv_rb_latter_request,
4176 -+ .elevator_init_icq_fn = bfq_init_icq,
4177 -+ .elevator_exit_icq_fn = bfq_exit_icq,
4178 -+ .elevator_set_req_fn = bfq_set_request,
4179 -+ .elevator_put_req_fn = bfq_put_request,
4180 -+ .elevator_may_queue_fn = bfq_may_queue,
4181 -+ .elevator_init_fn = bfq_init_queue,
4182 -+ .elevator_exit_fn = bfq_exit_queue,
4183 -+ },
4184 -+ .icq_size = sizeof(struct bfq_io_cq),
4185 -+ .icq_align = __alignof__(struct bfq_io_cq),
4186 -+ .elevator_attrs = bfq_attrs,
4187 -+ .elevator_name = "bfq",
4188 -+ .elevator_owner = THIS_MODULE,
4189 -+};
4190 -+
4191 -+static int __init bfq_init(void)
4192 -+{
4193 -+ /*
4194 -+ * Can be 0 on HZ < 1000 setups.
4195 -+ */
4196 -+ if (bfq_slice_idle == 0)
4197 -+ bfq_slice_idle = 1;
4198 -+
4199 -+ if (bfq_timeout_async == 0)
4200 -+ bfq_timeout_async = 1;
4201 -+
4202 -+ if (bfq_slab_setup())
4203 -+ return -ENOMEM;
4204 -+
4205 -+ elv_register(&iosched_bfq);
4206 -+
4207 -+ return 0;
4208 -+}
4209 -+
4210 -+static void __exit bfq_exit(void)
4211 -+{
4212 -+ elv_unregister(&iosched_bfq);
4213 -+ bfq_slab_kill();
4214 -+}
4215 -+
4216 -+module_init(bfq_init);
4217 -+module_exit(bfq_exit);
4218 -+
4219 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4220 -+MODULE_LICENSE("GPL");
4221 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4222 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4223 -new file mode 100644
4224 -index 0000000..03f8061
4225 ---- /dev/null
4226 -+++ b/block/bfq-sched.c
4227 -@@ -0,0 +1,1072 @@
4228 -+/*
4229 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
4230 -+ *
4231 -+ * Based on ideas and code from CFQ:
4232 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4233 -+ *
4234 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4235 -+ * Paolo Valente <paolo.valente@×××××××.it>
4236 -+ *
4237 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4238 -+ */
4239 -+
4240 -+#ifdef CONFIG_CGROUP_BFQIO
4241 -+#define for_each_entity(entity) \
4242 -+ for (; entity != NULL; entity = entity->parent)
4243 -+
4244 -+#define for_each_entity_safe(entity, parent) \
4245 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4246 -+
4247 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4248 -+ int extract,
4249 -+ struct bfq_data *bfqd);
4250 -+
4251 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4252 -+{
4253 -+ struct bfq_entity *bfqg_entity;
4254 -+ struct bfq_group *bfqg;
4255 -+ struct bfq_sched_data *group_sd;
4256 -+
4257 -+ BUG_ON(next_active == NULL);
4258 -+
4259 -+ group_sd = next_active->sched_data;
4260 -+
4261 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4262 -+ /*
4263 -+ * bfq_group's my_entity field is not NULL only if the group
4264 -+ * is not the root group. We must not touch the root entity
4265 -+ * as it must never become an active entity.
4266 -+ */
4267 -+ bfqg_entity = bfqg->my_entity;
4268 -+ if (bfqg_entity != NULL)
4269 -+ bfqg_entity->budget = next_active->budget;
4270 -+}
4271 -+
4272 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
4273 -+{
4274 -+ struct bfq_entity *next_active;
4275 -+
4276 -+ if (sd->active_entity != NULL)
4277 -+ /* will update/requeue at the end of service */
4278 -+ return 0;
4279 -+
4280 -+ /*
4281 -+ * NOTE: this can be improved in many ways, such as returning
4282 -+ * 1 (and thus propagating upwards the update) only when the
4283 -+ * budget changes, or caching the bfqq that will be scheduled
4284 -+ * next from this subtree. For now we worry more about
4285 -+ * correctness than about performance...
4286 -+ */
4287 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4288 -+ sd->next_active = next_active;
4289 -+
4290 -+ if (next_active != NULL)
4291 -+ bfq_update_budget(next_active);
4292 -+
4293 -+ return 1;
4294 -+}
4295 -+
4296 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4297 -+ struct bfq_entity *entity)
4298 -+{
4299 -+ BUG_ON(sd->next_active != entity);
4300 -+}
4301 -+#else
4302 -+#define for_each_entity(entity) \
4303 -+ for (; entity != NULL; entity = NULL)
4304 -+
4305 -+#define for_each_entity_safe(entity, parent) \
4306 -+ for (parent = NULL; entity != NULL; entity = parent)
4307 -+
4308 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4309 -+{
4310 -+ return 0;
4311 -+}
4312 -+
4313 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4314 -+ struct bfq_entity *entity)
4315 -+{
4316 -+}
4317 -+
4318 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4319 -+{
4320 -+}
4321 -+#endif
4322 -+
4323 -+/*
4324 -+ * Shift for timestamp calculations. This actually limits the maximum
4325 -+ * service allowed in one timestamp delta (small shift values increase it),
4326 -+ * the maximum total weight that can be used for the queues in the system
4327 -+ * (big shift values increase it), and the period of virtual time wraparounds.
4328 -+ */
4329 -+#define WFQ_SERVICE_SHIFT 22
4330 -+
4331 -+/**
4332 -+ * bfq_gt - compare two timestamps.
4333 -+ * @a: first ts.
4334 -+ * @b: second ts.
4335 -+ *
4336 -+ * Return @a > @b, dealing with wrapping correctly.
4337 -+ */
4338 -+static inline int bfq_gt(u64 a, u64 b)
4339 -+{
4340 -+ return (s64)(a - b) > 0;
4341 -+}
4342 -+
4343 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4344 -+{
4345 -+ struct bfq_queue *bfqq = NULL;
4346 -+
4347 -+ BUG_ON(entity == NULL);
4348 -+
4349 -+ if (entity->my_sched_data == NULL)
4350 -+ bfqq = container_of(entity, struct bfq_queue, entity);
4351 -+
4352 -+ return bfqq;
4353 -+}
4354 -+
4355 -+
4356 -+/**
4357 -+ * bfq_delta - map service into the virtual time domain.
4358 -+ * @service: amount of service.
4359 -+ * @weight: scale factor (weight of an entity or weight sum).
4360 -+ */
4361 -+static inline u64 bfq_delta(unsigned long service,
4362 -+ unsigned long weight)
4363 -+{
4364 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4365 -+
4366 -+ do_div(d, weight);
4367 -+ return d;
4368 -+}
4369 -+
4370 -+/**
4371 -+ * bfq_calc_finish - assign the finish time to an entity.
4372 -+ * @entity: the entity to act upon.
4373 -+ * @service: the service to be charged to the entity.
4374 -+ */
4375 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
4376 -+ unsigned long service)
4377 -+{
4378 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4379 -+
4380 -+ BUG_ON(entity->weight == 0);
4381 -+
4382 -+ entity->finish = entity->start +
4383 -+ bfq_delta(service, entity->weight);
4384 -+
4385 -+ if (bfqq != NULL) {
4386 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4387 -+ "calc_finish: serv %lu, w %d",
4388 -+ service, entity->weight);
4389 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4390 -+ "calc_finish: start %llu, finish %llu, delta %llu",
4391 -+ entity->start, entity->finish,
4392 -+ bfq_delta(service, entity->weight));
4393 -+ }
4394 -+}
4395 -+
4396 -+/**
4397 -+ * bfq_entity_of - get an entity from a node.
4398 -+ * @node: the node field of the entity.
4399 -+ *
4400 -+ * Convert a node pointer to the corresponding entity. This is used only
4401 -+ * to simplify the logic of some functions and not as the generic
4402 -+ * conversion mechanism because, e.g., in the tree walking functions,
4403 -+ * the check for a %NULL value would be redundant.
4404 -+ */
4405 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4406 -+{
4407 -+ struct bfq_entity *entity = NULL;
4408 -+
4409 -+ if (node != NULL)
4410 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
4411 -+
4412 -+ return entity;
4413 -+}
4414 -+
4415 -+/**
4416 -+ * bfq_extract - remove an entity from a tree.
4417 -+ * @root: the tree root.
4418 -+ * @entity: the entity to remove.
4419 -+ */
4420 -+static inline void bfq_extract(struct rb_root *root,
4421 -+ struct bfq_entity *entity)
4422 -+{
4423 -+ BUG_ON(entity->tree != root);
4424 -+
4425 -+ entity->tree = NULL;
4426 -+ rb_erase(&entity->rb_node, root);
4427 -+}
4428 -+
4429 -+/**
4430 -+ * bfq_idle_extract - extract an entity from the idle tree.
4431 -+ * @st: the service tree of the owning @entity.
4432 -+ * @entity: the entity being removed.
4433 -+ */
4434 -+static void bfq_idle_extract(struct bfq_service_tree *st,
4435 -+ struct bfq_entity *entity)
4436 -+{
4437 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4438 -+ struct rb_node *next;
4439 -+
4440 -+ BUG_ON(entity->tree != &st->idle);
4441 -+
4442 -+ if (entity == st->first_idle) {
4443 -+ next = rb_next(&entity->rb_node);
4444 -+ st->first_idle = bfq_entity_of(next);
4445 -+ }
4446 -+
4447 -+ if (entity == st->last_idle) {
4448 -+ next = rb_prev(&entity->rb_node);
4449 -+ st->last_idle = bfq_entity_of(next);
4450 -+ }
4451 -+
4452 -+ bfq_extract(&st->idle, entity);
4453 -+
4454 -+ if (bfqq != NULL)
4455 -+ list_del(&bfqq->bfqq_list);
4456 -+}
4457 -+
4458 -+/**
4459 -+ * bfq_insert - generic tree insertion.
4460 -+ * @root: tree root.
4461 -+ * @entity: entity to insert.
4462 -+ *
4463 -+ * This is used for the idle and the active tree, since they are both
4464 -+ * ordered by finish time.
4465 -+ */
4466 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4467 -+{
4468 -+ struct bfq_entity *entry;
4469 -+ struct rb_node **node = &root->rb_node;
4470 -+ struct rb_node *parent = NULL;
4471 -+
4472 -+ BUG_ON(entity->tree != NULL);
4473 -+
4474 -+ while (*node != NULL) {
4475 -+ parent = *node;
4476 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
4477 -+
4478 -+ if (bfq_gt(entry->finish, entity->finish))
4479 -+ node = &parent->rb_left;
4480 -+ else
4481 -+ node = &parent->rb_right;
4482 -+ }
4483 -+
4484 -+ rb_link_node(&entity->rb_node, parent, node);
4485 -+ rb_insert_color(&entity->rb_node, root);
4486 -+
4487 -+ entity->tree = root;
4488 -+}
4489 -+
4490 -+/**
4491 -+ * bfq_update_min - update the min_start field of an entity.
4492 -+ * @entity: the entity to update.
4493 -+ * @node: one of its children.
4494 -+ *
4495 -+ * This function is called when @entity may store an invalid value for
4496 -+ * min_start due to updates to the active tree. The function assumes
4497 -+ * that the subtree rooted at @node (which may be its left or its right
4498 -+ * child) has a valid min_start value.
4499 -+ */
4500 -+static inline void bfq_update_min(struct bfq_entity *entity,
4501 -+ struct rb_node *node)
4502 -+{
4503 -+ struct bfq_entity *child;
4504 -+
4505 -+ if (node != NULL) {
4506 -+ child = rb_entry(node, struct bfq_entity, rb_node);
4507 -+ if (bfq_gt(entity->min_start, child->min_start))
4508 -+ entity->min_start = child->min_start;
4509 -+ }
4510 -+}
4511 -+
4512 -+/**
4513 -+ * bfq_update_active_node - recalculate min_start.
4514 -+ * @node: the node to update.
4515 -+ *
4516 -+ * @node may have changed position or one of its children may have moved,
4517 -+ * this function updates its min_start value. The left and right subtrees
4518 -+ * are assumed to hold a correct min_start value.
4519 -+ */
4520 -+static inline void bfq_update_active_node(struct rb_node *node)
4521 -+{
4522 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4523 -+
4524 -+ entity->min_start = entity->start;
4525 -+ bfq_update_min(entity, node->rb_right);
4526 -+ bfq_update_min(entity, node->rb_left);
4527 -+}
4528 -+
4529 -+/**
4530 -+ * bfq_update_active_tree - update min_start for the whole active tree.
4531 -+ * @node: the starting node.
4532 -+ *
4533 -+ * @node must be the deepest modified node after an update. This function
4534 -+ * updates its min_start using the values held by its children, assuming
4535 -+ * that they did not change, and then updates all the nodes that may have
4536 -+ * changed in the path to the root. The only nodes that may have changed
4537 -+ * are the ones in the path or their siblings.
4538 -+ */
4539 -+static void bfq_update_active_tree(struct rb_node *node)
4540 -+{
4541 -+ struct rb_node *parent;
4542 -+
4543 -+up:
4544 -+ bfq_update_active_node(node);
4545 -+
4546 -+ parent = rb_parent(node);
4547 -+ if (parent == NULL)
4548 -+ return;
4549 -+
4550 -+ if (node == parent->rb_left && parent->rb_right != NULL)
4551 -+ bfq_update_active_node(parent->rb_right);
4552 -+ else if (parent->rb_left != NULL)
4553 -+ bfq_update_active_node(parent->rb_left);
4554 -+
4555 -+ node = parent;
4556 -+ goto up;
4557 -+}
4558 -+
4559 -+/**
4560 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
4561 -+ * @st: the service tree of the entity.
4562 -+ * @entity: the entity being inserted.
4563 -+ *
4564 -+ * The active tree is ordered by finish time, but an extra key is kept
4565 -+ * in each node, containing the minimum value for the start times of
4566 -+ * its children (and the node itself), so it's possible to search for
4567 -+ * the eligible node with the lowest finish time in logarithmic time.
4568 -+ */
4569 -+static void bfq_active_insert(struct bfq_service_tree *st,
4570 -+ struct bfq_entity *entity)
4571 -+{
4572 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4573 -+ struct rb_node *node = &entity->rb_node;
4574 -+
4575 -+ bfq_insert(&st->active, entity);
4576 -+
4577 -+ if (node->rb_left != NULL)
4578 -+ node = node->rb_left;
4579 -+ else if (node->rb_right != NULL)
4580 -+ node = node->rb_right;
4581 -+
4582 -+ bfq_update_active_tree(node);
4583 -+
4584 -+ if (bfqq != NULL)
4585 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4586 -+}
4587 -+
4588 -+/**
4589 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4590 -+ * @ioprio: the ioprio value to convert.
4591 -+ */
4592 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
4593 -+{
4594 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4595 -+ return IOPRIO_BE_NR - ioprio;
4596 -+}
4597 -+
4598 -+/**
4599 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4600 -+ * @weight: the weight value to convert.
4601 -+ *
4602 -+ * To preserve as much as possible the old only-ioprio user interface,
4603 -+ * 0 is used as an escape ioprio value for weights (numerically) equal to
4604 -+ * or larger than IOPRIO_BE_NR.
4605 -+ */
4606 -+static unsigned short bfq_weight_to_ioprio(int weight)
4607 -+{
4608 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4609 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4610 -+}
4611 -+
4612 -+static inline void bfq_get_entity(struct bfq_entity *entity)
4613 -+{
4614 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4615 -+ struct bfq_sched_data *sd;
4616 -+
4617 -+ if (bfqq != NULL) {
4618 -+ sd = entity->sched_data;
4619 -+ atomic_inc(&bfqq->ref);
4620 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4621 -+ bfqq, atomic_read(&bfqq->ref));
4622 -+ }
4623 -+}
4624 -+
4625 -+/**
4626 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
4627 -+ * @node: the node being removed.
4628 -+ *
4629 -+ * Do the first step of an extraction in an rb tree, looking for the
4630 -+ * node that will replace @node, and returning the deepest node that
4631 -+ * the following modifications to the tree can touch. If @node is the
4632 -+ * last node in the tree return %NULL.
4633 -+ */
4634 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
4635 -+{
4636 -+ struct rb_node *deepest;
4637 -+
4638 -+ if (node->rb_right == NULL && node->rb_left == NULL)
4639 -+ deepest = rb_parent(node);
4640 -+ else if (node->rb_right == NULL)
4641 -+ deepest = node->rb_left;
4642 -+ else if (node->rb_left == NULL)
4643 -+ deepest = node->rb_right;
4644 -+ else {
4645 -+ deepest = rb_next(node);
4646 -+ if (deepest->rb_right != NULL)
4647 -+ deepest = deepest->rb_right;
4648 -+ else if (rb_parent(deepest) != node)
4649 -+ deepest = rb_parent(deepest);
4650 -+ }
4651 -+
4652 -+ return deepest;
4653 -+}
4654 -+
4655 -+/**
4656 -+ * bfq_active_extract - remove an entity from the active tree.
4657 -+ * @st: the service_tree containing the tree.
4658 -+ * @entity: the entity being removed.
4659 -+ */
4660 -+static void bfq_active_extract(struct bfq_service_tree *st,
4661 -+ struct bfq_entity *entity)
4662 -+{
4663 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4664 -+ struct rb_node *node;
4665 -+
4666 -+ node = bfq_find_deepest(&entity->rb_node);
4667 -+ bfq_extract(&st->active, entity);
4668 -+
4669 -+ if (node != NULL)
4670 -+ bfq_update_active_tree(node);
4671 -+
4672 -+ if (bfqq != NULL)
4673 -+ list_del(&bfqq->bfqq_list);
4674 -+}
4675 -+
4676 -+/**
4677 -+ * bfq_idle_insert - insert an entity into the idle tree.
4678 -+ * @st: the service tree containing the tree.
4679 -+ * @entity: the entity to insert.
4680 -+ */
4681 -+static void bfq_idle_insert(struct bfq_service_tree *st,
4682 -+ struct bfq_entity *entity)
4683 -+{
4684 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4685 -+ struct bfq_entity *first_idle = st->first_idle;
4686 -+ struct bfq_entity *last_idle = st->last_idle;
4687 -+
4688 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
4689 -+ st->first_idle = entity;
4690 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
4691 -+ st->last_idle = entity;
4692 -+
4693 -+ bfq_insert(&st->idle, entity);
4694 -+
4695 -+ if (bfqq != NULL)
4696 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
4697 -+}
4698 -+
4699 -+/**
4700 -+ * bfq_forget_entity - remove an entity from the wfq trees.
4701 -+ * @st: the service tree.
4702 -+ * @entity: the entity being removed.
4703 -+ *
4704 -+ * Update the device status and forget everything about @entity, putting
4705 -+ * the device reference to it, if it is a queue. Entities belonging to
4706 -+ * groups are not refcounted.
4707 -+ */
4708 -+static void bfq_forget_entity(struct bfq_service_tree *st,
4709 -+ struct bfq_entity *entity)
4710 -+{
4711 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4712 -+ struct bfq_sched_data *sd;
4713 -+
4714 -+ BUG_ON(!entity->on_st);
4715 -+
4716 -+ entity->on_st = 0;
4717 -+ st->wsum -= entity->weight;
4718 -+ if (bfqq != NULL) {
4719 -+ sd = entity->sched_data;
4720 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
4721 -+ bfqq, atomic_read(&bfqq->ref));
4722 -+ bfq_put_queue(bfqq);
4723 -+ }
4724 -+}
4725 -+
4726 -+/**
4727 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
4728 -+ * @st: service tree for the entity.
4729 -+ * @entity: the entity being released.
4730 -+ */
4731 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
4732 -+ struct bfq_entity *entity)
4733 -+{
4734 -+ bfq_idle_extract(st, entity);
4735 -+ bfq_forget_entity(st, entity);
4736 -+}
4737 -+
4738 -+/**
4739 -+ * bfq_forget_idle - update the idle tree if necessary.
4740 -+ * @st: the service tree to act upon.
4741 -+ *
4742 -+ * To preserve the global O(log N) complexity we only remove one entry here;
4743 -+ * as the idle tree will not grow indefinitely this can be done safely.
4744 -+ */
4745 -+static void bfq_forget_idle(struct bfq_service_tree *st)
4746 -+{
4747 -+ struct bfq_entity *first_idle = st->first_idle;
4748 -+ struct bfq_entity *last_idle = st->last_idle;
4749 -+
4750 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
4751 -+ !bfq_gt(last_idle->finish, st->vtime)) {
4752 -+ /*
4753 -+ * Forget the whole idle tree, increasing the vtime past
4754 -+ * the last finish time of idle entities.
4755 -+ */
4756 -+ st->vtime = last_idle->finish;
4757 -+ }
4758 -+
4759 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
4760 -+ bfq_put_idle_entity(st, first_idle);
4761 -+}
4762 -+
4763 -+static struct bfq_service_tree *
4764 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
4765 -+ struct bfq_entity *entity)
4766 -+{
4767 -+ struct bfq_service_tree *new_st = old_st;
4768 -+
4769 -+ if (entity->ioprio_changed) {
4770 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4771 -+
4772 -+ BUG_ON(old_st->wsum < entity->weight);
4773 -+ old_st->wsum -= entity->weight;
4774 -+
4775 -+ if (entity->new_weight != entity->orig_weight) {
4776 -+ entity->orig_weight = entity->new_weight;
4777 -+ entity->ioprio =
4778 -+ bfq_weight_to_ioprio(entity->orig_weight);
4779 -+ } else if (entity->new_ioprio != entity->ioprio) {
4780 -+ entity->ioprio = entity->new_ioprio;
4781 -+ entity->orig_weight =
4782 -+ bfq_ioprio_to_weight(entity->ioprio);
4783 -+ } else
4784 -+ entity->new_weight = entity->orig_weight =
4785 -+ bfq_ioprio_to_weight(entity->ioprio);
4786 -+
4787 -+ entity->ioprio_class = entity->new_ioprio_class;
4788 -+ entity->ioprio_changed = 0;
4789 -+
4790 -+ /*
4791 -+ * NOTE: here we may be changing the weight too early,
4792 -+ * this will cause unfairness. The correct approach
4793 -+ * would have required additional complexity to defer
4794 -+ * weight changes to the proper time instants (i.e.,
4795 -+ * when entity->finish <= old_st->vtime).
4796 -+ */
4797 -+ new_st = bfq_entity_service_tree(entity);
4798 -+ entity->weight = entity->orig_weight *
4799 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
4800 -+ new_st->wsum += entity->weight;
4801 -+
4802 -+ if (new_st != old_st)
4803 -+ entity->start = new_st->vtime;
4804 -+ }
4805 -+
4806 -+ return new_st;
4807 -+}
4808 -+
4809 -+/**
4810 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
4811 -+ * @bfqq: the queue being served.
4812 -+ * @served: bytes to transfer.
4813 -+ *
4814 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
4815 -+ * are synchronized every time a new bfqq is selected for service. For now,
4816 -+ * we keep it this way to better check consistency.
4817 -+ */
4818 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
4819 -+{
4820 -+ struct bfq_entity *entity = &bfqq->entity;
4821 -+ struct bfq_service_tree *st;
4822 -+
4823 -+ for_each_entity(entity) {
4824 -+ st = bfq_entity_service_tree(entity);
4825 -+
4826 -+ entity->service += served;
4827 -+ BUG_ON(entity->service > entity->budget);
4828 -+ BUG_ON(st->wsum == 0);
4829 -+
4830 -+ st->vtime += bfq_delta(served, st->wsum);
4831 -+ bfq_forget_idle(st);
4832 -+ }
4833 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
4834 -+}
4835 -+
4836 -+/**
4837 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
4838 -+ * @bfqq: the queue that needs a service update.
4839 -+ *
4840 -+ * When it's not possible to be fair in the service domain, because
4841 -+ * a queue is not consuming its budget fast enough (the meaning of
4842 -+ * fast depends on the timeout parameter), we charge it a full
4843 -+ * budget. In this way we should obtain a sort of time-domain
4844 -+ * fairness among all the seeky/slow queues.
4845 -+ */
4846 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
4847 -+{
4848 -+ struct bfq_entity *entity = &bfqq->entity;
4849 -+
4850 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
4851 -+
4852 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
4853 -+}
4854 -+
4855 -+/**
4856 -+ * __bfq_activate_entity - activate an entity.
4857 -+ * @entity: the entity being activated.
4858 -+ *
4859 -+ * Called whenever an entity is activated, i.e., it is not active and one
4860 -+ * of its children receives a new request, or has to be reactivated due to
4861 -+ * budget exhaustion. It uses the current budget of the entity (and the
4862 -+ * service received if @entity is active) of the queue to calculate its
4863 -+ * timestamps.
4864 -+ */
4865 -+static void __bfq_activate_entity(struct bfq_entity *entity)
4866 -+{
4867 -+ struct bfq_sched_data *sd = entity->sched_data;
4868 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
4869 -+
4870 -+ if (entity == sd->active_entity) {
4871 -+ BUG_ON(entity->tree != NULL);
4872 -+ /*
4873 -+ * If we are requeueing the current entity, we have
4874 -+ * to take care not to charge it for service it has
4875 -+ * not received.
4876 -+ */
4877 -+ bfq_calc_finish(entity, entity->service);
4878 -+ entity->start = entity->finish;
4879 -+ sd->active_entity = NULL;
4880 -+ } else if (entity->tree == &st->active) {
4881 -+ /*
4882 -+ * Requeueing an entity due to a change of some
4883 -+ * next_active entity below it. We reuse the old
4884 -+ * start time.
4885 -+ */
4886 -+ bfq_active_extract(st, entity);
4887 -+ } else if (entity->tree == &st->idle) {
4888 -+ /*
4889 -+ * Must be on the idle tree, bfq_idle_extract() will
4890 -+ * check for that.
4891 -+ */
4892 -+ bfq_idle_extract(st, entity);
4893 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
4894 -+ st->vtime : entity->finish;
4895 -+ } else {
4896 -+ /*
4897 -+ * The finish time of the entity may be invalid, and
4898 -+ * it is in the past for sure, otherwise the queue
4899 -+ * would have been on the idle tree.
4900 -+ */
4901 -+ entity->start = st->vtime;
4902 -+ st->wsum += entity->weight;
4903 -+ bfq_get_entity(entity);
4904 -+
4905 -+ BUG_ON(entity->on_st);
4906 -+ entity->on_st = 1;
4907 -+ }
4908 -+
4909 -+ st = __bfq_entity_update_weight_prio(st, entity);
4910 -+ bfq_calc_finish(entity, entity->budget);
4911 -+ bfq_active_insert(st, entity);
4912 -+}
4913 -+
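The timestamps manipulated above follow the B-WF2Q+ rule F_i = S_i + service/weight (cf. the bfq_entity kernel-doc in bfq.h further down). A hedged sketch of bfq_calc_finish()-style updates, again with invented toy_* names and no fixed-point scaling:

    #include <stdio.h>

    struct toy_stamp {
            unsigned long long start;   /* S_i */
            unsigned long long finish;  /* F_i */
    };

    /* F_i = S_i + service_to_charge / weight_i (integer approximation). */
    static void toy_calc_finish(struct toy_stamp *ts, unsigned long service,
                                unsigned long weight)
    {
            ts->finish = ts->start + service / weight;
    }

    int main(void)
    {
            struct toy_stamp ts = { .start = 100, .finish = 0 };

            /* On activation the entity is charged its full budget... */
            toy_calc_finish(&ts, 4000, 10);
            printf("activated: S=%llu F=%llu\n", ts.start, ts.finish);

            /* ...on requeue only the service actually received is charged,
             * and the next round starts where the previous one finished. */
            toy_calc_finish(&ts, 1200, 10);
            ts.start = ts.finish;
            printf("requeued:  S=%llu F=%llu\n", ts.start, ts.finish);
            return 0;
    }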
4914 -+/**
4915 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
4916 -+ * @entity: the entity to activate.
4917 -+ *
4918 -+ * Activate @entity and all the entities on the path from it to the root.
4919 -+ */
4920 -+static void bfq_activate_entity(struct bfq_entity *entity)
4921 -+{
4922 -+ struct bfq_sched_data *sd;
4923 -+
4924 -+ for_each_entity(entity) {
4925 -+ __bfq_activate_entity(entity);
4926 -+
4927 -+ sd = entity->sched_data;
4928 -+ if (!bfq_update_next_active(sd))
4929 -+ /*
4930 -+ * No need to propagate the activation to the
4931 -+ * upper entities, as they will be updated when
4932 -+ * the active entity is rescheduled.
4933 -+ */
4934 -+ break;
4935 -+ }
4936 -+}
4937 -+
4938 -+/**
4939 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
4940 -+ * @entity: the entity to deactivate.
4941 -+ * @requeue: if false, the entity will not be put into the idle tree.
4942 -+ *
4943 -+ * Deactivate an entity, independently from its previous state. If the
4944 -+ * entity was not on a service tree just return, otherwise if it is on
4945 -+ * any scheduler tree, extract it from that tree, and if necessary
4946 -+ * and if the caller did not specify @requeue, put it on the idle tree.
4947 -+ *
4948 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
4949 -+ * if the entity was under service or if it was the next_active for
4950 -+ * its sched_data; return %0 otherwise.
4951 -+ */
4952 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
4953 -+{
4954 -+ struct bfq_sched_data *sd = entity->sched_data;
4955 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
4956 -+ int was_active = entity == sd->active_entity;
4957 -+ int ret = 0;
4958 -+
4959 -+ if (!entity->on_st)
4960 -+ return 0;
4961 -+
4962 -+ BUG_ON(was_active && entity->tree != NULL);
4963 -+
4964 -+ if (was_active) {
4965 -+ bfq_calc_finish(entity, entity->service);
4966 -+ sd->active_entity = NULL;
4967 -+ } else if (entity->tree == &st->active)
4968 -+ bfq_active_extract(st, entity);
4969 -+ else if (entity->tree == &st->idle)
4970 -+ bfq_idle_extract(st, entity);
4971 -+ else if (entity->tree != NULL)
4972 -+ BUG();
4973 -+
4974 -+ if (was_active || sd->next_active == entity)
4975 -+ ret = bfq_update_next_active(sd);
4976 -+
4977 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
4978 -+ bfq_forget_entity(st, entity);
4979 -+ else
4980 -+ bfq_idle_insert(st, entity);
4981 -+
4982 -+ BUG_ON(sd->active_entity == entity);
4983 -+ BUG_ON(sd->next_active == entity);
4984 -+
4985 -+ return ret;
4986 -+}
4987 -+
4988 -+/**
4989 -+ * bfq_deactivate_entity - deactivate an entity.
4990 -+ * @entity: the entity to deactivate.
4991 -+ * @requeue: true if the entity can be put on the idle tree
4992 -+ */
4993 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
4994 -+{
4995 -+ struct bfq_sched_data *sd;
4996 -+ struct bfq_entity *parent;
4997 -+
4998 -+ for_each_entity_safe(entity, parent) {
4999 -+ sd = entity->sched_data;
5000 -+
5001 -+ if (!__bfq_deactivate_entity(entity, requeue))
5002 -+ /*
5003 -+ * The parent entity is still backlogged, and
5004 -+ * we don't need to update it as it is still
5005 -+ * under service.
5006 -+ */
5007 -+ break;
5008 -+
5009 -+ if (sd->next_active != NULL)
5010 -+ /*
5011 -+ * The parent entity is still backlogged and
5012 -+ * the budgets on the path towards the root
5013 -+ * need to be updated.
5014 -+ */
5015 -+ goto update;
5016 -+
5017 -+ /*
5018 -+	 * If we reach this point the parent is no longer backlogged and
5019 -+ * we want to propagate the dequeue upwards.
5020 -+ */
5021 -+ requeue = 1;
5022 -+ }
5023 -+
5024 -+ return;
5025 -+
5026 -+update:
5027 -+ entity = parent;
5028 -+ for_each_entity(entity) {
5029 -+ __bfq_activate_entity(entity);
5030 -+
5031 -+ sd = entity->sched_data;
5032 -+ if (!bfq_update_next_active(sd))
5033 -+ break;
5034 -+ }
5035 -+}
5036 -+
5037 -+/**
5038 -+ * bfq_update_vtime - update vtime if necessary.
5039 -+ * @st: the service tree to act upon.
5040 -+ *
5041 -+ * If necessary update the service tree vtime to have at least one
5042 -+ * eligible entity, skipping to its start time. Assumes that the
5043 -+ * active tree of the device is not empty.
5044 -+ *
5045 -+ * NOTE: this hierarchical implementation updates vtimes quite often, so
5046 -+ * we may end up with reactivated tasks getting timestamps after a
5047 -+ * vtime skip done because we needed a ->first_active entity on some
5048 -+ * intermediate node.
5049 -+ */
5050 -+static void bfq_update_vtime(struct bfq_service_tree *st)
5051 -+{
5052 -+ struct bfq_entity *entry;
5053 -+ struct rb_node *node = st->active.rb_node;
5054 -+
5055 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5056 -+ if (bfq_gt(entry->min_start, st->vtime)) {
5057 -+ st->vtime = entry->min_start;
5058 -+ bfq_forget_idle(st);
5059 -+ }
5060 -+}
5061 -+
5062 -+/**
5063 -+ * bfq_first_active - find the eligible entity with the smallest finish time
5064 -+ * @st: the service tree to select from.
5065 -+ *
5066 -+ * This function searches for the first schedulable entity, starting from the
5067 -+ * root of the tree and going left whenever the left subtree contains at least
5068 -+ * one eligible (start <= vtime) entity. The path
5069 -+ * on the right is followed only if a) the left subtree contains no eligible
5070 -+ * entities and b) no eligible entity has been found yet.
5071 -+ */
5072 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5073 -+{
5074 -+ struct bfq_entity *entry, *first = NULL;
5075 -+ struct rb_node *node = st->active.rb_node;
5076 -+
5077 -+ while (node != NULL) {
5078 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5079 -+left:
5080 -+ if (!bfq_gt(entry->start, st->vtime))
5081 -+ first = entry;
5082 -+
5083 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5084 -+
5085 -+ if (node->rb_left != NULL) {
5086 -+ entry = rb_entry(node->rb_left,
5087 -+ struct bfq_entity, rb_node);
5088 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
5089 -+ node = node->rb_left;
5090 -+ goto left;
5091 -+ }
5092 -+ }
5093 -+ if (first != NULL)
5094 -+ break;
5095 -+ node = node->rb_right;
5096 -+ }
5097 -+
5098 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5099 -+ return first;
5100 -+}
5101 -+
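Stripped of the augmented rbtree, the policy implemented by bfq_update_vtime() and bfq_first_active_entity() is: if no entity is eligible, jump the virtual time forward to the smallest start time, then pick the eligible entity (start <= vtime) with the smallest finish time. A rough array-based illustration (toy_* names are made up, not the kernel structures):

    #include <stdio.h>

    struct toy_ent {
            unsigned long long start, finish;
    };

    /* Return the index of the eligible entity with the smallest finish time,
     * advancing *vtime first if no entity is eligible yet. */
    static int toy_pick_next(struct toy_ent *v, int n, unsigned long long *vtime)
    {
            unsigned long long min_start = v[0].start;
            int i, best = -1;

            for (i = 1; i < n; i++)
                    if (v[i].start < min_start)
                            min_start = v[i].start;
            if (min_start > *vtime)     /* bfq_update_vtime(): skip to min_start */
                    *vtime = min_start;

            for (i = 0; i < n; i++) {
                    if (v[i].start > *vtime)        /* not eligible */
                            continue;
                    if (best < 0 || v[i].finish < v[best].finish)
                            best = i;
            }
            return best;
    }

    int main(void)
    {
            struct toy_ent v[] = { { 30, 90 }, { 10, 50 }, { 12, 40 } };
            unsigned long long vtime = 0;
            int next = toy_pick_next(v, 3, &vtime);

            printf("vtime=%llu next=%d (finish=%llu)\n", vtime, next, v[next].finish);
            return 0;
    }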
5102 -+/**
5103 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5104 -+ * @st: the service tree.
5105 -+ *
5106 -+ * Update the virtual time in @st and return the first eligible entity
5107 -+ * it contains.
5108 -+ */
5109 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5110 -+ bool force)
5111 -+{
5112 -+ struct bfq_entity *entity, *new_next_active = NULL;
5113 -+
5114 -+ if (RB_EMPTY_ROOT(&st->active))
5115 -+ return NULL;
5116 -+
5117 -+ bfq_update_vtime(st);
5118 -+ entity = bfq_first_active_entity(st);
5119 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
5120 -+
5121 -+ /*
5122 -+	 * If the chosen entity does not match the sched_data's
5123 -+	 * next_active and we are forcibly serving the IDLE priority
5124 -+ * class tree, bubble up budget update.
5125 -+ */
5126 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
5127 -+ new_next_active = entity;
5128 -+ for_each_entity(new_next_active)
5129 -+ bfq_update_budget(new_next_active);
5130 -+ }
5131 -+
5132 -+ return entity;
5133 -+}
5134 -+
5135 -+/**
5136 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5137 -+ * @sd: the sched_data.
5138 -+ * @extract: if true the returned entity will be also extracted from @sd.
5139 -+ *
5140 -+ * NOTE: since we cache the next_active entity at each level of the
5141 -+ * hierarchy, the complexity of the lookup can be decreased with
5142 -+ * absolutely no effort by just returning the cached next_active value;
5143 -+ * we prefer to do full lookups to test the consistency of the data
5144 -+ * structures.
5145 -+ */
5146 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5147 -+ int extract,
5148 -+ struct bfq_data *bfqd)
5149 -+{
5150 -+ struct bfq_service_tree *st = sd->service_tree;
5151 -+ struct bfq_entity *entity;
5152 -+	int i = 0;
5153 -+
5154 -+ BUG_ON(sd->active_entity != NULL);
5155 -+
5156 -+ if (bfqd != NULL &&
5157 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5158 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true);
5159 -+ if (entity != NULL) {
5160 -+ i = BFQ_IOPRIO_CLASSES - 1;
5161 -+ bfqd->bfq_class_idle_last_service = jiffies;
5162 -+ sd->next_active = entity;
5163 -+ }
5164 -+ }
5165 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5166 -+ entity = __bfq_lookup_next_entity(st + i, false);
5167 -+ if (entity != NULL) {
5168 -+ if (extract) {
5169 -+ bfq_check_next_active(sd, entity);
5170 -+ bfq_active_extract(st + i, entity);
5171 -+ sd->active_entity = entity;
5172 -+ sd->next_active = NULL;
5173 -+ }
5174 -+ break;
5175 -+ }
5176 -+ }
5177 -+
5178 -+ return entity;
5179 -+}
5180 -+
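bfq_lookup_next_entity() walks the per-class trees in priority order (RT, BE, IDLE), but force-serves the IDLE class once it has been starved for longer than BFQ_CL_IDLE_TIMEOUT. A condensed sketch of that control flow, with invented names and a plain counter standing in for jiffies:

    #include <stdio.h>

    #define TOY_CLASSES      3      /* RT, BE, IDLE */
    #define TOY_IDLE_TIMEOUT 200    /* stand-in for BFQ_CL_IDLE_TIMEOUT */

    /* has_work[c] != 0 means class c has a backlogged entity. */
    static int toy_next_class(const int has_work[TOY_CLASSES],
                              unsigned long now, unsigned long *idle_last_service)
    {
            int c;

            /* Anti-starvation: serve the idle class if it waited too long. */
            if (has_work[TOY_CLASSES - 1] &&
                now - *idle_last_service > TOY_IDLE_TIMEOUT) {
                    *idle_last_service = now;
                    return TOY_CLASSES - 1;
            }

            /* Otherwise pick the highest-priority class with pending work. */
            for (c = 0; c < TOY_CLASSES; c++)
                    if (has_work[c])
                            return c;
            return -1;
    }

    int main(void)
    {
            int has_work[TOY_CLASSES] = { 0, 1, 1 };
            unsigned long idle_last = 0;

            printf("t=100 -> class %d\n", toy_next_class(has_work, 100, &idle_last));
            printf("t=500 -> class %d\n", toy_next_class(has_work, 500, &idle_last));
            return 0;
    }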
5181 -+/*
5182 -+ * Get next queue for service.
5183 -+ */
5184 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5185 -+{
5186 -+ struct bfq_entity *entity = NULL;
5187 -+ struct bfq_sched_data *sd;
5188 -+ struct bfq_queue *bfqq;
5189 -+
5190 -+ BUG_ON(bfqd->active_queue != NULL);
5191 -+
5192 -+ if (bfqd->busy_queues == 0)
5193 -+ return NULL;
5194 -+
5195 -+ sd = &bfqd->root_group->sched_data;
5196 -+ for (; sd != NULL; sd = entity->my_sched_data) {
5197 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5198 -+ BUG_ON(entity == NULL);
5199 -+ entity->service = 0;
5200 -+ }
5201 -+
5202 -+ bfqq = bfq_entity_to_bfqq(entity);
5203 -+ BUG_ON(bfqq == NULL);
5204 -+
5205 -+ return bfqq;
5206 -+}
5207 -+
5208 -+/*
5209 -+ * Forced extraction of the given queue.
5210 -+ */
5211 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5212 -+ struct bfq_queue *bfqq)
5213 -+{
5214 -+ struct bfq_entity *entity;
5215 -+ struct bfq_sched_data *sd;
5216 -+
5217 -+ BUG_ON(bfqd->active_queue != NULL);
5218 -+
5219 -+ entity = &bfqq->entity;
5220 -+ /*
5221 -+ * Bubble up extraction/update from the leaf to the root.
5222 -+ */
5223 -+ for_each_entity(entity) {
5224 -+ sd = entity->sched_data;
5225 -+ bfq_update_budget(entity);
5226 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
5227 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5228 -+ sd->active_entity = entity;
5229 -+ sd->next_active = NULL;
5230 -+ entity->service = 0;
5231 -+ }
5232 -+
5233 -+ return;
5234 -+}
5235 -+
5236 -+static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
5237 -+{
5238 -+ if (bfqd->active_bic != NULL) {
5239 -+ put_io_context(bfqd->active_bic->icq.ioc);
5240 -+ bfqd->active_bic = NULL;
5241 -+ }
5242 -+
5243 -+ bfqd->active_queue = NULL;
5244 -+ del_timer(&bfqd->idle_slice_timer);
5245 -+}
5246 -+
5247 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5248 -+ int requeue)
5249 -+{
5250 -+ struct bfq_entity *entity = &bfqq->entity;
5251 -+
5252 -+ if (bfqq == bfqd->active_queue)
5253 -+ __bfq_bfqd_reset_active(bfqd);
5254 -+
5255 -+ bfq_deactivate_entity(entity, requeue);
5256 -+}
5257 -+
5258 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5259 -+{
5260 -+ struct bfq_entity *entity = &bfqq->entity;
5261 -+
5262 -+ bfq_activate_entity(entity);
5263 -+}
5264 -+
5265 -+/*
5266 -+ * Called when the bfqq no longer has requests pending, remove it from
5267 -+ * the service tree.
5268 -+ */
5269 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5270 -+ int requeue)
5271 -+{
5272 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
5273 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5274 -+
5275 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5276 -+
5277 -+ bfq_clear_bfqq_busy(bfqq);
5278 -+
5279 -+ BUG_ON(bfqd->busy_queues == 0);
5280 -+ bfqd->busy_queues--;
5281 -+
5282 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5283 -+}
5284 -+
5285 -+/*
5286 -+ * Called when an inactive queue receives a new request.
5287 -+ */
5288 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5289 -+{
5290 -+ BUG_ON(bfq_bfqq_busy(bfqq));
5291 -+ BUG_ON(bfqq == bfqd->active_queue);
5292 -+
5293 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5294 -+
5295 -+ bfq_activate_bfqq(bfqd, bfqq);
5296 -+
5297 -+ bfq_mark_bfqq_busy(bfqq);
5298 -+ bfqd->busy_queues++;
5299 -+}
5300 -diff --git a/block/bfq.h b/block/bfq.h
5301 -new file mode 100644
5302 -index 0000000..48ecde9
5303 ---- /dev/null
5304 -+++ b/block/bfq.h
5305 -@@ -0,0 +1,603 @@
5306 -+/*
5307 -+ * BFQ-v6r2 for 3.10.0: data structures and common functions prototypes.
5308 -+ *
5309 -+ * Based on ideas and code from CFQ:
5310 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5311 -+ *
5312 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5313 -+ * Paolo Valente <paolo.valente@×××××××.it>
5314 -+ *
5315 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5316 -+ */
5317 -+
5318 -+#ifndef _BFQ_H
5319 -+#define _BFQ_H
5320 -+
5321 -+#include <linux/blktrace_api.h>
5322 -+#include <linux/hrtimer.h>
5323 -+#include <linux/ioprio.h>
5324 -+#include <linux/rbtree.h>
5325 -+
5326 -+#define BFQ_IOPRIO_CLASSES 3
5327 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5328 -+
5329 -+#define BFQ_MIN_WEIGHT 1
5330 -+#define BFQ_MAX_WEIGHT 1000
5331 -+
5332 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
5333 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
5334 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5335 -+
5336 -+struct bfq_entity;
5337 -+
5338 -+/**
5339 -+ * struct bfq_service_tree - per ioprio_class service tree.
5340 -+ * @active: tree for active entities (i.e., those backlogged).
5341 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5342 -+ * @first_idle: idle entity with minimum F_i.
5343 -+ * @last_idle: idle entity with maximum F_i.
5344 -+ * @vtime: scheduler virtual time.
5345 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5346 -+ *
5347 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5348 -+ * ioprio_class has its own independent scheduler, and so its own
5349 -+ * bfq_service_tree. All the fields are protected by the queue lock
5350 -+ * of the containing bfqd.
5351 -+ */
5352 -+struct bfq_service_tree {
5353 -+ struct rb_root active;
5354 -+ struct rb_root idle;
5355 -+
5356 -+ struct bfq_entity *first_idle;
5357 -+ struct bfq_entity *last_idle;
5358 -+
5359 -+ u64 vtime;
5360 -+ unsigned long wsum;
5361 -+};
5362 -+
5363 -+/**
5364 -+ * struct bfq_sched_data - multi-class scheduler.
5365 -+ * @active_entity: entity under service.
5366 -+ * @next_active: head-of-the-line entity in the scheduler.
5367 -+ * @service_tree: array of service trees, one per ioprio_class.
5368 -+ *
5369 -+ * bfq_sched_data is the basic scheduler queue. It supports three
5370 -+ * ioprio_classes, and can be used either as a toplevel queue or as
5371 -+ * an intermediate queue on a hierarchical setup.
5372 -+ * @next_active points to the active entity of the sched_data service
5373 -+ * trees that will be scheduled next.
5374 -+ *
5375 -+ * The supported ioprio_classes are the same as in CFQ, in descending
5376 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5377 -+ * Requests from higher priority queues are served before all the
5378 -+ * requests from lower priority queues; among requests of the same
5379 -+ * queue requests are served according to B-WF2Q+.
5380 -+ * All the fields are protected by the queue lock of the containing bfqd.
5381 -+ */
5382 -+struct bfq_sched_data {
5383 -+ struct bfq_entity *active_entity;
5384 -+ struct bfq_entity *next_active;
5385 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5386 -+};
5387 -+
5388 -+/**
5389 -+ * struct bfq_entity - schedulable entity.
5390 -+ * @rb_node: service_tree member.
5391 -+ * @on_st: flag, true if the entity is on a tree (either the active or
5392 -+ * the idle one of its service_tree).
5393 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5394 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
5395 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5396 -+ * @min_start: minimum start time of the (active) subtree rooted at
5397 -+ * this entity; used for O(log N) lookups into active trees.
5398 -+ * @service: service received during the last round of service.
5399 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5400 -+ * @weight: weight of the queue
5401 -+ * @parent: parent entity, for hierarchical scheduling.
5402 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5403 -+ * associated scheduler queue, %NULL on leaf nodes.
5404 -+ * @sched_data: the scheduler queue this entity belongs to.
5405 -+ * @ioprio: the ioprio in use.
5406 -+ * @new_weight: when a weight change is requested, the new weight value.
5407 -+ * @orig_weight: original weight, used to implement weight boosting
5408 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5409 -+ * @ioprio_class: the ioprio_class in use.
5410 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
5411 -+ * ioprio_class value.
5412 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5413 -+ * ioprio_class change.
5414 -+ *
5415 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5416 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5417 -+ * entity belongs to the sched_data of the parent group in the cgroup
5418 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
5419 -+ * in @my_sched_data.
5420 -+ *
5421 -+ * Each entity stores independently its priority values; this would
5422 -+ * allow different weights on different devices, but this
5423 -+ * functionality is not exported to userspace by now. Priorities and
5424 -+ * functionality is not yet exported to userspace. Priorities and
5425 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
5426 -+ * there is a transition in the entity state that allows the priority
5427 -+ * update to take place the effective and the requested priority
5428 -+ * values are synchronized.
5429 -+ *
5430 -+ * Unless cgroups are used, the weight value is calculated from the
5431 -+ * ioprio to export the same interface as CFQ. When dealing with
5432 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
5433 -+ * time to consume their budget and have true sequential behavior, and
5434 -+ * when there are no external factors breaking anticipation) the
5435 -+ * relative weights at each level of the cgroups hierarchy should be
5436 -+ * guaranteed. All the fields are protected by the queue lock of the
5437 -+ * containing bfqd.
5438 -+ */
5439 -+struct bfq_entity {
5440 -+ struct rb_node rb_node;
5441 -+
5442 -+ int on_st;
5443 -+
5444 -+ u64 finish;
5445 -+ u64 start;
5446 -+
5447 -+ struct rb_root *tree;
5448 -+
5449 -+ u64 min_start;
5450 -+
5451 -+ unsigned long service, budget;
5452 -+ unsigned short weight, new_weight;
5453 -+ unsigned short orig_weight;
5454 -+
5455 -+ struct bfq_entity *parent;
5456 -+
5457 -+ struct bfq_sched_data *my_sched_data;
5458 -+ struct bfq_sched_data *sched_data;
5459 -+
5460 -+ unsigned short ioprio, new_ioprio;
5461 -+ unsigned short ioprio_class, new_ioprio_class;
5462 -+
5463 -+ int ioprio_changed;
5464 -+};
5465 -+
5466 -+struct bfq_group;
5467 -+
5468 -+/**
5469 -+ * struct bfq_queue - leaf schedulable entity.
5470 -+ * @ref: reference counter.
5471 -+ * @bfqd: parent bfq_data.
5472 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
5473 -+ * one or more other queues.
5474 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5475 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5476 -+ * @sort_list: sorted list of pending requests.
5477 -+ * @next_rq: if fifo isn't expired, next request to serve.
5478 -+ * @queued: nr of requests queued in @sort_list.
5479 -+ * @allocated: currently allocated requests.
5480 -+ * @meta_pending: pending metadata requests.
5481 -+ * @fifo: fifo list of requests in sort_list.
5482 -+ * @entity: entity representing this queue in the scheduler.
5483 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
5484 -+ * @budget_timeout: budget expiration (in jiffies).
5485 -+ * @dispatched: number of requests on the dispatch list or inside driver.
5486 -+ * @org_ioprio: saved ioprio during boosted periods.
5487 -+ * @flags: status flags.
5488 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5489 -+ * @seek_samples: number of seeks sampled
5490 -+ * @seek_total: sum of the distances of the seeks sampled
5491 -+ * @seek_mean: mean seek distance
5492 -+ * @last_request_pos: position of the last request enqueued
5493 -+ * @pid: pid of the process owning the queue, used for logging purposes.
5494 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
5495 -+ * @raising_cur_max_time: current max raising time for this queue
5496 -+ *
5497 -+ * A bfq_queue is a leaf request queue; it can be associated with an io_context
5498 -+ * or more (if it is an async one). @cgroup holds a reference to the
5499 -+ * cgroup, to be sure that it does not disappear while a bfqq still
5500 -+ * references it (mostly to avoid races between request issuing and task
5501 -+ * migration followed by cgroup destruction).
5502 -+ * All the fields are protected by the queue lock of the containing bfqd.
5503 -+ */
5504 -+struct bfq_queue {
5505 -+ atomic_t ref;
5506 -+ struct bfq_data *bfqd;
5507 -+
5508 -+ /* fields for cooperating queues handling */
5509 -+ struct bfq_queue *new_bfqq;
5510 -+ struct rb_node pos_node;
5511 -+ struct rb_root *pos_root;
5512 -+
5513 -+ struct rb_root sort_list;
5514 -+ struct request *next_rq;
5515 -+ int queued[2];
5516 -+ int allocated[2];
5517 -+ int meta_pending;
5518 -+ struct list_head fifo;
5519 -+
5520 -+ struct bfq_entity entity;
5521 -+
5522 -+ unsigned long max_budget;
5523 -+ unsigned long budget_timeout;
5524 -+
5525 -+ int dispatched;
5526 -+
5527 -+ unsigned short org_ioprio;
5528 -+
5529 -+ unsigned int flags;
5530 -+
5531 -+ struct list_head bfqq_list;
5532 -+
5533 -+ unsigned int seek_samples;
5534 -+ u64 seek_total;
5535 -+ sector_t seek_mean;
5536 -+ sector_t last_request_pos;
5537 -+
5538 -+ pid_t pid;
5539 -+
5540 -+ /* weight-raising fields */
5541 -+ unsigned int raising_cur_max_time;
5542 -+ u64 last_rais_start_finish, soft_rt_next_start;
5543 -+ unsigned int raising_coeff;
5544 -+};
5545 -+
5546 -+/**
5547 -+ * struct bfq_ttime - per process thinktime stats.
5548 -+ * @ttime_total: total process thinktime
5549 -+ * @ttime_samples: number of thinktime samples
5550 -+ * @ttime_mean: average process thinktime
5551 -+ */
5552 -+struct bfq_ttime {
5553 -+ unsigned long last_end_request;
5554 -+
5555 -+ unsigned long ttime_total;
5556 -+ unsigned long ttime_samples;
5557 -+ unsigned long ttime_mean;
5558 -+};
5559 -+
5560 -+/**
5561 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
5562 -+ * @icq: associated io_cq structure
5563 -+ * @bfqq: array of two process queues, the sync and the async
5564 -+ * @ttime: associated @bfq_ttime struct
5565 -+ */
5566 -+struct bfq_io_cq {
5567 -+ struct io_cq icq; /* must be the first member */
5568 -+ struct bfq_queue *bfqq[2];
5569 -+ struct bfq_ttime ttime;
5570 -+ int ioprio;
5571 -+};
5572 -+
5573 -+/**
5574 -+ * struct bfq_data - per device data structure.
5575 -+ * @queue: request queue for the managed device.
5576 -+ * @root_group: root bfq_group for the device.
5577 -+ * @rq_pos_tree: rbtree sorted by next_request position,
5578 -+ * used when determining if two or more queues
5579 -+ * have interleaving requests (see bfq_close_cooperator).
5580 -+ * @busy_queues: number of bfq_queues containing requests (including the
5581 -+ * queue under service, even if it is idling).
5582 -+ * @queued: number of queued requests.
5583 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
5584 -+ * @sync_flight: number of sync requests in the driver.
5585 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5586 -+ * completed requests.
5587 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5588 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5589 -+ * @budgets_assigned: number of budgets assigned.
5590 -+ * @idle_slice_timer: timer set when idling for the next sequential request
5591 -+ * from the queue under service.
5592 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
5593 -+ * @active_queue: bfq_queue under service.
5594 -+ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
5595 -+ * @last_position: on-disk position of the last served request.
5596 -+ * @last_budget_start: beginning of the last budget.
5597 -+ * @last_idling_start: beginning of the last idle slice.
5598 -+ * @peak_rate: peak transfer rate observed for a budget.
5599 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5600 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5601 -+ * @group_list: list of all the bfq_groups active on the device.
5602 -+ * @active_list: list of all the bfq_queues active on the device.
5603 -+ * @idle_list: list of all the bfq_queues idle on the device.
5604 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
5605 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5606 -+ * requests are served in fifo order.
5607 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5608 -+ * @bfq_back_max: maximum allowed backward seek.
5609 -+ * @bfq_slice_idle: maximum idling time.
5610 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5611 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5612 -+ * async queues.
5613 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5614 -+ * prevent seeky queues from imposing long latencies to well
5615 -+ * behaved ones (this also implies that seeky queues cannot
5616 -+ * receive guarantees in the service domain; after a timeout
5617 -+ * they are charged for the whole allocated budget, to try
5618 -+ * to preserve a behavior reasonably fair among them, but
5619 -+ * without service-domain guarantees).
5620 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5621 -+ * queue is multiplied
5622 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5623 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5624 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5625 -+ * may be reactivated for a queue (in jiffies)
5626 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5627 -+ * after which weight-raising may be
5628 -+ * reactivated for an already busy queue
5629 -+ * (in jiffies)
5630 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5631 -+ * sectors per second
5632 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
5633 -+ * duration of the weight raising automatically
5634 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5635 -+ *
5636 -+ * All the fields are protected by the @queue lock.
5637 -+ */
5638 -+struct bfq_data {
5639 -+ struct request_queue *queue;
5640 -+
5641 -+ struct bfq_group *root_group;
5642 -+
5643 -+ struct rb_root rq_pos_tree;
5644 -+
5645 -+ int busy_queues;
5646 -+ int queued;
5647 -+ int rq_in_driver;
5648 -+ int sync_flight;
5649 -+
5650 -+ int max_rq_in_driver;
5651 -+ int hw_tag_samples;
5652 -+ int hw_tag;
5653 -+
5654 -+ int budgets_assigned;
5655 -+
5656 -+ struct timer_list idle_slice_timer;
5657 -+ struct work_struct unplug_work;
5658 -+
5659 -+ struct bfq_queue *active_queue;
5660 -+ struct bfq_io_cq *active_bic;
5661 -+
5662 -+ sector_t last_position;
5663 -+
5664 -+ ktime_t last_budget_start;
5665 -+ ktime_t last_idling_start;
5666 -+ int peak_rate_samples;
5667 -+ u64 peak_rate;
5668 -+ unsigned long bfq_max_budget;
5669 -+
5670 -+ struct hlist_head group_list;
5671 -+ struct list_head active_list;
5672 -+ struct list_head idle_list;
5673 -+
5674 -+ unsigned int bfq_quantum;
5675 -+ unsigned int bfq_fifo_expire[2];
5676 -+ unsigned int bfq_back_penalty;
5677 -+ unsigned int bfq_back_max;
5678 -+ unsigned int bfq_slice_idle;
5679 -+ u64 bfq_class_idle_last_service;
5680 -+
5681 -+ unsigned int bfq_user_max_budget;
5682 -+ unsigned int bfq_max_budget_async_rq;
5683 -+ unsigned int bfq_timeout[2];
5684 -+
5685 -+ bool low_latency;
5686 -+
5687 -+ /* parameters of the low_latency heuristics */
5688 -+ unsigned int bfq_raising_coeff;
5689 -+ unsigned int bfq_raising_max_time;
5690 -+ unsigned int bfq_raising_rt_max_time;
5691 -+ unsigned int bfq_raising_min_idle_time;
5692 -+ unsigned int bfq_raising_min_inter_arr_async;
5693 -+ unsigned int bfq_raising_max_softrt_rate;
5694 -+ u64 RT_prod;
5695 -+
5696 -+ struct bfq_queue oom_bfqq;
5697 -+};
5698 -+
5699 -+enum bfqq_state_flags {
5700 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
5701 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
5702 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
5703 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
5704 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
5705 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
5706 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
5707 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
5708 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
5709 -+	BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
5710 -+ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
5711 -+};
5712 -+
5713 -+#define BFQ_BFQQ_FNS(name) \
5714 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
5715 -+{ \
5716 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
5717 -+} \
5718 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
5719 -+{ \
5720 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
5721 -+} \
5722 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
5723 -+{ \
5724 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
5725 -+}
5726 -+
5727 -+BFQ_BFQQ_FNS(busy);
5728 -+BFQ_BFQQ_FNS(wait_request);
5729 -+BFQ_BFQQ_FNS(must_alloc);
5730 -+BFQ_BFQQ_FNS(fifo_expire);
5731 -+BFQ_BFQQ_FNS(idle_window);
5732 -+BFQ_BFQQ_FNS(prio_changed);
5733 -+BFQ_BFQQ_FNS(sync);
5734 -+BFQ_BFQQ_FNS(budget_new);
5735 -+BFQ_BFQQ_FNS(coop);
5736 -+BFQ_BFQQ_FNS(split_coop);
5737 -+BFQ_BFQQ_FNS(some_coop_idle);
5738 -+#undef BFQ_BFQQ_FNS
5739 -+
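The BFQ_BFQQ_FNS() macro above stamps out a mark/clear/test triple for each flag bit. The same pattern, reduced to a self-contained example with a hypothetical struct and flag set:

    #include <stdio.h>

    struct toy_queue {
            unsigned int flags;
    };

    enum toy_state_flags {
            TOY_FLAG_busy = 0,
            TOY_FLAG_sync,
    };

    #define TOY_FNS(name)                                                   \
    static inline void toy_mark_##name(struct toy_queue *q)                 \
    {                                                                       \
            q->flags |= (1U << TOY_FLAG_##name);                            \
    }                                                                       \
    static inline void toy_clear_##name(struct toy_queue *q)                \
    {                                                                       \
            q->flags &= ~(1U << TOY_FLAG_##name);                           \
    }                                                                       \
    static inline int toy_##name(const struct toy_queue *q)                 \
    {                                                                       \
            return (q->flags & (1U << TOY_FLAG_##name)) != 0;               \
    }

    TOY_FNS(busy)
    TOY_FNS(sync)
    #undef TOY_FNS

    int main(void)
    {
            struct toy_queue q = { 0 };

            toy_mark_busy(&q);
            printf("busy=%d sync=%d\n", toy_busy(&q), toy_sync(&q));
            toy_clear_busy(&q);
            printf("busy=%d\n", toy_busy(&q));
            return 0;
    }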
5740 -+/* Logging facilities. */
5741 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
5742 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
5743 -+
5744 -+#define bfq_log(bfqd, fmt, args...) \
5745 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
5746 -+
5747 -+/* Expiration reasons. */
5748 -+enum bfqq_expiration {
5749 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
5750 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
5751 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
5752 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
5753 -+};
5754 -+
5755 -+#ifdef CONFIG_CGROUP_BFQIO
5756 -+/**
5757 -+ * struct bfq_group - per (device, cgroup) data structure.
5758 -+ * @entity: schedulable entity to insert into the parent group sched_data.
5759 -+ * @sched_data: own sched_data, to contain child entities (they may be
5760 -+ * both bfq_queues and bfq_groups).
5761 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
5762 -+ * list of the containing cgroup's bfqio_cgroup.
5763 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
5764 -+ * of the groups active on the same device; used for cleanup.
5765 -+ * @bfqd: the bfq_data for the device this group acts upon.
5766 -+ * @async_bfqq: array of async queues for all the tasks belonging to
5767 -+ * the group, one queue per ioprio value per ioprio_class,
5768 -+ * except for the idle class that has only one queue.
5769 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
5770 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
5771 -+ * to avoid too many special cases during group creation/migration.
5772 -+ *
5773 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
5774 -+ * there is a set of bfq_groups, each one collecting the lower-level
5775 -+ * entities belonging to the group that are acting on the same device.
5776 -+ *
5777 -+ * Locking works as follows:
5778 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
5779 -+ * via RCU from its readers.
5780 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
5781 -+ * from the readers.
5782 -+ * o All the other fields are protected by the @bfqd queue lock.
5783 -+ */
5784 -+struct bfq_group {
5785 -+ struct bfq_entity entity;
5786 -+ struct bfq_sched_data sched_data;
5787 -+
5788 -+ struct hlist_node group_node;
5789 -+ struct hlist_node bfqd_node;
5790 -+
5791 -+ void *bfqd;
5792 -+
5793 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5794 -+ struct bfq_queue *async_idle_bfqq;
5795 -+
5796 -+ struct bfq_entity *my_entity;
5797 -+};
5798 -+
5799 -+/**
5800 -+ * struct bfqio_cgroup - bfq cgroup data structure.
5801 -+ * @css: subsystem state for bfq in the containing cgroup.
5802 -+ * @weight: cgroup weight.
5803 -+ * @ioprio: cgroup ioprio.
5804 -+ * @ioprio_class: cgroup ioprio_class.
5805 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
5806 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
5807 -+ *
5808 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
5809 -+ * @ioprio and @ioprio_class are protected by @lock.
5810 -+ */
5811 -+struct bfqio_cgroup {
5812 -+ struct cgroup_subsys_state css;
5813 -+
5814 -+ unsigned short weight, ioprio, ioprio_class;
5815 -+
5816 -+ spinlock_t lock;
5817 -+ struct hlist_head group_data;
5818 -+};
5819 -+#else
5820 -+struct bfq_group {
5821 -+ struct bfq_sched_data sched_data;
5822 -+
5823 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5824 -+ struct bfq_queue *async_idle_bfqq;
5825 -+};
5826 -+#endif
5827 -+
5828 -+static inline struct bfq_service_tree *
5829 -+bfq_entity_service_tree(struct bfq_entity *entity)
5830 -+{
5831 -+ struct bfq_sched_data *sched_data = entity->sched_data;
5832 -+ unsigned int idx = entity->ioprio_class - 1;
5833 -+
5834 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
5835 -+ BUG_ON(sched_data == NULL);
5836 -+
5837 -+ return sched_data->service_tree + idx;
5838 -+}
5839 -+
5840 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
5841 -+ int is_sync)
5842 -+{
5843 -+ return bic->bfqq[!!is_sync];
5844 -+}
5845 -+
5846 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
5847 -+ struct bfq_queue *bfqq, int is_sync)
5848 -+{
5849 -+ bic->bfqq[!!is_sync] = bfqq;
5850 -+}
5851 -+
5852 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
5853 -+{
5854 -+ return bic->icq.q->elevator->elevator_data;
5855 -+}
5856 -+
5857 -+/**
5858 -+ * bfq_get_bfqd_locked - get a lock on a bfqd using an RCU-protected pointer.
5859 -+ * @ptr: a pointer to a bfqd.
5860 -+ * @flags: storage for the flags to be saved.
5861 -+ *
5862 -+ * This function allows bfqg->bfqd to be protected by the
5863 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
5864 -+ * under RCU, so the storage for bfqd is assured to be safe as long
5865 -+ * as the RCU read side critical section does not end. After the
5866 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
5867 -+ * sure that no other writer accessed it. If we raced with a writer,
5868 -+ * the function returns NULL, with the queue unlocked, otherwise it
5869 -+ * returns the dereferenced pointer, with the queue locked.
5870 -+ */
5871 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
5872 -+ unsigned long *flags)
5873 -+{
5874 -+ struct bfq_data *bfqd;
5875 -+
5876 -+ rcu_read_lock();
5877 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
5878 -+
5879 -+ if (bfqd != NULL) {
5880 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
5881 -+ if (*ptr == bfqd)
5882 -+ goto out;
5883 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
5884 -+ }
5885 -+
5886 -+ bfqd = NULL;
5887 -+out:
5888 -+ rcu_read_unlock();
5889 -+ return bfqd;
5890 -+}
5891 -+
5892 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
5893 -+ unsigned long *flags)
5894 -+{
5895 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
5896 -+}
5897 -+
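bfq_get_bfqd_locked() is a lock-then-recheck idiom: dereference the pointer, take the object's lock, and verify the pointer still refers to the same object, bailing out if a writer won the race. A userspace approximation using C11 atomics and a pthread mutex (invented toy_* names; build with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct toy_bfqd {
            pthread_mutex_t lock;
            int data;
    };

    /* Pointer that a writer may retarget or clear concurrently. */
    static _Atomic(struct toy_bfqd *) shared_ptr;

    /* Lock the object currently referenced by shared_ptr, rechecking the
     * pointer after taking the lock; return NULL if we raced with a writer. */
    static struct toy_bfqd *toy_get_locked(void)
    {
            struct toy_bfqd *d = atomic_load(&shared_ptr);

            if (d == NULL)
                    return NULL;
            pthread_mutex_lock(&d->lock);
            if (atomic_load(&shared_ptr) == d)
                    return d;               /* still valid, keep the lock */
            pthread_mutex_unlock(&d->lock); /* lost the race */
            return NULL;
    }

    int main(void)
    {
            struct toy_bfqd d = { PTHREAD_MUTEX_INITIALIZER, 42 };
            struct toy_bfqd *p;

            atomic_store(&shared_ptr, &d);
            p = toy_get_locked();
            if (p) {
                    printf("locked, data=%d\n", p->data);
                    pthread_mutex_unlock(&p->lock);
            }
            return 0;
    }

This only mimics the recheck step; it does not reproduce RCU, which is what actually keeps the object's storage alive between the load and the lock in the kernel code above.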
5898 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
5899 -+static void bfq_put_queue(struct bfq_queue *bfqq);
5900 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
5901 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
5902 -+ struct bfq_group *bfqg, int is_sync,
5903 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
5904 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
5905 -+ struct bfq_group *bfqg);
5906 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
5907 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
5908 -+#endif
5909 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
5910 -index ffa1d1f..e5e6b0d 100644
5911 ---- a/include/linux/cgroup_subsys.h
5912 -+++ b/include/linux/cgroup_subsys.h
5913 -@@ -85,7 +85,7 @@ SUBSYS(bcache)
5914 -
5915 - /* */
5916 -
5917 --#ifdef CONFIG_CGROUP_BFQIO
5918 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
5919 - SUBSYS(bfqio)
5920 - #endif
5921 -
5922 ---
5923 -1.8.1.4
5924 -
5925
5926 Deleted: genpatches-2.6/trunk/3.11/1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.10.0.patch1
5927 ===================================================================
5928 --- genpatches-2.6/trunk/3.11/1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.10.0.patch1 2013-09-02 23:07:59 UTC (rev 2507)
5929 +++ genpatches-2.6/trunk/3.11/1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.10.0.patch1 2013-09-02 23:10:55 UTC (rev 2508)
5930 @@ -1,1049 +0,0 @@
5931 -From 9204dcb026a40cd2cb4310fecf788924d0fbec8d Mon Sep 17 00:00:00 2001
5932 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
5933 -Date: Fri, 14 Jun 2013 13:46:47 +0200
5934 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for
5935 - 3.10.0
5936 -
5937 -A set of processes may happen to perform interleaved reads, i.e., requests
5938 -whose union would give rise to a sequential read pattern. There are two
5939 -typical cases: in the first case, processes read fixed-size chunks of
5940 -data at a fixed distance from each other, while in the second case processes
5941 -may read variable-size chunks at variable distances. The latter case occurs
5942 -for example with KVM, which splits the I/O generated by the guest into
5943 -multiple chunks, and lets these chunks be served by a pool of cooperating
5944 -processes, iteratively assigning the next chunk of I/O to the first
5945 -available process. CFQ uses actual queue merging for the first type of
5946 -processes, whereas it uses preemption to get a sequential read pattern out
5947 -of the read requests performed by the second type of processes. In the end
5948 -it uses two different mechanisms to achieve the same goal: boosting the
5949 -throughput with interleaved I/O.
5950 -
5951 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
5952 -sequential read pattern with both types of processes. The main idea is
5953 -checking newly arrived requests against the next request of the active queue
5954 -both in case of actual request insert and in case of request merge. By doing
5955 -so, both the types of processes can be handled by just merging their queues.
5956 -EQM is then simpler and more compact than the pair of mechanisms used in
5957 -CFQ.
5958 -
5959 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
5960 -properly restoring the weight-raising state of a queue when it gets back to
5961 -a non-merged state.
5962 -
5963 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
5964 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
5965 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
5966 ----
5967 - block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++----------------
5968 - block/bfq-sched.c | 28 ---
5969 - block/bfq.h | 16 ++
5970 - 3 files changed, 466 insertions(+), 231 deletions(-)
5971 -
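The early-merge decision described above ultimately rests on simple sector arithmetic: an I/O position is considered close to a reference sector when their distance is within a seek threshold, and close queues become merge candidates (see bfq_dist_from()/bfq_rq_close_to_sector() in the hunks below). A stripped-down illustration, with an assumed threshold value standing in for BFQQ_SEEK_THR:

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned long long toy_sector_t;

    #define TOY_SEEK_THR 1024ULL    /* placeholder for BFQQ_SEEK_THR */

    static toy_sector_t toy_dist_from(toy_sector_t pos1, toy_sector_t pos2)
    {
            return pos1 >= pos2 ? pos1 - pos2 : pos2 - pos1;
    }

    /* A position is "close" to a reference sector if it is within the
     * seek threshold, as in bfq_rq_close_to_sector(). */
    static bool toy_close_to_sector(toy_sector_t pos, toy_sector_t sector)
    {
            return toy_dist_from(pos, sector) <= TOY_SEEK_THR;
    }

    int main(void)
    {
            toy_sector_t last_position = 100000;

            /* A KVM-style worker issuing the next chunk of a sequential
             * stream lands close to the last served position... */
            printf("%d\n", toy_close_to_sector(100512, last_position));
            /* ...while an unrelated random read does not. */
            printf("%d\n", toy_close_to_sector(900000, last_position));
            return 0;
    }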
5972 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
5973 -index b230927..bc57923 100644
5974 ---- a/block/bfq-iosched.c
5975 -+++ b/block/bfq-iosched.c
5976 -@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
5977 - return dur;
5978 - }
5979 -
5980 -+static inline void
5981 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
5982 -+{
5983 -+ if (bic->saved_idle_window)
5984 -+ bfq_mark_bfqq_idle_window(bfqq);
5985 -+ else
5986 -+ bfq_clear_bfqq_idle_window(bfqq);
5987 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
5988 -+ /*
5989 -+ * Start a weight raising period with the duration given by
5990 -+ * the raising_time_left snapshot.
5991 -+ */
5992 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
5993 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
5994 -+ bfqq->last_rais_start_finish = jiffies;
5995 -+ }
5996 -+ /*
5997 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
5998 -+ * getting confused about the queue's need of a weight-raising
5999 -+ * period.
6000 -+ */
6001 -+ bic->raising_time_left = 0;
6002 -+}
6003 -+
6004 -+/*
6005 -+ * Must be called with the queue_lock held.
6006 -+ */
6007 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
6008 -+{
6009 -+ int process_refs, io_refs;
6010 -+
6011 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
6012 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
6013 -+ BUG_ON(process_refs < 0);
6014 -+ return process_refs;
6015 -+}
6016 -+
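bfqq_process_refs() above isolates the references held by processes by subtracting the request-held and service-tree references from the total refcount. In plain arithmetic, with made-up numbers:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            int total_refs = 7;             /* atomic_read(&bfqq->ref)            */
            int io_refs    = 3 + 2;         /* allocated[READ] + allocated[WRITE] */
            int on_st      = 1;             /* entity queued on a service tree    */
            int process_refs = total_refs - io_refs - on_st;

            assert(process_refs >= 0);
            printf("process_refs=%d\n", process_refs); /* prints 1 */
            return 0;
    }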
6017 - static void bfq_add_rq_rb(struct request *rq)
6018 - {
6019 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
6020 -@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq)
6021 - if (! bfqd->low_latency)
6022 - goto add_bfqq_busy;
6023 -
6024 -+ if (bfq_bfqq_just_split(bfqq))
6025 -+ goto set_ioprio_changed;
6026 -+
6027 - /*
6028 -- * If the queue is not being boosted and has been idle
6029 -- * for enough time, start a weight-raising period
6030 -+ * If the queue:
6031 -+ * - is not being boosted,
6032 -+ * - has been idle for enough time,
6033 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
6034 -+ * shared "for its nature" or it is not shared and its
6035 -+ * shared "by nature" or it is not shared and its
6036 -+ * start a weight-raising period.
6037 - */
6038 -- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
6039 -+ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
6040 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
6041 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
6042 - if (idle_for_long_time)
6043 - bfqq->raising_cur_max_time =
6044 -@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq)
6045 - raising_cur_max_time));
6046 - }
6047 - }
6048 -+set_ioprio_changed:
6049 - if (old_raising_coeff != bfqq->raising_coeff)
6050 - entity->ioprio_changed = 1;
6051 - add_bfqq_busy:
6052 -@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
6053 - spin_unlock_irq(bfqd->queue->queue_lock);
6054 - }
6055 -
6056 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
6057 -- struct bio *bio)
6058 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
6059 - {
6060 -- struct bfq_data *bfqd = q->elevator->elevator_data;
6061 -- struct bfq_io_cq *bic;
6062 -- struct bfq_queue *bfqq;
6063 --
6064 -- /*
6065 -- * Disallow merge of a sync bio into an async request.
6066 -- */
6067 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
6068 -- return 0;
6069 --
6070 -- /*
6071 -- * Lookup the bfqq that this bio will be queued with. Allow
6072 -- * merge only if rq is queued there.
6073 -- * Queue lock is held here.
6074 -- */
6075 -- bic = bfq_bic_lookup(bfqd, current->io_context);
6076 -- if (bic == NULL)
6077 -- return 0;
6078 --
6079 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
6080 -- return bfqq == RQ_BFQQ(rq);
6081 --}
6082 --
6083 --static void __bfq_set_active_queue(struct bfq_data *bfqd,
6084 -- struct bfq_queue *bfqq)
6085 --{
6086 -- if (bfqq != NULL) {
6087 -- bfq_mark_bfqq_must_alloc(bfqq);
6088 -- bfq_mark_bfqq_budget_new(bfqq);
6089 -- bfq_clear_bfqq_fifo_expire(bfqq);
6090 --
6091 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
6092 --
6093 -- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
6094 -- bfqq->entity.budget);
6095 -- }
6096 --
6097 -- bfqd->active_queue = bfqq;
6098 --}
6099 --
6100 --/*
6101 -- * Get and set a new active queue for service.
6102 -- */
6103 --static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
6104 -- struct bfq_queue *bfqq)
6105 --{
6106 -- if (!bfqq)
6107 -- bfqq = bfq_get_next_queue(bfqd);
6108 -+ if (request)
6109 -+ return blk_rq_pos(io_struct);
6110 - else
6111 -- bfq_get_next_queue_forced(bfqd, bfqq);
6112 --
6113 -- __bfq_set_active_queue(bfqd, bfqq);
6114 -- return bfqq;
6115 -+ return ((struct bio *)io_struct)->bi_sector;
6116 - }
6117 -
6118 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
6119 -- struct request *rq)
6120 -+static inline sector_t bfq_dist_from(sector_t pos1,
6121 -+ sector_t pos2)
6122 - {
6123 -- if (blk_rq_pos(rq) >= bfqd->last_position)
6124 -- return blk_rq_pos(rq) - bfqd->last_position;
6125 -+ if (pos1 >= pos2)
6126 -+ return pos1 - pos2;
6127 - else
6128 -- return bfqd->last_position - blk_rq_pos(rq);
6129 -+ return pos2 - pos1;
6130 - }
6131 -
6132 --/*
6133 -- * Return true if bfqq has no request pending and rq is close enough to
6134 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
6135 -- * bfqq->next_rq
6136 -- */
6137 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
6138 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
6139 -+ sector_t sector)
6140 - {
6141 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
6142 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
6143 -+ BFQQ_SEEK_THR;
6144 - }
6145 -
6146 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6147 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
6148 - {
6149 - struct rb_root *root = &bfqd->rq_pos_tree;
6150 - struct rb_node *parent, *node;
6151 - struct bfq_queue *__bfqq;
6152 -- sector_t sector = bfqd->last_position;
6153 -
6154 - if (RB_EMPTY_ROOT(root))
6155 - return NULL;
6156 -@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6157 - * position).
6158 - */
6159 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
6160 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
6161 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
6162 - return __bfqq;
6163 -
6164 - if (blk_rq_pos(__bfqq->next_rq) < sector)
6165 -@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6166 - return NULL;
6167 -
6168 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
6169 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
6170 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
6171 - return __bfqq;
6172 -
6173 - return NULL;
6174 -@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6175 - /*
6176 - * bfqd - obvious
6177 - * cur_bfqq - passed in so that we don't decide that the current queue
6178 -- * is closely cooperating with itself.
6179 -- *
6180 -- * We are assuming that cur_bfqq has dispatched at least one request,
6181 -- * and that bfqd->last_position reflects a position on the disk associated
6182 -- * with the I/O issued by cur_bfqq.
6183 -+ * is closely cooperating with itself
6184 -+ * sector - used as a reference point to search for a close queue
6185 - */
6186 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6187 -- struct bfq_queue *cur_bfqq)
6188 -+ struct bfq_queue *cur_bfqq,
6189 -+ sector_t sector)
6190 - {
6191 - struct bfq_queue *bfqq;
6192 -
6193 -@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6194 - * working closely on the same area of the disk. In that case,
6195 - * we can group them together and don't waste time idling.
6196 - */
6197 -- bfqq = bfqq_close(bfqd);
6198 -+ bfqq = bfqq_close(bfqd, sector);
6199 - if (bfqq == NULL || bfqq == cur_bfqq)
6200 - return NULL;
6201 -
6202 -@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6203 - return bfqq;
6204 - }
6205 -
6206 -+static struct bfq_queue *
6207 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6208 -+{
6209 -+ int process_refs, new_process_refs;
6210 -+ struct bfq_queue *__bfqq;
6211 -+
6212 -+ /*
6213 -+ * If there are no process references on the new_bfqq, then it is
6214 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
6215 -+ * may have dropped their last reference (not just their last process
6216 -+ * reference).
6217 -+ */
6218 -+ if (!bfqq_process_refs(new_bfqq))
6219 -+ return NULL;
6220 -+
6221 -+ /* Avoid a circular list and skip interim queue merges. */
6222 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
6223 -+ if (__bfqq == bfqq)
6224 -+ return NULL;
6225 -+ new_bfqq = __bfqq;
6226 -+ }
6227 -+
6228 -+ process_refs = bfqq_process_refs(bfqq);
6229 -+ new_process_refs = bfqq_process_refs(new_bfqq);
6230 -+ /*
6231 -+ * If the process for the bfqq has gone away, there is no
6232 -+ * sense in merging the queues.
6233 -+ */
6234 -+ if (process_refs == 0 || new_process_refs == 0)
6235 -+ return NULL;
6236 -+
6237 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
6238 -+ new_bfqq->pid);
6239 -+
6240 -+ /*
6241 -+ * Merging is just a redirection: the requests of the process owning
6242 -+ * one of the two queues are redirected to the other queue. The latter
6243 -+ * queue, in its turn, is set as shared if this is the first time that
6244 -+ * the requests of some process are redirected to it.
6245 -+ *
6246 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
6247 -+ * are in the context of the process owning bfqq, hence we have the
6248 -+ * io_cq of this process. So we can immediately configure this io_cq
6249 -+ * to redirect the requests of the process to new_bfqq.
6250 -+ *
6251 -+ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of
6252 -+ * new_bfqq is not available, because, if the active queue is shared,
6253 -+ * bfqd->active_bic may not point to the io_cq of the active queue.
6254 -+ * Redirecting the requests of the process owning bfqq to the currently
6255 -+ * active queue is in any case the best option, as we feed the active queue
6256 -+ * with new requests close to the last request served and, by doing so,
6257 -+ * hopefully increase the throughput.
6258 -+ */
6259 -+ bfqq->new_bfqq = new_bfqq;
6260 -+ atomic_add(process_refs, &new_bfqq->ref);
6261 -+ return new_bfqq;
6262 -+}
6263 -+
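The while loop in bfq_setup_merge() follows the ->new_bfqq chain to its final destination and gives up if the chain would loop back to the queue being merged. A minimal list-walking sketch of that check (toy types, not the scheduler's):

    #include <stddef.h>
    #include <stdio.h>

    struct toy_q {
            const char *name;
            struct toy_q *new_q;    /* stand-in for bfqq->new_bfqq */
    };

    /* Follow the merge chain from 'target'; return NULL if it loops back
     * to 'q', otherwise the final queue everything is redirected to. */
    static struct toy_q *toy_resolve_merge(struct toy_q *q, struct toy_q *target)
    {
            struct toy_q *next;

            while ((next = target->new_q) != NULL) {
                    if (next == q)
                            return NULL;    /* would create a cycle */
                    target = next;
            }
            return target;
    }

    int main(void)
    {
            struct toy_q a = { "a", NULL }, b = { "b", NULL }, c = { "c", NULL };
            struct toy_q *dest;

            b.new_q = &c;                    /* b was already merged into c */
            dest = toy_resolve_merge(&a, &b);
            printf("merge a into: %s\n", dest ? dest->name : "(none)");

            c.new_q = &a;                    /* now the chain loops back to a */
            dest = toy_resolve_merge(&a, &b);
            printf("merge a into: %s\n", dest ? dest->name : "(none)");
            return 0;
    }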
6264 -+/*
6265 -+ * Attempt to schedule a merge of bfqq with the currently active queue or
6266 -+ * with a close queue among the scheduled queues.
6267 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
6268 -+ * structure otherwise.
6269 -+ */
6270 -+static struct bfq_queue *
6271 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6272 -+ void *io_struct, bool request)
6273 -+{
6274 -+ struct bfq_queue *active_bfqq, *new_bfqq;
6275 -+
6276 -+ if (bfqq->new_bfqq)
6277 -+ return bfqq->new_bfqq;
6278 -+
6279 -+ if (!io_struct)
6280 -+ return NULL;
6281 -+
6282 -+ active_bfqq = bfqd->active_queue;
6283 -+
6284 -+ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic)
6285 -+ goto check_scheduled;
6286 -+
6287 -+ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq))
6288 -+ goto check_scheduled;
6289 -+
6290 -+ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq))
6291 -+ goto check_scheduled;
6292 -+
6293 -+ if (active_bfqq->entity.parent != bfqq->entity.parent)
6294 -+ goto check_scheduled;
6295 -+
6296 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
6297 -+ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq))
6298 -+ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq)))
6299 -+ return new_bfqq; /* Merge with the active queue */
6300 -+
6301 -+ /*
6302 -+ * Check whether there is a cooperator among currently scheduled
6303 -+ * queues. The only thing we need is that the bio/request is not
6304 -+ * NULL, as we need it to establish whether a cooperator exists.
6305 -+ */
6306 -+check_scheduled:
6307 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
6308 -+ bfq_io_struct_pos(io_struct, request));
6309 -+ if (new_bfqq)
6310 -+ return bfq_setup_merge(bfqq, new_bfqq);
6311 -+
6312 -+ return NULL;
6313 -+}
6314 -+
6315 -+static inline void
6316 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
6317 -+{
6318 -+ /*
6319 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
6320 -+ * have already been redirected to a shared queue; both idle window
6321 -+ * and weight raising state have already been saved. Do nothing.
6322 -+ */
6323 -+ if (bfqq->bic == NULL)
6324 -+ return;
6325 -+ if (bfqq->bic->raising_time_left)
6326 -+ /*
6327 -+ * This is the queue of a just-started process, and would
6328 -+ * deserve weight raising: we set raising_time_left to the full
6329 -+ * weight-raising duration to trigger weight-raising when and
6330 -+ * if the queue is split and the first request of the queue
6331 -+ * is enqueued.
6332 -+ */
6333 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
6334 -+ else if (bfqq->raising_coeff > 1) {
6335 -+ unsigned long wrais_duration =
6336 -+ jiffies - bfqq->last_rais_start_finish;
6337 -+ /*
6338 -+ * It may happen that a queue's weight raising period lasts
6339 -+ * longer than its raising_cur_max_time, as weight raising is
6340 -+ * handled only when a request is enqueued or dispatched (it
6341 -+ * does not use any timer). If the weight raising period is
6342 -+ * about to end, don't save it.
6343 -+ */
6344 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
6345 -+ bfqq->bic->raising_time_left = 0;
6346 -+ else
6347 -+ bfqq->bic->raising_time_left =
6348 -+ bfqq->raising_cur_max_time - wrais_duration;
6349 -+ /*
6350 -+ * The bfq_queue is becoming shared or the requests of the
6351 -+ * process owning the queue are being redirected to a shared
6352 -+ * queue. Stop the weight raising period of the queue, as in
6353 -+ * both cases it should not be owned by an interactive or soft
6354 -+ * real-time application.
6355 -+ */
6356 -+ bfq_bfqq_end_raising(bfqq);
6357 -+ } else
6358 -+ bfqq->bic->raising_time_left = 0;
6359 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
6360 -+}
6361 -+
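The raising_time_left value saved above is just the configured maximum raising time minus the time already elapsed, clamped to zero when the period is over. A small arithmetic sketch with plain counters instead of jiffies:

    #include <stdio.h>

    static unsigned long toy_raising_time_left(unsigned long now,
                                               unsigned long rais_start,
                                               unsigned long cur_max_time)
    {
            unsigned long elapsed = now - rais_start;

            /* Period (about to be) over: nothing left to restore later. */
            if (cur_max_time <= elapsed)
                    return 0;
            return cur_max_time - elapsed;
    }

    int main(void)
    {
            /* 6000-tick raising period started at t=1000, queried at t=3500. */
            printf("left=%lu\n", toy_raising_time_left(3500, 1000, 6000));
            /* Queried after the period expired. */
            printf("left=%lu\n", toy_raising_time_left(9000, 1000, 6000));
            return 0;
    }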
6362 -+static inline void
6363 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
6364 -+{
6365 -+ /*
6366 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
6367 -+ * is about to begin using a shared bfq_queue.
6368 -+ */
6369 -+ if (bfqq->bic)
6370 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
6371 -+}
6372 -+
6373 -+static void
6374 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
6375 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6376 -+{
6377 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
6378 -+ (long unsigned)new_bfqq->pid);
6379 -+ /* Save weight raising and idle window of the merged queues */
6380 -+ bfq_bfqq_save_state(bfqq);
6381 -+ bfq_bfqq_save_state(new_bfqq);
6382 -+ /*
6383 -+ * Grab a reference to the bic, to prevent it from being destroyed
6384 -+ * before being possibly touched by a bfq_split_bfqq().
6385 -+ */
6386 -+ bfq_get_bic_reference(bfqq);
6387 -+ bfq_get_bic_reference(new_bfqq);
6388 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
6389 -+ bic_set_bfqq(bic, new_bfqq, 1);
6390 -+ bfq_mark_bfqq_coop(new_bfqq);
6391 -+ /*
6392 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
6393 -+ * new_bfqq->bic to NULL. bfqq either:
6394 -+ * - does not belong to any bic any more, and hence bfqq->bic must
6395 -+ * be set to NULL, or
6396 -+ * - is a queue whose owning bics have already been redirected to a
6397 -+ * different queue, hence the queue is destined to not belong to any
6398 -+ * bic soon and bfqq->bic is already NULL (therefore the next
6399 -+ * assignment causes no harm).
6400 -+ */
6401 -+ new_bfqq->bic = NULL;
6402 -+ bfqq->bic = NULL;
6403 -+ bfq_put_queue(bfqq);
6404 -+}
6405 -+
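Stripped of the state saving and io_context reference handling above, bfq_merge_bfqqs() redirects the process's bic to the shared queue, marks that queue as cooperating, and drops the reference the bic held on the old queue. A toy reference-counting model with illustrative names (the corresponding calls in the patch are noted in the comments):

#include <assert.h>
#include <stdio.h>

struct toy_bfqq { int refs; int shared; };
struct toy_bic  { struct toy_bfqq *sync_q; };

void toy_merge_queues(struct toy_bic *bic, struct toy_bfqq *old_q,
                      struct toy_bfqq *new_q)
{
	bic->sync_q = new_q;      /* bic_set_bfqq(bic, new_bfqq, 1) */
	new_q->shared = 1;        /* bfq_mark_bfqq_coop(new_bfqq)   */
	old_q->refs--;            /* bfq_put_queue(bfqq)            */
	assert(old_q->refs >= 0);
}

int main(void)
{
	struct toy_bfqq a = { .refs = 2, .shared = 0 };
	struct toy_bfqq b = { .refs = 3, .shared = 0 };
	struct toy_bic bic = { .sync_q = &a };

	toy_merge_queues(&bic, &a, &b);
	printf("old queue refs = %d, new queue shared = %d\n", a.refs, b.shared);
	return 0;
}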
6406 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
6407 -+ struct bio *bio)
6408 -+{
6409 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
6410 -+ struct bfq_io_cq *bic;
6411 -+ struct bfq_queue *bfqq, *new_bfqq;
6412 -+
6413 -+ /*
6414 -+ * Disallow merge of a sync bio into an async request.
6415 -+ */
6416 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
6417 -+ return 0;
6418 -+
6419 -+ /*
6420 -+ * Lookup the bfqq that this bio will be queued with. Allow
6421 -+ * merge only if rq is queued there.
6422 -+ * Queue lock is held here.
6423 -+ */
6424 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
6425 -+ if (bic == NULL)
6426 -+ return 0;
6427 -+
6428 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
6429 -+ /*
6430 -+ * We take advantage of this function to perform an early merge
6431 -+ * of the queues of possible cooperating processes.
6432 -+ */
6433 -+ if (bfqq != NULL &&
6434 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) {
6435 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
6436 -+ /*
6437 -+ * If we get here, the bio will be queued in the shared queue,
6438 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
6439 -+ * rq can be merged.
6440 -+ */
6441 -+ bfqq = new_bfqq;
6442 -+ }
6443 -+
6444 -+ return bfqq == RQ_BFQQ(rq);
6445 -+}
6446 -+
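The allow-merge hook above makes two cheap refusals (a sync bio into an async request, or no bic for the current process) and otherwise permits the merge only when the bio and the request would end up in the same bfq_queue, after taking a possible early queue merge into account. A compact sketch of that decision with hypothetical toy_* types:

#include <stdbool.h>
#include <stddef.h>

struct toy_bfqq { int id; };

/* Refuse sync-into-async; otherwise allow the merge only if the bio and the
 * request end up in the same queue, after a possible early queue merge. */
bool toy_allow_merge(bool bio_sync, bool rq_sync, struct toy_bfqq *bio_q,
                     struct toy_bfqq *shared_q, struct toy_bfqq *rq_q)
{
	if (bio_sync && !rq_sync)
		return false;
	if (bio_q == NULL)          /* no bfq_io_cq for the current process */
		return false;
	if (shared_q != NULL)       /* early merge: the bio will queue here */
		bio_q = shared_q;
	return bio_q == rq_q;
}

int main(void)
{
	struct toy_bfqq q = { 1 }, shared = { 2 };

	/* bio mapped to q, early-merged into shared, request already in shared */
	return toy_allow_merge(true, true, &q, &shared, &shared) ? 0 : 1;
}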
6447 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
6448 -+ struct bfq_queue *bfqq)
6449 -+{
6450 -+ if (bfqq != NULL) {
6451 -+ bfq_mark_bfqq_must_alloc(bfqq);
6452 -+ bfq_mark_bfqq_budget_new(bfqq);
6453 -+ bfq_clear_bfqq_fifo_expire(bfqq);
6454 -+
6455 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
6456 -+
6457 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
6458 -+ bfqq->entity.budget);
6459 -+ }
6460 -+
6461 -+ bfqd->active_queue = bfqq;
6462 -+}
6463 -+
6464 -+/*
6465 -+ * Get and set a new active queue for service.
6466 -+ */
6467 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
6468 -+{
6469 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
6470 -+
6471 -+ __bfq_set_active_queue(bfqd, bfqq);
6472 -+ return bfqq;
6473 -+}
6474 -+
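The budgets_assigned update in __bfq_set_active_queue() above is an integer exponential moving average pulled toward 256: each activation keeps 7/8 of the old value and adds 256/8. A tiny standalone program shows how quickly the counter saturates, which is presumably what the "enough samples" check mentioned just below relies on:

#include <stdio.h>

int main(void)
{
	/* Same recurrence as the line above:
	 *     budgets_assigned = (budgets_assigned * 7 + 256) / 8
	 */
	int budgets_assigned = 0;
	int i;

	for (i = 1; i <= 40; i++)
		budgets_assigned = (budgets_assigned * 7 + 256) / 8;

	/* Integer division makes the counter settle at 249, just below the
	 * 256 it is pulled toward. */
	printf("budgets_assigned after 40 activations: %d\n", budgets_assigned);
	return 0;
}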
6475 - /*
6476 - * If enough samples have been computed, return the current max budget
6477 - * stored in bfqd, which is dynamically updated according to the
6478 -@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
6479 - return rq;
6480 - }
6481 -
6482 --/*
6483 -- * Must be called with the queue_lock held.
6484 -- */
6485 --static int bfqq_process_refs(struct bfq_queue *bfqq)
6486 --{
6487 -- int process_refs, io_refs;
6488 --
6489 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
6490 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
6491 -- BUG_ON(process_refs < 0);
6492 -- return process_refs;
6493 --}
6494 --
6495 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6496 --{
6497 -- int process_refs, new_process_refs;
6498 -- struct bfq_queue *__bfqq;
6499 --
6500 -- /*
6501 -- * If there are no process references on the new_bfqq, then it is
6502 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
6503 -- * may have dropped their last reference (not just their last process
6504 -- * reference).
6505 -- */
6506 -- if (!bfqq_process_refs(new_bfqq))
6507 -- return;
6508 --
6509 -- /* Avoid a circular list and skip interim queue merges. */
6510 -- while ((__bfqq = new_bfqq->new_bfqq)) {
6511 -- if (__bfqq == bfqq)
6512 -- return;
6513 -- new_bfqq = __bfqq;
6514 -- }
6515 --
6516 -- process_refs = bfqq_process_refs(bfqq);
6517 -- new_process_refs = bfqq_process_refs(new_bfqq);
6518 -- /*
6519 -- * If the process for the bfqq has gone away, there is no
6520 -- * sense in merging the queues.
6521 -- */
6522 -- if (process_refs == 0 || new_process_refs == 0)
6523 -- return;
6524 --
6525 -- /*
6526 -- * Merge in the direction of the lesser amount of work.
6527 -- */
6528 -- if (new_process_refs >= process_refs) {
6529 -- bfqq->new_bfqq = new_bfqq;
6530 -- atomic_add(process_refs, &new_bfqq->ref);
6531 -- } else {
6532 -- new_bfqq->new_bfqq = bfqq;
6533 -- atomic_add(new_process_refs, &bfqq->ref);
6534 -- }
6535 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
6536 -- new_bfqq->pid);
6537 --}
6538 --
6539 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
6540 - {
6541 - struct bfq_entity *entity = &bfqq->entity;
6542 -@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
6543 - * is likely to boost the disk throughput);
6544 - * - the queue is weight-raised (waiting for the request is necessary for
6545 - * providing the queue with fairness and latency guarantees).
6546 -+ *
6547 -+ * In any case, idling can be disabled for cooperation issues, if
6548 -+ * 1) there is a close cooperator for the queue, or
6549 -+ * 2) the queue is shared and some cooperator is likely to be idle (in this
6550 -+ * case, by not arming the idle timer, we try to slow down the queue, to
6551 -+ * prevent the zones of the disk accessed by the active cooperators from
6552 -+ * becoming too distant from the zone that will be accessed by the currently
6553 -+ * idle cooperators).
6554 - */
6555 - static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6556 - int budg_timeout)
6557 -@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6558 - (bfqd->rq_in_driver == 0 ||
6559 - budg_timeout ||
6560 - bfqq->raising_coeff > 1) &&
6561 -- !bfq_close_cooperator(bfqd, bfqq) &&
6562 -+ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) &&
6563 - (!bfq_bfqq_coop(bfqq) ||
6564 - !bfq_bfqq_some_coop_idle(bfqq)) &&
6565 - !bfq_queue_nonrot_noidle(bfqd, bfqq));
6566 -@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6567 - */
6568 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6569 - {
6570 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
6571 -+ struct bfq_queue *bfqq;
6572 - struct request *next_rq;
6573 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
6574 - int budg_timeout;
6575 -@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6576 -
6577 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
6578 -
6579 -- /*
6580 -- * If another queue has a request waiting within our mean seek
6581 -- * distance, let it run. The expire code will check for close
6582 -- * cooperators and put the close queue at the front of the
6583 -- * service tree. If possible, merge the expiring queue with the
6584 -- * new bfqq.
6585 -- */
6586 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
6587 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
6588 -- bfq_setup_merge(bfqq, new_bfqq);
6589 --
6590 - budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
6591 - if (budg_timeout &&
6592 - !bfq_bfqq_must_idle(bfqq, budg_timeout))
6593 -@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6594 - bfq_clear_bfqq_wait_request(bfqq);
6595 - del_timer(&bfqd->idle_slice_timer);
6596 - }
6597 -- if (new_bfqq == NULL)
6598 -- goto keep_queue;
6599 -- else
6600 -- goto expire;
6601 -+ goto keep_queue;
6602 - }
6603 - }
6604 -
6605 -@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6606 - * queue still has requests in flight or is idling for a new request,
6607 - * then keep it.
6608 - */
6609 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
6610 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
6611 - (bfqq->dispatched != 0 &&
6612 - (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
6613 -- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
6614 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
6615 - bfqq = NULL;
6616 - goto keep_queue;
6617 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
6618 -- /*
6619 -- * Expiring the queue because there is a close cooperator,
6620 -- * cancel timer.
6621 -- */
6622 -- bfq_clear_bfqq_wait_request(bfqq);
6623 -- del_timer(&bfqd->idle_slice_timer);
6624 - }
6625 -
6626 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
6627 - expire:
6628 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
6629 - new_queue:
6630 -- bfqq = bfq_set_active_queue(bfqd, new_bfqq);
6631 -+ bfqq = bfq_set_active_queue(bfqd);
6632 - bfq_log(bfqd, "select_queue: new queue %d returned",
6633 - bfqq != NULL ? bfqq->pid : 0);
6634 - keep_queue:
6635 -@@ -1617,9 +1807,8 @@ keep_queue:
6636 -
6637 - static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6638 - {
6639 -+ struct bfq_entity *entity = &bfqq->entity;
6640 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
6641 -- struct bfq_entity *entity = &bfqq->entity;
6642 --
6643 - bfq_log_bfqq(bfqd, bfqq,
6644 - "raising period dur %u/%u msec, "
6645 - "old raising coeff %u, w %d(%d)",
6646 -@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6647 - jiffies_to_msecs(bfqq->
6648 - raising_cur_max_time));
6649 - bfq_bfqq_end_raising(bfqq);
6650 -- __bfq_entity_update_weight_prio(
6651 -- bfq_entity_service_tree(entity),
6652 -- entity);
6653 - }
6654 - }
6655 - }
6656 -+ /* Update weight both if it must be raised and if it must be lowered */
6657 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
6658 -+ __bfq_entity_update_weight_prio(
6659 -+ bfq_entity_service_tree(entity),
6660 -+ entity);
6661 - }
6662 -
6663 - /*
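The new condition in update_raising_data() above ("update weight both if it must be raised and if it must be lowered") is just a boolean mismatch test: the weight is recomputed exactly when the entity's current state (weight above its original value) disagrees with whether the queue is currently weight-raised. In toy form, with an illustrative name:

#include <stdbool.h>

/* True exactly when the current weight state disagrees with whether the
 * queue should be raised, covering both directions with a single test. */
bool toy_needs_weight_update(bool weight_above_original, bool queue_is_raised)
{
	return weight_above_original != queue_is_raised;
}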
6664 -@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq)
6665 - struct bfq_io_cq *bic = icq_to_bic(icq);
6666 -
6667 - bic->ttime.last_end_request = jiffies;
6668 -+ /*
6669 -+ * A newly created bic indicates that the process has just
6670 -+ * started doing I/O, and is probably mapping into memory its
6671 -+ * executable and libraries: it definitely needs weight raising.
6672 -+ * There is however the possibility that the process performs,
6673 -+ * for a while, I/O close to some other process. EQM intercepts
6674 -+ * this behavior and may merge the queue corresponding to the
6675 -+ * process with some other queue, BEFORE the weight of the queue
6676 -+ * is raised. Merged queues are not weight-raised (they are assumed
6677 -+ * to belong to processes that benefit only from high throughput).
6678 -+ * If the merge is basically the consequence of an accident, then
6679 -+ * the queue will be split soon and will get back its old weight.
6680 -+ * It is then important to write down somewhere that this queue
6681 -+ * does need weight raising, even if it did not make it to get its
6682 -+ * weight raised before being merged. To this purpose, we overload
6683 -+ * the field raising_time_left and assign 1 to it, to mark the queue
6684 -+ * as needing weight raising.
6685 -+ */
6686 -+ bic->raising_time_left = 1;
6687 - }
6688 -
6689 - static void bfq_exit_icq(struct io_cq *icq)
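The comment above describes an overloaded use of raising_time_left: the nonzero value planted at bic creation works as a "weight raising still owed" flag that survives an early merge. Its lifecycle across creation, first request and merge can be sketched as follows, with hypothetical toy_* names and durations as plain integers (the clearing step corresponds to the bfq_insert_request() hunk further down):

#include <stdio.h>

struct toy_bic { unsigned int raising_time_left; };

/* bfq_init_icq(): new process, mark weight raising as still owed. */
void toy_init_icq(struct toy_bic *bic)
{
	bic->raising_time_left = 1;
}

/* bfq_insert_request(): the queue really started raising, drop the mark. */
void toy_first_request(struct toy_bic *bic)
{
	bic->raising_time_left = 0;
}

/* bfq_bfqq_save_state(): merged while the mark is still set, so remember a
 * full raising period to be granted back when the queue is split again. */
void toy_merge(struct toy_bic *bic, unsigned int full_duration)
{
	if (bic->raising_time_left)
		bic->raising_time_left = full_duration;
}

int main(void)
{
	struct toy_bic bic;

	toy_init_icq(&bic);
	toy_merge(&bic, 6000);  /* merged before issuing its first request */
	printf("raising period to restore on split: %u\n", bic.raising_time_left);
	return 0;
}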
6690 -@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq)
6691 - }
6692 -
6693 - if (bic->bfqq[BLK_RW_SYNC]) {
6694 -+ /*
6695 -+ * If the bic is using a shared queue, put the reference
6696 -+ * taken on the io_context when the bic started using a
6697 -+ * shared bfq_queue.
6698 -+ */
6699 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
6700 -+ put_io_context(icq->ioc);
6701 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
6702 - bic->bfqq[BLK_RW_SYNC] = NULL;
6703 - }
6704 -@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
6705 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
6706 - return;
6707 -
6708 -+ /* Idle window just restored, statistics are meaningless. */
6709 -+ if (bfq_bfqq_just_split(bfqq))
6710 -+ return;
6711 -+
6712 - enable_idle = bfq_bfqq_idle_window(bfqq);
6713 -
6714 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
6715 -@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6716 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
6717 - !BFQQ_SEEKY(bfqq))
6718 - bfq_update_idle_window(bfqd, bfqq, bic);
6719 -+ bfq_clear_bfqq_just_split(bfqq);
6720 -
6721 - bfq_log_bfqq(bfqd, bfqq,
6722 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
6723 -@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6724 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
6725 - {
6726 - struct bfq_data *bfqd = q->elevator->elevator_data;
6727 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
6728 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
6729 -
6730 - assert_spin_locked(bfqd->queue->queue_lock);
6731 -+
6732 -+ /*
6733 -+ * An unplug may trigger a requeue of a request from the device
6734 -+ * driver: make sure we are in process context while trying to
6735 -+ * merge two bfq_queues.
6736 -+ */
6737 -+ if (!in_interrupt() &&
6738 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) {
6739 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
6740 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
6741 -+ /*
6742 -+ * Release the request's reference to the old bfqq
6743 -+ * and make sure one is taken to the shared queue.
6744 -+ */
6745 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
6746 -+ bfqq->allocated[rq_data_dir(rq)]--;
6747 -+ atomic_inc(&new_bfqq->ref);
6748 -+ bfq_put_queue(bfqq);
6749 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
6750 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq);
6751 -+ rq->elv.priv[1] = new_bfqq;
6752 -+ bfqq = new_bfqq;
6753 -+ }
6754 -+
6755 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
6756 -
6757 - bfq_add_rq_rb(rq);
6758 -
6759 -+ /*
6760 -+ * Here a newly-created bfq_queue has already started a weight-raising
6761 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
6762 -+ * from assigning it a full weight-raising period. See the detailed
6763 -+ * comments about this field in bfq_init_icq().
6764 -+ */
6765 -+ if (bfqq->bic != NULL)
6766 -+ bfqq->bic->raising_time_left = 0;
6767 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
6768 - list_add_tail(&rq->queuelist, &bfqq->fifo);
6769 -
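The in-context merge added to bfq_insert_request() above first moves the request's bookkeeping from the old queue to the shared one: one allocated slot in the request's direction and one queue reference change hands before the bic itself is redirected. Schematically, with toy types standing in for bfq_queue (the real updates are noted in the comments):

struct toy_bfqq {
	int refs;
	int allocated[2];       /* [0] = READ, [1] = WRITE */
};

/* Move one in-flight request of direction 'dir' from old_q to the shared
 * new_q, mirroring the allocated[]/reference updates in the hunk above. */
void toy_move_request(struct toy_bfqq *old_q, struct toy_bfqq *new_q, int dir)
{
	new_q->allocated[dir]++;    /* new_bfqq->allocated[rq_data_dir(rq)]++ */
	old_q->allocated[dir]--;    /* bfqq->allocated[rq_data_dir(rq)]--     */
	new_q->refs++;              /* atomic_inc(&new_bfqq->ref)             */
	old_q->refs--;              /* bfq_put_queue(bfqq)                    */
}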
6770 -@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
6771 - if (bfq_bfqq_budget_new(bfqq))
6772 - bfq_set_budget_timeout(bfqd);
6773 -
6774 -- /* Idling is disabled also for cooperation issues:
6775 -- * 1) there is a close cooperator for the queue, or
6776 -- * 2) the queue is shared and some cooperator is likely
6777 -- * to be idle (in this case, by not arming the idle timer,
6778 -- * we try to slow down the queue, to prevent the zones
6779 -- * of the disk accessed by the active cooperators from becoming
6780 -- * too distant from the zone that will be accessed by the
6781 -- * currently idle cooperators)
6782 -- */
6783 - if (bfq_bfqq_must_idle(bfqq, budg_timeout))
6784 - bfq_arm_slice_timer(bfqd);
6785 - else if (budg_timeout)
6786 -@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq)
6787 - }
6788 - }
6789 -
6790 --static struct bfq_queue *
6791 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
6792 -- struct bfq_queue *bfqq)
6793 --{
6794 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
6795 -- (long unsigned)bfqq->new_bfqq->pid);
6796 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
6797 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
6798 -- bfq_put_queue(bfqq);
6799 -- return bic_to_bfqq(bic, 1);
6800 --}
6801 --
6802 - /*
6803 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
6804 - * was the last process referring to said bfqq.
6805 -@@ -2469,6 +2702,9 @@ static struct bfq_queue *
6806 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
6807 - {
6808 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
6809 -+
6810 -+ put_io_context(bic->icq.ioc);
6811 -+
6812 - if (bfqq_process_refs(bfqq) == 1) {
6813 - bfqq->pid = current->pid;
6814 - bfq_clear_bfqq_some_coop_idle(bfqq);
6815 -@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
6816 - struct bfq_queue *bfqq;
6817 - struct bfq_group *bfqg;
6818 - unsigned long flags;
6819 -+ bool split = false;
6820 -
6821 - might_sleep_if(gfp_mask & __GFP_WAIT);
6822 -
6823 -@@ -2516,24 +2753,14 @@ new_queue:
6824 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
6825 - bic_set_bfqq(bic, bfqq, is_sync);
6826 - } else {
6827 -- /*
6828 -- * If the queue was seeky for too long, break it apart.
6829 -- */
6830 -+ /* If the queue was seeky for too long, break it apart. */
6831 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
6832 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
6833 - bfqq = bfq_split_bfqq(bic, bfqq);
6834 -+ split = true;
6835 - if (!bfqq)
6836 - goto new_queue;
6837 - }
6838 --
6839 -- /*
6840 -- * Check to see if this queue is scheduled to merge with
6841 -- * another closely cooperating queue. The merging of queues
6842 -- * happens here as it must be done in process context.
6843 -- * The reference on new_bfqq was taken in merge_bfqqs.
6844 -- */
6845 -- if (bfqq->new_bfqq != NULL)
6846 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
6847 - }
6848 -
6849 - bfqq->allocated[rw]++;
6850 -@@ -2544,6 +2771,26 @@ new_queue:
6851 - rq->elv.priv[0] = bic;
6852 - rq->elv.priv[1] = bfqq;
6853 -
6854 -+ /*
6855 -+ * If a bfq_queue has only one process reference, it is owned
6856 -+ * by only one bfq_io_cq: we can set the bic field of the
6857 -+ * bfq_queue to the address of that structure. Also, if the
6858 -+ * queue has just been split, mark a flag so that the
6859 -+ * information is available to the other scheduler hooks.
6860 -+ */
6861 -+ if (bfqq_process_refs(bfqq) == 1) {
6862 -+ bfqq->bic = bic;
6863 -+ if (split) {
6864 -+ bfq_mark_bfqq_just_split(bfqq);
6865 -+ /*
6866 -+ * If the queue has just been split from a shared queue,
6867 -+ * restore the idle window and the possible weight
6868 -+ * raising period.
6869 -+ */
6870 -+ bfq_bfqq_resume_state(bfqq, bic);
6871 -+ }
6872 -+ }
6873 -+
6874 - spin_unlock_irqrestore(q->queue_lock, flags);
6875 -
6876 - return 0;
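bfq_bfqq_resume_state(), called at the end of the hunk above, is not part of this excerpt; judging from the documentation of raising_time_left and saved_idle_window added to bfq.h below, it plausibly restores the idle window and restarts weight raising for the saved amount of time. A purely illustrative guess at that behaviour, with toy_* names:

struct toy_queue_state {
	int idle_window;
	unsigned int raising_coeff;
	unsigned int raising_cur_max_time;
};

struct toy_bic_state {
	unsigned int saved_idle_window;
	unsigned int raising_time_left;
};

/* Guessed counterpart of bfq_bfqq_resume_state(): hand back the idle window
 * and, if any raising time was saved, restart raising for exactly that long. */
void toy_resume_state(struct toy_queue_state *q, struct toy_bic_state *bic,
                      unsigned int raise_coeff)
{
	q->idle_window = bic->saved_idle_window;
	if (bic->raising_time_left) {
		q->raising_coeff = raise_coeff;
		q->raising_cur_max_time = bic->raising_time_left;
		bic->raising_time_left = 0;
	}
}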
6877 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
6878 -index 03f8061..a0edaa2 100644
6879 ---- a/block/bfq-sched.c
6880 -+++ b/block/bfq-sched.c
6881 -@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
6882 - return bfqq;
6883 - }
6884 -
6885 --/*
6886 -- * Forced extraction of the given queue.
6887 -- */
6888 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
6889 -- struct bfq_queue *bfqq)
6890 --{
6891 -- struct bfq_entity *entity;
6892 -- struct bfq_sched_data *sd;
6893 --
6894 -- BUG_ON(bfqd->active_queue != NULL);
6895 --
6896 -- entity = &bfqq->entity;
6897 -- /*
6898 -- * Bubble up extraction/update from the leaf to the root.
6899 -- */
6900 -- for_each_entity(entity) {
6901 -- sd = entity->sched_data;
6902 -- bfq_update_budget(entity);
6903 -- bfq_update_vtime(bfq_entity_service_tree(entity));
6904 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
6905 -- sd->active_entity = entity;
6906 -- sd->next_active = NULL;
6907 -- entity->service = 0;
6908 -- }
6909 --
6910 -- return;
6911 --}
6912 --
6913 - static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
6914 - {
6915 - if (bfqd->active_bic != NULL) {
6916 -diff --git a/block/bfq.h b/block/bfq.h
6917 -index 48ecde9..bb52975 100644
6918 ---- a/block/bfq.h
6919 -+++ b/block/bfq.h
6920 -@@ -188,6 +188,8 @@ struct bfq_group;
6921 - * @pid: pid of the process owning the queue, used for logging purposes.
6922 - * @last_rais_start_time: last (idle -> weight-raised) transition attempt
6923 - * @raising_cur_max_time: current max raising time for this queue
6924 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
6925 -+ * queue is shared
6926 - *
6927 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
6928 - * or more (if it is an async one). @cgroup holds a reference to the
6929 -@@ -231,6 +233,7 @@ struct bfq_queue {
6930 - sector_t last_request_pos;
6931 -
6932 - pid_t pid;
6933 -+ struct bfq_io_cq *bic;
6934 -
6935 - /* weight-raising fields */
6936 - unsigned int raising_cur_max_time;
6937 -@@ -257,12 +260,23 @@ struct bfq_ttime {
6938 - * @icq: associated io_cq structure
6939 - * @bfqq: array of two process queues, the sync and the async
6940 - * @ttime: associated @bfq_ttime struct
6941 -+ * @raising_time_left: snapshot of the time left before weight raising ends
6942 -+ * for the sync queue associated to this process; this
6943 -+ * snapshot is taken to remember this value while the weight
6944 -+ * raising is suspended because the queue is merged with a
6945 -+ * shared queue, and is used to set @raising_cur_max_time
6946 -+ * when the queue is split from the shared queue and its
6947 -+ * weight is raised again
6948 -+ * @saved_idle_window: same purpose as the previous field for the idle window
6949 - */
6950 - struct bfq_io_cq {
6951 - struct io_cq icq; /* must be the first member */
6952 - struct bfq_queue *bfqq[2];
6953 - struct bfq_ttime ttime;
6954 - int ioprio;
6955 -+
6956 -+ unsigned int raising_time_left;
6957 -+ unsigned int saved_idle_window;
6958 - };
6959 -
6960 - /**
6961 -@@ -403,6 +417,7 @@ enum bfqq_state_flags {
6962 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6963 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
6964 - BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
6965 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
6966 - };
6967 -
6968 - #define BFQ_BFQQ_FNS(name) \
6969 -@@ -430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new);
6970 - BFQ_BFQQ_FNS(coop);
6971 - BFQ_BFQQ_FNS(split_coop);
6972 - BFQ_BFQQ_FNS(some_coop_idle);
6973 -+BFQ_BFQQ_FNS(just_split);
6974 - #undef BFQ_BFQQ_FNS
6975 -
6976 - /* Logging facilities. */
6977 ---
6978 -1.8.1.4
6979 -