Gentoo Archives: gentoo-commits

From: "Tom Wijsman (tomwij)" <tomwij@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2423 - genpatches-2.6/trunk/3.10
Date: Mon, 01 Jul 2013 07:02:44
Message-Id: 20130701070235.9282C2171C@flycatcher.gentoo.org
1 Author: tomwij
2 Date: 2013-07-01 07:02:35 +0000 (Mon, 01 Jul 2013)
3 New Revision: 2423
4
5 Added:
6 genpatches-2.6/trunk/3.10/1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.9.patch
7 genpatches-2.6/trunk/3.10/1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.9.patch1
8 genpatches-2.6/trunk/3.10/1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.9.0.patch1
9 Modified:
10 genpatches-2.6/trunk/3.10/0000_README
11 Log:
12 Bring missing BFQ patches from 3.9 to the 3.10 branch.
13
14 Modified: genpatches-2.6/trunk/3.10/0000_README
15 ===================================================================
16 --- genpatches-2.6/trunk/3.10/0000_README 2013-07-01 01:14:09 UTC (rev 2422)
17 +++ genpatches-2.6/trunk/3.10/0000_README 2013-07-01 07:02:35 UTC (rev 2423)
18 @@ -51,6 +51,18 @@
19 From: https://bugs.gentoo.org/show_bug.cgi?id=462066
20 Desc: Revert memcg patches that prevent OOM with too many dirty pages.
21
22 +Patch: 1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.9.patch
23 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
24 +Desc: BFQ v6r2 patch 1 for 3.9: Build, cgroups and kconfig bits
25 +
26 +Patch: 1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.9.patch1
27 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
28 +Desc: BFQ v6r2 patch 2 for 3.9: BFQ Scheduler
29 +
30 +Patch: 1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.9.0.patch1
31 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
32 +Desc: BFQ v6r2 patch 3 for 3.9: Early Queue Merge (EQM)
33 +
34 Patch: 2400_kcopy-patch-for-infiniband-driver.patch
35 From: Alexey Shvetsov <alexxy@g.o>
36 Desc: Zero copy for infiniband psm userspace driver
37
38 Added: genpatches-2.6/trunk/3.10/1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.9.patch
39 ===================================================================
40 --- genpatches-2.6/trunk/3.10/1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.9.patch (rev 0)
41 +++ genpatches-2.6/trunk/3.10/1801_block-cgroups-kconfig-build-bits-for-BFQ-v6r2-3.9.patch 2013-07-01 07:02:35 UTC (rev 2423)
42 @@ -0,0 +1,97 @@
43 +From 6946e7e4e53df0836fe13a2a8a750c0d70f66f3d Mon Sep 17 00:00:00 2001
44 +From: Matteo Bernardini <matteo.bernardini@×××××.com>
45 +Date: Thu, 9 May 2013 18:58:50 +0200
46 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.9
47 +
48 +Update Kconfig.iosched and do the related Makefile changes to include
49 +kernel configuration options for BFQ. Also add the bfqio controller
50 +to the cgroups subsystem.
51 +
52 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
53 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
54 +Signed-off-by: Matteo Bernardini <matteo.bernardini@×××××.com>
55 +---
56 + block/Kconfig.iosched | 25 +++++++++++++++++++++++++
57 + block/Makefile | 1 +
58 + include/linux/cgroup_subsys.h | 6 ++++++
59 + 3 files changed, 32 insertions(+)
60 +
61 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
62 +index 421bef9..695e064 100644
63 +--- a/block/Kconfig.iosched
64 ++++ b/block/Kconfig.iosched
65 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
66 + ---help---
67 + Enable group IO scheduling in CFQ.
68 +
69 ++config IOSCHED_BFQ
70 ++ tristate "BFQ I/O scheduler"
71 ++ default n
72 ++ ---help---
73 ++ The BFQ I/O scheduler tries to distribute bandwidth among
74 ++ all processes according to their weights.
75 ++ It aims at distributing the bandwidth as desired, independently of
76 ++ the disk parameters and with any workload. It also tries to
77 ++ guarantee low latency to interactive and soft real-time
78 ++ applications. If compiled built-in (saying Y here), BFQ can
79 ++ be configured to support hierarchical scheduling.
80 ++
81 ++config CGROUP_BFQIO
82 ++ bool "BFQ hierarchical scheduling support"
83 ++ depends on CGROUPS && IOSCHED_BFQ=y
84 ++ default n
85 ++ ---help---
86 ++ Enable hierarchical scheduling in BFQ, using the cgroups
87 ++ filesystem interface. The name of the subsystem will be
88 ++ bfqio.
89 ++
90 + choice
91 + prompt "Default I/O scheduler"
92 + default DEFAULT_CFQ
93 +@@ -52,6 +73,9 @@ choice
94 + config DEFAULT_CFQ
95 + bool "CFQ" if IOSCHED_CFQ=y
96 +
97 ++ config DEFAULT_BFQ
98 ++ bool "BFQ" if IOSCHED_BFQ=y
99 ++
100 + config DEFAULT_NOOP
101 + bool "No-op"
102 +
103 +@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED
104 + string
105 + default "deadline" if DEFAULT_DEADLINE
106 + default "cfq" if DEFAULT_CFQ
107 ++ default "bfq" if DEFAULT_BFQ
108 + default "noop" if DEFAULT_NOOP
109 +
110 + endmenu
111 +diff --git a/block/Makefile b/block/Makefile
112 +index 39b76ba..c0d20fa 100644
113 +--- a/block/Makefile
114 ++++ b/block/Makefile
115 +@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
116 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
117 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
118 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
119 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
120 +
121 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
122 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
123 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
124 +index f204a7a..b999bfa 100644
125 +--- a/include/linux/cgroup_subsys.h
126 ++++ b/include/linux/cgroup_subsys.h
127 +@@ -78,3 +78,9 @@ SUBSYS(hugetlb)
128 + #endif
129 +
130 + /* */
131 ++
132 ++#ifdef CONFIG_CGROUP_BFQIO
133 ++SUBSYS(bfqio)
134 ++#endif
135 ++
136 ++/* */
137 +--
138 +1.8.1.4
139 +
140
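For context on the cgroup_subsys.h hunk above: the kernel builds its controller IDs with an x-macro, expanding that header several times with different SUBSYS() definitions, which is how bfqio_subsys_id (used throughout bfq-cgroup.c in the next patch) comes into existence once SUBSYS(bfqio) is appended. A minimal standalone sketch of the pattern follows; the list contents and names are illustrative only, not the kernel's actual definitions.

#include <stdio.h>

/* Hypothetical list standing in for include/linux/cgroup_subsys.h. */
#define FOR_EACH_SUBSYS(X) \
	X(cpuset)          \
	X(blkio)           \
	X(bfqio)           /* present only when CONFIG_CGROUP_BFQIO=y */

/* First expansion: generate the *_subsys_id enum constants. */
#define X(name) name##_subsys_id,
enum subsys_id { FOR_EACH_SUBSYS(X) SUBSYS_COUNT };
#undef X

/* Second expansion: generate a name table indexed by those IDs. */
#define X(name) #name,
static const char *subsys_name[] = { FOR_EACH_SUBSYS(X) };
#undef X

int main(void)
{
	printf("bfqio registered as subsystem %d of %d (%s)\n",
	       bfqio_subsys_id, SUBSYS_COUNT, subsys_name[bfqio_subsys_id]);
	return 0;
}

Because every expansion site walks the same list, guarding the single SUBSYS(bfqio) line with CONFIG_CGROUP_BFQIO is enough to register or hide the whole controller.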
141 Added: genpatches-2.6/trunk/3.10/1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.9.patch1
142 ===================================================================
143 --- genpatches-2.6/trunk/3.10/1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.9.patch1 (rev 0)
144 +++ genpatches-2.6/trunk/3.10/1802_block-introduce-the-BFQ-v6r2-I-O-sched-for-3.9.patch1 2013-07-01 07:02:35 UTC (rev 2423)
145 @@ -0,0 +1,5748 @@
146 +From c85fc6e997b49039c6580a7257b9773777656d8a Mon Sep 17 00:00:00 2001
147 +From: Arianna Avanzini <avanzini.arianna@×××××.com>
148 +Date: Thu, 9 May 2013 19:10:02 +0200
149 +Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.9
150 +
151 +Add the BFQ-v6r2 I/O scheduler to 3.9.
152 +The general structure is borrowed from CFQ, as is much of the code. A
153 +(bfq_)queue is associated with each task doing I/O on a device, and each
154 +time a scheduling decision has to be made a queue is selected and served
155 +until it expires.
156 +
157 + - Slices are given in the service domain: tasks are assigned
158 + budgets, measured in number of sectors. Once granted the disk, a
159 + task must consume its assigned budget within a configurable
160 + maximum time (by default, the maximum possible value of the
161 + budgets is automatically computed to comply with this timeout).
162 + This allows the desired latency vs "throughput boosting" tradeoff
163 + to be set.
164 +
165 + - Budgets are scheduled according to a variant of WF2Q+, implemented
166 + using an augmented rb-tree to take eligibility into account while
167 + preserving an O(log N) overall complexity.
168 +
169 + - A low-latency tunable is provided; if enabled, both interactive
170 + and soft real-time applications are guaranteed very low latency.
171 +
172 + - Latency guarantees are also preserved in the presence of NCQ.
173 +
174 + - High throughput is also achieved with flash-based devices, while
175 + still preserving latency guarantees.
176 +
177 + - Useful features borrowed from CFQ: cooperating-queues merging (with
178 + some additional optimizations with respect to the original CFQ version),
179 + static fallback queue for OOM.
180 +
181 + - BFQ supports full hierarchical scheduling, exporting a cgroups
182 + interface. Each node has a full scheduler, so each group can
183 + be assigned its own ioprio (mapped to a weight, see next point)
184 + and an ioprio_class.
185 +
186 + - If the cgroups interface is used, weights can be explicitly
187 + assigned; otherwise ioprio values are mapped to weights using the
188 + relation weight = IOPRIO_BE_NR - ioprio (see the sketch after the diffstat).
189 +
190 + - ioprio classes are served in strict priority order, i.e., lower
191 + priority queues are not served as long as there are higher
192 + priority queues. Among queues in the same class the bandwidth is
193 + distributed in proportion to the weight of each queue. A small
194 + amount of extra bandwidth is nevertheless guaranteed to the Idle
195 + class to prevent it from starving.
196 +
197 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
198 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
199 +---
200 + block/bfq-cgroup.c | 868 +++++++++++++++
201 + block/bfq-ioc.c | 36 +
202 + block/bfq-iosched.c | 3070 +++++++++++++++++++++++++++++++++++++++++++++++++++
203 + block/bfq-sched.c | 1072 ++++++++++++++++++
204 + block/bfq.h | 603 ++++++++++
205 + 5 files changed, 5649 insertions(+)
206 + create mode 100644 block/bfq-cgroup.c
207 + create mode 100644 block/bfq-ioc.c
208 + create mode 100644 block/bfq-iosched.c
209 + create mode 100644 block/bfq-sched.c
210 + create mode 100644 block/bfq.h
211 +
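Before the code itself, the small worked sketch of the ioprio-to-weight relation quoted in the description above (the "sketch after the diffstat"). It assumes IOPRIO_BE_NR is 8, as in the mainline ioprio.h, and the helper name is purely illustrative rather than BFQ's internal one.

#include <stdio.h>

/* Default mapping described above: weight = IOPRIO_BE_NR - ioprio,
 * so best-effort priorities 0..7 map to weights 8..1. */
#define IOPRIO_BE_NR 8

static int ioprio_to_weight_example(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

int main(void)
{
	int ioprio;

	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d\n",
		       ioprio, ioprio_to_weight_example(ioprio));
	/* Two queues in the same class with ioprio 4 (weight 4) and
	 * ioprio 6 (weight 2) would share bandwidth roughly 2:1. */
	return 0;
}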
212 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
213 +new file mode 100644
214 +index 0000000..2c569cc
215 +--- /dev/null
216 ++++ b/block/bfq-cgroup.c
217 +@@ -0,0 +1,868 @@
218 ++/*
219 ++ * BFQ: CGROUPS support.
220 ++ *
221 ++ * Based on ideas and code from CFQ:
222 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
223 ++ *
224 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
225 ++ * Paolo Valente <paolo.valente@×××××××.it>
226 ++ *
227 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
228 ++ *
229 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
230 ++ */
231 ++
232 ++#ifdef CONFIG_CGROUP_BFQIO
233 ++static struct bfqio_cgroup bfqio_root_cgroup = {
234 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
235 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
236 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
237 ++};
238 ++
239 ++static inline void bfq_init_entity(struct bfq_entity *entity,
240 ++ struct bfq_group *bfqg)
241 ++{
242 ++ entity->weight = entity->new_weight;
243 ++ entity->orig_weight = entity->new_weight;
244 ++ entity->ioprio = entity->new_ioprio;
245 ++ entity->ioprio_class = entity->new_ioprio_class;
246 ++ entity->parent = bfqg->my_entity;
247 ++ entity->sched_data = &bfqg->sched_data;
248 ++}
249 ++
250 ++static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
251 ++{
252 ++ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
253 ++ struct bfqio_cgroup, css);
254 ++}
255 ++
256 ++/*
257 ++ * Search for the bfq_group associated with bfqd in bgrp's hash table
258 ++ * (currently just a list). Must be called under rcu_read_lock().
259 ++ */
260 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
261 ++ struct bfq_data *bfqd)
262 ++{
263 ++ struct bfq_group *bfqg;
264 ++ void *key;
265 ++
266 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
267 ++ key = rcu_dereference(bfqg->bfqd);
268 ++ if (key == bfqd)
269 ++ return bfqg;
270 ++ }
271 ++
272 ++ return NULL;
273 ++}
274 ++
275 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
276 ++ struct bfq_group *bfqg)
277 ++{
278 ++ struct bfq_entity *entity = &bfqg->entity;
279 ++
280 ++ /*
281 ++ * If the weight of the entity has never been set via the sysfs
282 ++ * interface, then bgrp->weight == 0. In this case we initialize
283 ++ * the weight from the current ioprio value. Otherwise, the group
284 ++ * weight, if set, has priority over the ioprio value.
285 ++ */
286 ++ if (bgrp->weight == 0) {
287 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
288 ++ entity->new_ioprio = bgrp->ioprio;
289 ++ } else {
290 ++ entity->new_weight = bgrp->weight;
291 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
292 ++ }
293 ++ entity->orig_weight = entity->weight = entity->new_weight;
294 ++ entity->ioprio = entity->new_ioprio;
295 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
296 ++ entity->my_sched_data = &bfqg->sched_data;
297 ++}
298 ++
299 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
300 ++ struct bfq_group *parent)
301 ++{
302 ++ struct bfq_entity *entity;
303 ++
304 ++ BUG_ON(parent == NULL);
305 ++ BUG_ON(bfqg == NULL);
306 ++
307 ++ entity = &bfqg->entity;
308 ++ entity->parent = parent->my_entity;
309 ++ entity->sched_data = &parent->sched_data;
310 ++}
311 ++
312 ++/**
313 ++ * bfq_group_chain_alloc - allocate a chain of groups.
314 ++ * @bfqd: queue descriptor.
315 ++ * @cgroup: the leaf cgroup this chain starts from.
316 ++ *
317 ++ * Allocate a chain of groups starting from the one belonging to
318 ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
319 ++ * to the root has already an allocated group on @bfqd.
320 ++ */
321 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
322 ++ struct cgroup *cgroup)
323 ++{
324 ++ struct bfqio_cgroup *bgrp;
325 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
326 ++
327 ++ for (; cgroup != NULL; cgroup = cgroup->parent) {
328 ++ bgrp = cgroup_to_bfqio(cgroup);
329 ++
330 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
331 ++ if (bfqg != NULL) {
332 ++ /*
333 ++ * All the cgroups in the path from there to the
334 ++ * root must have a bfq_group for bfqd, so we don't
335 ++ * need any more allocations.
336 ++ */
337 ++ break;
338 ++ }
339 ++
340 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
341 ++ if (bfqg == NULL)
342 ++ goto cleanup;
343 ++
344 ++ bfq_group_init_entity(bgrp, bfqg);
345 ++ bfqg->my_entity = &bfqg->entity;
346 ++
347 ++ if (leaf == NULL) {
348 ++ leaf = bfqg;
349 ++ prev = leaf;
350 ++ } else {
351 ++ bfq_group_set_parent(prev, bfqg);
352 ++ /*
353 ++ * Build a list of allocated nodes using the bfqd
354 ++ * field, which is still unused and will be initialized
355 ++ * only after the node has been connected.
356 ++ */
357 ++ prev->bfqd = bfqg;
358 ++ prev = bfqg;
359 ++ }
360 ++ }
361 ++
362 ++ return leaf;
363 ++
364 ++cleanup:
365 ++ while (leaf != NULL) {
366 ++ prev = leaf;
367 ++ leaf = leaf->bfqd;
368 ++ kfree(prev);
369 ++ }
370 ++
371 ++ return NULL;
372 ++}
373 ++
374 ++/**
375 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
376 ++ * @bfqd: the queue descriptor.
377 ++ * @cgroup: the leaf cgroup to start from.
378 ++ * @leaf: the leaf group (to be associated to @cgroup).
379 ++ *
380 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
381 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
382 ++ * hierarchy that already has a group associated with @bfqd, all the nodes
383 ++ * in the path to the root cgroup have one too.
384 ++ *
385 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
386 ++ * per device) while the bfqio_cgroup lock protects the list of groups
387 ++ * belonging to the same cgroup.
388 ++ */
389 ++static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
390 ++ struct bfq_group *leaf)
391 ++{
392 ++ struct bfqio_cgroup *bgrp;
393 ++ struct bfq_group *bfqg, *next, *prev = NULL;
394 ++ unsigned long flags;
395 ++
396 ++ assert_spin_locked(bfqd->queue->queue_lock);
397 ++
398 ++ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
399 ++ bgrp = cgroup_to_bfqio(cgroup);
400 ++ next = leaf->bfqd;
401 ++
402 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
403 ++ BUG_ON(bfqg != NULL);
404 ++
405 ++ spin_lock_irqsave(&bgrp->lock, flags);
406 ++
407 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
408 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
409 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
410 ++
411 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
412 ++
413 ++ prev = leaf;
414 ++ leaf = next;
415 ++ }
416 ++
417 ++ BUG_ON(cgroup == NULL && leaf != NULL);
418 ++ if (cgroup != NULL && prev != NULL) {
419 ++ bgrp = cgroup_to_bfqio(cgroup);
420 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
421 ++ bfq_group_set_parent(prev, bfqg);
422 ++ }
423 ++}
424 ++
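The two helpers above split group creation into an allocation phase and a linking phase, so a failed kzalloc() never leaves a half-connected chain visible to lookups. A simplified, self-contained sketch of that allocate-everything-first idea; the structure and function names here are made up for illustration and do not match BFQ's.

#include <stdlib.h>

/* Walk up a parent chain, allocate all missing nodes, and only
 * connect them once every allocation has succeeded. */
struct node {
	struct node *parent;
	struct node *tmp_next;	/* scratch link while the chain is detached */
	int attached;
};

static struct node *chain_alloc(int depth)
{
	struct node *head = NULL, *n;

	while (depth-- > 0) {
		n = calloc(1, sizeof(*n));
		if (n == NULL)
			goto cleanup;	/* nothing published yet: safe to undo */
		n->tmp_next = head;
		head = n;
	}
	return head;

cleanup:
	while (head != NULL) {
		n = head->tmp_next;
		free(head);
		head = n;
	}
	return NULL;
}

static void chain_link(struct node *head, struct node *existing_parent)
{
	struct node *n, *next;

	for (n = head; n != NULL; n = next) {	/* phase 2: publish the chain */
		next = n->tmp_next;
		n->parent = next != NULL ? next : existing_parent;
		n->attached = 1;
	}
}

int main(void)
{
	struct node existing = { 0 };
	struct node *chain = chain_alloc(3);

	if (chain != NULL)
		chain_link(chain, &existing);
	return chain != NULL ? 0 : 1;
}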
425 ++/**
426 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
427 ++ * @bfqd: queue descriptor.
428 ++ * @cgroup: cgroup being searched for.
429 ++ *
430 ++ * Return a group associated to @bfqd in @cgroup, allocating one if
431 ++ * necessary. When a group is returned all the cgroups in the path
432 ++ * to the root have a group associated to @bfqd.
433 ++ *
434 ++ * If the allocation fails, return the root group: this breaks guarantees
435 ++ * but is a safe fallback. If this loss becomes a problem it can be
436 ++ * mitigated using the equivalent weight (given by the product of the
437 ++ * weights of the groups in the path from @group to the root) in the
438 ++ * root scheduler.
439 ++ *
440 ++ * We allocate all the missing nodes in the path from the leaf cgroup
441 ++ * to the root and we connect the nodes only after all the allocations
442 ++ * have been successful.
443 ++ */
444 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
445 ++ struct cgroup *cgroup)
446 ++{
447 ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
448 ++ struct bfq_group *bfqg;
449 ++
450 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
451 ++ if (bfqg != NULL)
452 ++ return bfqg;
453 ++
454 ++ bfqg = bfq_group_chain_alloc(bfqd, cgroup);
455 ++ if (bfqg != NULL)
456 ++ bfq_group_chain_link(bfqd, cgroup, bfqg);
457 ++ else
458 ++ bfqg = bfqd->root_group;
459 ++
460 ++ return bfqg;
461 ++}
462 ++
463 ++/**
464 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
465 ++ * @bfqd: queue descriptor.
466 ++ * @bfqq: the queue to move.
467 ++ * @entity: @bfqq's entity.
468 ++ * @bfqg: the group to move to.
469 ++ *
470 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
471 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
472 ++ *
473 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
474 ++ * not disappear (by now this just means that we are called under
475 ++ * rcu_read_lock()).
476 ++ */
477 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
478 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
479 ++{
480 ++ int busy, resume;
481 ++
482 ++ busy = bfq_bfqq_busy(bfqq);
483 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
484 ++
485 ++ BUG_ON(resume && !entity->on_st);
486 ++ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
487 ++
488 ++ if (busy) {
489 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
490 ++
491 ++ if (!resume)
492 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
493 ++ else
494 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
495 ++ } else if (entity->on_st)
496 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
497 ++
498 ++ /*
499 ++ * Here we use a reference to bfqg. We don't need a refcounter
500 ++ * as the cgroup reference will not be dropped, so that its
501 ++ * destroy() callback will not be invoked.
502 ++ */
503 ++ entity->parent = bfqg->my_entity;
504 ++ entity->sched_data = &bfqg->sched_data;
505 ++
506 ++ if (busy && resume)
507 ++ bfq_activate_bfqq(bfqd, bfqq);
508 ++
509 ++ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver)
510 ++ bfq_schedule_dispatch(bfqd);
511 ++}
512 ++
513 ++/**
514 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
515 ++ * @bfqd: the queue descriptor.
516 ++ * @bic: the bic to move.
517 ++ * @cgroup: the cgroup to move to.
518 ++ *
519 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
520 ++ * has to make sure that the reference to cgroup is valid across the call.
521 ++ *
522 ++ * NOTE: an alternative approach might have been to store the current
523 ++ * cgroup in bfqq and getting a reference to it, reducing the lookup
524 ++ * time here, at the price of slightly more complex code.
525 ++ */
526 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
527 ++ struct bfq_io_cq *bic,
528 ++ struct cgroup *cgroup)
529 ++{
530 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
531 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
532 ++ struct bfq_entity *entity;
533 ++ struct bfq_group *bfqg;
534 ++ struct bfqio_cgroup *bgrp;
535 ++
536 ++ bgrp = cgroup_to_bfqio(cgroup);
537 ++
538 ++ bfqg = bfq_find_alloc_group(bfqd, cgroup);
539 ++ if (async_bfqq != NULL) {
540 ++ entity = &async_bfqq->entity;
541 ++
542 ++ if (entity->sched_data != &bfqg->sched_data) {
543 ++ bic_set_bfqq(bic, NULL, 0);
544 ++ bfq_log_bfqq(bfqd, async_bfqq,
545 ++ "bic_change_group: %p %d",
546 ++ async_bfqq, atomic_read(&async_bfqq->ref));
547 ++ bfq_put_queue(async_bfqq);
548 ++ }
549 ++ }
550 ++
551 ++ if (sync_bfqq != NULL) {
552 ++ entity = &sync_bfqq->entity;
553 ++ if (entity->sched_data != &bfqg->sched_data)
554 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
555 ++ }
556 ++
557 ++ return bfqg;
558 ++}
559 ++
560 ++/**
561 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
562 ++ * @bic: the bic being migrated.
563 ++ * @cgroup: the destination cgroup.
564 ++ *
565 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
566 ++ * moved into its new parent group.
567 ++ */
568 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
569 ++ struct cgroup *cgroup)
570 ++{
571 ++ struct bfq_data *bfqd;
572 ++ unsigned long uninitialized_var(flags);
573 ++
574 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
575 ++ if (bfqd != NULL) {
576 ++ __bfq_bic_change_cgroup(bfqd, bic, cgroup);
577 ++ bfq_put_bfqd_unlock(bfqd, &flags);
578 ++ }
579 ++}
580 ++
581 ++/**
582 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
583 ++ * @bic: the @bic to update.
584 ++ *
585 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
586 ++ * We need this in addition to moving bics during the cgroup attach
587 ++ * phase because the task owning @bic could be at its first disk
588 ++ * access or we may end up in the root cgroup as the result of a
589 ++ * memory allocation failure and here we try to move to the right
590 ++ * group.
591 ++ *
592 ++ * Must be called under the queue lock. It is safe to use the returned
593 ++ * value even after the rcu_read_unlock() as the migration/destruction
594 ++ * paths act under the queue lock too. IOW it is impossible to race with
595 ++ * group migration/destruction and end up with an invalid group as:
596 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
597 ++ * has started execution, as current holds a reference to it,
598 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
599 ++ * migrated to a different cgroup] its attach() callback will have
600 ++ * taken care of removing all the references to the old cgroup data.
601 ++ */
602 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
603 ++{
604 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
605 ++ struct bfq_group *bfqg;
606 ++ struct cgroup *cgroup;
607 ++
608 ++ BUG_ON(bfqd == NULL);
609 ++
610 ++ rcu_read_lock();
611 ++ cgroup = task_cgroup(current, bfqio_subsys_id);
612 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
613 ++ rcu_read_unlock();
614 ++
615 ++ return bfqg;
616 ++}
617 ++
618 ++/**
619 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
620 ++ * @st: the service tree being flushed.
621 ++ */
622 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
623 ++{
624 ++ struct bfq_entity *entity = st->first_idle;
625 ++
626 ++ for (; entity != NULL; entity = st->first_idle)
627 ++ __bfq_deactivate_entity(entity, 0);
628 ++}
629 ++
630 ++/**
631 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
632 ++ * @bfqd: the device data structure with the root group.
633 ++ * @entity: the entity to move.
634 ++ */
635 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
636 ++ struct bfq_entity *entity)
637 ++{
638 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
639 ++
640 ++ BUG_ON(bfqq == NULL);
641 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
642 ++ return;
643 ++}
644 ++
645 ++/**
646 ++ * bfq_reparent_active_entities - move to the root group all active entities.
647 ++ * @bfqd: the device data structure with the root group.
648 ++ * @bfqg: the group to move from.
649 ++ * @st: the service tree with the entities.
650 ++ *
651 ++ * Needs queue_lock to be taken and reference to be valid over the call.
652 ++ */
653 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
654 ++ struct bfq_group *bfqg,
655 ++ struct bfq_service_tree *st)
656 ++{
657 ++ struct rb_root *active = &st->active;
658 ++ struct bfq_entity *entity = NULL;
659 ++
660 ++ if (!RB_EMPTY_ROOT(&st->active))
661 ++ entity = bfq_entity_of(rb_first(active));
662 ++
663 ++ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active)))
664 ++ bfq_reparent_leaf_entity(bfqd, entity);
665 ++
666 ++ if (bfqg->sched_data.active_entity != NULL)
667 ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
668 ++
669 ++ return;
670 ++}
671 ++
672 ++/**
673 ++ * bfq_destroy_group - destroy @bfqg.
674 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
675 ++ * @bfqg: the group being destroyed.
676 ++ *
677 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
678 ++ */
679 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
680 ++{
681 ++ struct bfq_data *bfqd;
682 ++ struct bfq_service_tree *st;
683 ++ struct bfq_entity *entity = bfqg->my_entity;
684 ++ unsigned long uninitialized_var(flags);
685 ++ int i;
686 ++
687 ++ hlist_del(&bfqg->group_node);
688 ++
689 ++ /*
690 ++ * Empty all service_trees belonging to this group before deactivating
691 ++ * the group itself.
692 ++ */
693 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
694 ++ st = bfqg->sched_data.service_tree + i;
695 ++
696 ++ /*
697 ++ * The idle tree may still contain bfq_queues belonging
698 ++ * to exited tasks because they never migrated to a different
699 ++ * cgroup from the one being destroyed now. No one else
700 ++ * can access them so it's safe to act without any lock.
701 ++ */
702 ++ bfq_flush_idle_tree(st);
703 ++
704 ++ /*
705 ++ * It may happen that some queues are still active
706 ++ * (busy) upon group destruction (if the corresponding
707 ++ * processes have been forced to terminate). We move
708 ++ * all the leaf entities corresponding to these queues
709 ++ * to the root_group.
710 ++ * Also, it may happen that the group has an entity
711 ++ * under service, which is disconnected from the active
712 ++ * tree: it must be moved, too.
713 ++ * There is no need to put the sync queues, as the
714 ++ * scheduler has taken no reference.
715 ++ */
716 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
717 ++ if (bfqd != NULL) {
718 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
719 ++ bfq_put_bfqd_unlock(bfqd, &flags);
720 ++ }
721 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
722 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
723 ++ }
724 ++ BUG_ON(bfqg->sched_data.next_active != NULL);
725 ++ BUG_ON(bfqg->sched_data.active_entity != NULL);
726 ++
727 ++ /*
728 ++ * We may race with device destruction, take extra care when
729 ++ * dereferencing bfqg->bfqd.
730 ++ */
731 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
732 ++ if (bfqd != NULL) {
733 ++ hlist_del(&bfqg->bfqd_node);
734 ++ __bfq_deactivate_entity(entity, 0);
735 ++ bfq_put_async_queues(bfqd, bfqg);
736 ++ bfq_put_bfqd_unlock(bfqd, &flags);
737 ++ }
738 ++ BUG_ON(entity->tree != NULL);
739 ++
740 ++ /*
741 ++ * No need to defer the kfree() to the end of the RCU grace
742 ++ * period: we are called from the destroy() callback of our
743 ++ * cgroup, so we can be sure that no one is a) still using
744 ++ * this cgroup or b) doing lookups in it.
745 ++ */
746 ++ kfree(bfqg);
747 ++}
748 ++
749 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
750 ++{
751 ++ struct hlist_node *tmp;
752 ++ struct bfq_group *bfqg;
753 ++
754 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
755 ++ bfq_end_raising_async_queues(bfqd, bfqg);
756 ++}
757 ++
758 ++/**
759 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
760 ++ * @bfqd: the device descriptor being exited.
761 ++ *
762 ++ * When the device exits we just make sure that no lookup can return
763 ++ * the now unused group structures. They will be deallocated on cgroup
764 ++ * destruction.
765 ++ */
766 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
767 ++{
768 ++ struct hlist_node *tmp;
769 ++ struct bfq_group *bfqg;
770 ++
771 ++ bfq_log(bfqd, "disconnect_groups beginning") ;
772 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
773 ++ hlist_del(&bfqg->bfqd_node);
774 ++
775 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
776 ++
777 ++ /*
778 ++ * Don't remove from the group hash, just set an
779 ++ * invalid key. No lookups can race with the
780 ++ * assignment as bfqd is being destroyed; this
781 ++ * implies also that new elements cannot be added
782 ++ * to the list.
783 ++ */
784 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
785 ++
786 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
787 ++ bfqg) ;
788 ++ bfq_put_async_queues(bfqd, bfqg);
789 ++ }
790 ++}
791 ++
792 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
793 ++{
794 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
795 ++ struct bfq_group *bfqg = bfqd->root_group;
796 ++
797 ++ bfq_put_async_queues(bfqd, bfqg);
798 ++
799 ++ spin_lock_irq(&bgrp->lock);
800 ++ hlist_del_rcu(&bfqg->group_node);
801 ++ spin_unlock_irq(&bgrp->lock);
802 ++
803 ++ /*
804 ++ * No need to synchronize_rcu() here: since the device is gone
805 ++ * there cannot be any read-side access to its root_group.
806 ++ */
807 ++ kfree(bfqg);
808 ++}
809 ++
810 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
811 ++{
812 ++ struct bfq_group *bfqg;
813 ++ struct bfqio_cgroup *bgrp;
814 ++ int i;
815 ++
816 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
817 ++ if (bfqg == NULL)
818 ++ return NULL;
819 ++
820 ++ bfqg->entity.parent = NULL;
821 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
822 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
823 ++
824 ++ bgrp = &bfqio_root_cgroup;
825 ++ spin_lock_irq(&bgrp->lock);
826 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
827 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
828 ++ spin_unlock_irq(&bgrp->lock);
829 ++
830 ++ return bfqg;
831 ++}
832 ++
833 ++#define SHOW_FUNCTION(__VAR) \
834 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
835 ++ struct cftype *cftype) \
836 ++{ \
837 ++ struct bfqio_cgroup *bgrp; \
838 ++ u64 ret; \
839 ++ \
840 ++ if (!cgroup_lock_live_group(cgroup)) \
841 ++ return -ENODEV; \
842 ++ \
843 ++ bgrp = cgroup_to_bfqio(cgroup); \
844 ++ spin_lock_irq(&bgrp->lock); \
845 ++ ret = bgrp->__VAR; \
846 ++ spin_unlock_irq(&bgrp->lock); \
847 ++ \
848 ++ cgroup_unlock(); \
849 ++ \
850 ++ return ret; \
851 ++}
852 ++
853 ++SHOW_FUNCTION(weight);
854 ++SHOW_FUNCTION(ioprio);
855 ++SHOW_FUNCTION(ioprio_class);
856 ++#undef SHOW_FUNCTION
857 ++
858 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
859 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
860 ++ struct cftype *cftype, \
861 ++ u64 val) \
862 ++{ \
863 ++ struct bfqio_cgroup *bgrp; \
864 ++ struct bfq_group *bfqg; \
865 ++ \
866 ++ if (val < (__MIN) || val > (__MAX)) \
867 ++ return -EINVAL; \
868 ++ \
869 ++ if (!cgroup_lock_live_group(cgroup)) \
870 ++ return -ENODEV; \
871 ++ \
872 ++ bgrp = cgroup_to_bfqio(cgroup); \
873 ++ \
874 ++ spin_lock_irq(&bgrp->lock); \
875 ++ bgrp->__VAR = (unsigned short)val; \
876 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
877 ++ /* \
878 ++ * Setting the ioprio_changed flag of the entity \
879 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
880 ++ * the value of the weight to its ioprio mapping. \
881 ++ * Set the flag only if necessary. \
882 ++ */ \
883 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
884 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
885 ++ smp_wmb(); \
886 ++ bfqg->entity.ioprio_changed = 1; \
887 ++ } \
888 ++ } \
889 ++ spin_unlock_irq(&bgrp->lock); \
890 ++ \
891 ++ cgroup_unlock(); \
892 ++ \
893 ++ return 0; \
894 ++}
895 ++
896 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
897 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
898 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
899 ++#undef STORE_FUNCTION
900 ++
901 ++static struct cftype bfqio_files[] = {
902 ++ {
903 ++ .name = "weight",
904 ++ .read_u64 = bfqio_cgroup_weight_read,
905 ++ .write_u64 = bfqio_cgroup_weight_write,
906 ++ },
907 ++ {
908 ++ .name = "ioprio",
909 ++ .read_u64 = bfqio_cgroup_ioprio_read,
910 ++ .write_u64 = bfqio_cgroup_ioprio_write,
911 ++ },
912 ++ {
913 ++ .name = "ioprio_class",
914 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
915 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
916 ++ },
917 ++ { }, /* terminate */
918 ++};
919 ++
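The cftype table above is what makes the per-group tunables appear as bfqio.weight, bfqio.ioprio and bfqio.ioprio_class files. A minimal userspace sketch of setting a group's weight through that interface; the mount point and group name are assumptions for illustration, and the value must fall inside the BFQ_MIN_WEIGHT..BFQ_MAX_WEIGHT range enforced by the STORE_FUNCTION() handler.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical mount point and group name, for illustration only. */
	const char *path = "/sys/fs/cgroup/bfqio/slow-io/bfqio.weight";
	const char *weight = "100\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, weight, strlen(weight)) < 0)
		perror("write");
	close(fd);
	return 0;
}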
920 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
921 ++{
922 ++ struct bfqio_cgroup *bgrp;
923 ++
924 ++ if (cgroup->parent != NULL) {
925 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
926 ++ if (bgrp == NULL)
927 ++ return ERR_PTR(-ENOMEM);
928 ++ } else
929 ++ bgrp = &bfqio_root_cgroup;
930 ++
931 ++ spin_lock_init(&bgrp->lock);
932 ++ INIT_HLIST_HEAD(&bgrp->group_data);
933 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
934 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
935 ++
936 ++ return &bgrp->css;
937 ++}
938 ++
939 ++/*
940 ++ * We cannot support shared io contexts, as we have no means to support
941 ++ * two tasks with the same ioc in two different groups without major rework
943 ++ * of the main bic/bfqq data structures. For now we allow a task to change
943 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
944 ++ * behavior is that a group containing a task that forked using CLONE_IO
945 ++ * will not be destroyed until the tasks sharing the ioc die.
946 ++ */
947 ++static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
948 ++{
949 ++ struct task_struct *task;
950 ++ struct io_context *ioc;
951 ++ int ret = 0;
952 ++
953 ++ cgroup_taskset_for_each(task, cgroup, tset) {
954 ++ /* task_lock() is needed to avoid races with exit_io_context() */
955 ++ task_lock(task);
956 ++ ioc = task->io_context;
957 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
958 ++ /*
959 ++ * ioc == NULL means that the task is either too young or
960 ++ * exiting: if it still has no ioc, the ioc can't be shared;
961 ++ * if the task is exiting the attach will fail anyway, no
962 ++ * matter what we return here.
963 ++ */
964 ++ ret = -EINVAL;
965 ++ task_unlock(task);
966 ++ if (ret)
967 ++ break;
968 ++ }
969 ++
970 ++ return ret;
971 ++}
972 ++
973 ++static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
974 ++{
975 ++ struct task_struct *task;
976 ++ struct io_context *ioc;
977 ++ struct io_cq *icq;
978 ++
979 ++ /*
980 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
981 ++ * new group has not yet been tested.
982 ++ */
983 ++ cgroup_taskset_for_each(task, cgroup, tset) {
984 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
985 ++ if (ioc) {
986 ++ /*
987 ++ * Handle cgroup change here.
988 ++ */
989 ++ rcu_read_lock();
990 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
991 ++ if (!strncmp(icq->q->elevator->type->elevator_name,
992 ++ "bfq", ELV_NAME_MAX))
993 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
994 ++ cgroup);
995 ++ rcu_read_unlock();
996 ++ put_io_context(ioc);
997 ++ }
998 ++ }
999 ++}
1000 ++
1001 ++static void bfqio_destroy(struct cgroup *cgroup)
1002 ++{
1003 ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
1004 ++ struct hlist_node *tmp;
1005 ++ struct bfq_group *bfqg;
1006 ++
1007 ++ /*
1008 ++ * Since we are destroying the cgroup, there are no more tasks
1009 ++ * referencing it, and all the RCU grace periods that may have
1010 ++ * referenced it are ended (as the destruction of the parent
1011 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1012 ++ * anything else and we don't need any synchronization.
1013 ++ */
1014 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1015 ++ bfq_destroy_group(bgrp, bfqg);
1016 ++
1017 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
1018 ++
1019 ++ kfree(bgrp);
1020 ++}
1021 ++
1022 ++struct cgroup_subsys bfqio_subsys = {
1023 ++ .name = "bfqio",
1024 ++ .css_alloc = bfqio_create,
1025 ++ .can_attach = bfqio_can_attach,
1026 ++ .attach = bfqio_attach,
1027 ++ .css_free = bfqio_destroy,
1028 ++ .subsys_id = bfqio_subsys_id,
1029 ++ .base_cftypes = bfqio_files,
1030 ++};
1031 ++#else
1032 ++static inline void bfq_init_entity(struct bfq_entity *entity,
1033 ++ struct bfq_group *bfqg)
1034 ++{
1035 ++ entity->weight = entity->new_weight;
1036 ++ entity->orig_weight = entity->new_weight;
1037 ++ entity->ioprio = entity->new_ioprio;
1038 ++ entity->ioprio_class = entity->new_ioprio_class;
1039 ++ entity->sched_data = &bfqg->sched_data;
1040 ++}
1041 ++
1042 ++static inline struct bfq_group *
1043 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1044 ++{
1045 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
1046 ++ return bfqd->root_group;
1047 ++}
1048 ++
1049 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1050 ++ struct bfq_queue *bfqq,
1051 ++ struct bfq_entity *entity,
1052 ++ struct bfq_group *bfqg)
1053 ++{
1054 ++}
1055 ++
1056 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
1057 ++{
1058 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1059 ++}
1060 ++
1061 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1062 ++{
1063 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
1064 ++}
1065 ++
1066 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
1067 ++{
1068 ++ kfree(bfqd->root_group);
1069 ++}
1070 ++
1071 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1072 ++{
1073 ++ struct bfq_group *bfqg;
1074 ++ int i;
1075 ++
1076 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1077 ++ if (bfqg == NULL)
1078 ++ return NULL;
1079 ++
1080 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1081 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1082 ++
1083 ++ return bfqg;
1084 ++}
1085 ++#endif
1086 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1087 +new file mode 100644
1088 +index 0000000..326e3ec
1089 +--- /dev/null
1090 ++++ b/block/bfq-ioc.c
1091 +@@ -0,0 +1,36 @@
1092 ++/*
1093 ++ * BFQ: I/O context handling.
1094 ++ *
1095 ++ * Based on ideas and code from CFQ:
1096 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1097 ++ *
1098 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1099 ++ * Paolo Valente <paolo.valente@×××××××.it>
1100 ++ *
1101 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1102 ++ */
1103 ++
1104 ++/**
1105 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1106 ++ * @icq: the iocontext queue.
1107 ++ */
1108 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1109 ++{
1110 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
1111 ++ return container_of(icq, struct bfq_io_cq, icq);
1112 ++}
1113 ++
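icq_to_bic() above works only because struct bfq_io_cq embeds its struct io_cq as the first member, making the container_of() conversion a zero-offset cast (which is also why a NULL icq maps back to a NULL bic). A small standalone sketch of that layout trick, using simplified stand-in structures rather than the real kernel ones.

#include <stddef.h>
#include <stdio.h>

/* Classic container_of(): subtract the member's offset from the
 * member pointer to recover the enclosing structure. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct io_cq { int dummy; };

struct bfq_io_cq {
	struct io_cq icq;	/* must stay the first member */
	int ttime_samples;
};

static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	return container_of(icq, struct bfq_io_cq, icq);
}

int main(void)
{
	struct bfq_io_cq bic = { .icq = { 0 }, .ttime_samples = 42 };

	printf("offset of icq: %zu\n", offsetof(struct bfq_io_cq, icq));
	printf("round trip ok: %d\n", icq_to_bic(&bic.icq) == &bic);
	return 0;
}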
1114 ++/**
1115 ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1116 ++ * @bfqd: the lookup key.
1117 ++ * @ioc: the io_context of the process doing I/O.
1118 ++ *
1119 ++ * Queue lock must be held.
1120 ++ */
1121 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1122 ++ struct io_context *ioc)
1123 ++{
1124 ++ if(ioc)
1125 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1126 ++ return NULL;
1127 ++}
1128 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1129 +new file mode 100644
1130 +index 0000000..b230927
1131 +--- /dev/null
1132 ++++ b/block/bfq-iosched.c
1133 +@@ -0,0 +1,3070 @@
1134 ++/*
1135 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
1136 ++ *
1137 ++ * Based on ideas and code from CFQ:
1138 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1139 ++ *
1140 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1141 ++ * Paolo Valente <paolo.valente@×××××××.it>
1142 ++ *
1143 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1144 ++ *
1145 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1146 ++ *
1147 ++ * BFQ is a proportional share disk scheduling algorithm based on the
1148 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
1149 ++ * measured in number of sectors, to tasks instead of time slices.
1150 ++ * The disk is not granted to the active task for a given time slice,
1151 ++ * but until it has exhausted its assigned budget. This change from
1152 ++ * the time to the service domain allows BFQ to distribute the disk
1153 ++ * bandwidth among tasks as desired, without any distortion due to
1154 ++ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
1155 ++ * internal scheduler, called B-WF2Q+, to schedule tasks according to
1156 ++ * their budgets. Thanks to this accurate scheduler, BFQ can afford
1157 ++ * to assign high budgets to disk-bound non-seeky tasks (to boost the
1158 ++ * throughput), and yet guarantee low latencies to interactive and
1159 ++ * soft real-time applications.
1160 ++ *
1161 ++ * BFQ has been introduced in [1], where the interested reader can
1162 ++ * find an accurate description of the algorithm, the bandwidth
1163 ++ * distribution and latency guarantees it provides, plus formal proofs
1164 ++ * of all the properties. With respect to the algorithm presented in
1165 ++ * the paper, this implementation adds several little heuristics, and
1166 ++ * a hierarchical extension, based on H-WF2Q+.
1167 ++ *
1168 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1169 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1170 ++ * complexity derives from the one introduced with EEVDF in [3].
1171 ++ *
1172 ++ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
1173 ++ * with Deterministic Guarantees on Bandwidth Distribution,'',
1174 ++ * IEEE Transactions on Computers, May 2010.
1175 ++ *
1176 ++ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
1177 ++ *
1178 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1179 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1180 ++ * Oct 1997.
1181 ++ *
1182 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1183 ++ *
1184 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1185 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
1186 ++ * Resource Allocation,'' technical report.
1187 ++ *
1188 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1189 ++ */
1190 ++#include <linux/module.h>
1191 ++#include <linux/slab.h>
1192 ++#include <linux/blkdev.h>
1193 ++#include <linux/cgroup.h>
1194 ++#include <linux/elevator.h>
1195 ++#include <linux/jiffies.h>
1196 ++#include <linux/rbtree.h>
1197 ++#include <linux/ioprio.h>
1198 ++#include "bfq.h"
1199 ++#include "blk.h"
1200 ++
1201 ++/* Max number of dispatches in one round of service. */
1202 ++static const int bfq_quantum = 4;
1203 ++
1204 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1205 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1206 ++
1207 ++/* Maximum backwards seek, in KiB. */
1208 ++static const int bfq_back_max = 16 * 1024;
1209 ++
1210 ++/* Penalty of a backwards seek, in number of sectors. */
1211 ++static const int bfq_back_penalty = 2;
1212 ++
1213 ++/* Idling period duration, in jiffies. */
1214 ++static int bfq_slice_idle = HZ / 125;
1215 ++
1216 ++/* Default maximum budget values, in sectors and number of requests. */
1217 ++static const int bfq_default_max_budget = 16 * 1024;
1218 ++static const int bfq_max_budget_async_rq = 4;
1219 ++
1220 ++/*
1221 ++ * Async to sync throughput distribution is controlled as follows:
1222 ++ * when an async request is served, the entity is charged the number
1223 ++ * of sectors of the request, multiplied by the factor below
1224 ++ */
1225 ++static const int bfq_async_charge_factor = 10;
1226 ++
1227 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
1228 ++static const int bfq_timeout_sync = HZ / 8;
1229 ++static int bfq_timeout_async = HZ / 25;
1230 ++
1231 ++struct kmem_cache *bfq_pool;
1232 ++
1233 ++/* Below this threshold (in ms), we consider thinktime immediate. */
1234 ++#define BFQ_MIN_TT 2
1235 ++
1236 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
1237 ++#define BFQ_HW_QUEUE_THRESHOLD 4
1238 ++#define BFQ_HW_QUEUE_SAMPLES 32
1239 ++
1240 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1241 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1242 ++
1243 ++/* Min samples used for peak rate estimation (for autotuning). */
1244 ++#define BFQ_PEAK_RATE_SAMPLES 32
1245 ++
1246 ++/* Shift used for peak rate fixed precision calculations. */
1247 ++#define BFQ_RATE_SHIFT 16
1248 ++
1249 ++/*
1250 ++ * The duration of the weight raising for interactive applications is
1251 ++ * computed automatically (as default behaviour), using the following
1252 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
1253 ++ * disk, and R and T are two reference parameters. In particular, R is
1254 ++ * the peak rate of a reference disk, and T is about the maximum time
1255 ++ * for starting popular large applications on that disk, under BFQ and
1256 ++ * while reading two files in parallel. Finally, BFQ uses two
1257 ++ * different pairs (R, T) depending on whether the disk is rotational
1258 ++ * or non-rotational.
1259 ++ */
1260 ++#define T_rot (msecs_to_jiffies(5500))
1261 ++#define T_nonrot (msecs_to_jiffies(2000))
1262 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1263 ++#define R_rot 17415
1264 ++#define R_nonrot 34791
1265 ++
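A quick worked example of the duration = (R / r) * T rule described above, kept in milliseconds to stay independent of HZ; the helper simply mirrors RT_prod / peak_rate, and the constants are the rotational reference values from this file.

#include <stdio.h>

/* R is in the same fixed-point unit as the estimated peak rate
 * (sectors/usec shifted by BFQ_RATE_SHIFT), so the ratio is unitless. */
#define T_ROT_MS 5500ULL
#define R_ROT    17415ULL

static unsigned long long raising_duration_ms(unsigned long long peak_rate)
{
	return (R_ROT * T_ROT_MS) / peak_rate;	/* RT_prod / r */
}

int main(void)
{
	/* A rotational disk exactly as fast as the reference: full 5500 ms. */
	printf("r == R_rot   -> %llu ms\n", raising_duration_ms(R_ROT));
	/* A disk twice as fast gets half the raising period: 2750 ms. */
	printf("r == 2*R_rot -> %llu ms\n", raising_duration_ms(2 * R_ROT));
	return 0;
}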
1266 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1267 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1268 ++
1269 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1270 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1271 ++
1272 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1273 ++
1274 ++#include "bfq-ioc.c"
1275 ++#include "bfq-sched.c"
1276 ++#include "bfq-cgroup.c"
1277 ++
1278 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1279 ++ IOPRIO_CLASS_IDLE)
1280 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1281 ++ IOPRIO_CLASS_RT)
1282 ++
1283 ++#define bfq_sample_valid(samples) ((samples) > 80)
1284 ++
1285 ++/*
1286 ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1287 ++ * set (in which case it could also be a direct WRITE).
1288 ++ */
1289 ++static inline int bfq_bio_sync(struct bio *bio)
1290 ++{
1291 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1292 ++ return 1;
1293 ++
1294 ++ return 0;
1295 ++}
1296 ++
1297 ++/*
1298 ++ * Scheduler run of queue, if there are requests pending and no one in the
1299 ++ * driver that will restart queueing.
1300 ++ */
1301 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1302 ++{
1303 ++ if (bfqd->queued != 0) {
1304 ++ bfq_log(bfqd, "schedule dispatch");
1305 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1306 ++ }
1307 ++}
1308 ++
1309 ++/*
1310 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1311 ++ * We choose the request that is closest to the head right now. Distance
1312 ++ * behind the head is penalized and only allowed to a certain extent.
1313 ++ */
1314 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
1315 ++ struct request *rq1,
1316 ++ struct request *rq2,
1317 ++ sector_t last)
1318 ++{
1319 ++ sector_t s1, s2, d1 = 0, d2 = 0;
1320 ++ unsigned long back_max;
1321 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1322 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1323 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1324 ++
1325 ++ if (rq1 == NULL || rq1 == rq2)
1326 ++ return rq2;
1327 ++ if (rq2 == NULL)
1328 ++ return rq1;
1329 ++
1330 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1331 ++ return rq1;
1332 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1333 ++ return rq2;
1334 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1335 ++ return rq1;
1336 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1337 ++ return rq2;
1338 ++
1339 ++ s1 = blk_rq_pos(rq1);
1340 ++ s2 = blk_rq_pos(rq2);
1341 ++
1342 ++ /*
1343 ++ * By definition, 1KiB is 2 sectors.
1344 ++ */
1345 ++ back_max = bfqd->bfq_back_max * 2;
1346 ++
1347 ++ /*
1348 ++ * Strict one way elevator _except_ in the case where we allow
1349 ++ * short backward seeks which are biased as twice the cost of a
1350 ++ * similar forward seek.
1351 ++ */
1352 ++ if (s1 >= last)
1353 ++ d1 = s1 - last;
1354 ++ else if (s1 + back_max >= last)
1355 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
1356 ++ else
1357 ++ wrap |= BFQ_RQ1_WRAP;
1358 ++
1359 ++ if (s2 >= last)
1360 ++ d2 = s2 - last;
1361 ++ else if (s2 + back_max >= last)
1362 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
1363 ++ else
1364 ++ wrap |= BFQ_RQ2_WRAP;
1365 ++
1366 ++ /* Found required data */
1367 ++
1368 ++ /*
1369 ++ * By doing switch() on the bit mask "wrap" we avoid having to
1370 ++ * check two variables for all permutations: --> faster!
1371 ++ */
1372 ++ switch (wrap) {
1373 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1374 ++ if (d1 < d2)
1375 ++ return rq1;
1376 ++ else if (d2 < d1)
1377 ++ return rq2;
1378 ++ else {
1379 ++ if (s1 >= s2)
1380 ++ return rq1;
1381 ++ else
1382 ++ return rq2;
1383 ++ }
1384 ++
1385 ++ case BFQ_RQ2_WRAP:
1386 ++ return rq1;
1387 ++ case BFQ_RQ1_WRAP:
1388 ++ return rq2;
1389 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1390 ++ default:
1391 ++ /*
1392 ++ * Since both rqs are wrapped,
1393 ++ * start with the one that's further behind head
1394 ++ * (--> only *one* back seek required),
1395 ++ * since back seek takes more time than forward.
1396 ++ */
1397 ++ if (s1 <= s2)
1398 ++ return rq1;
1399 ++ else
1400 ++ return rq2;
1401 ++ }
1402 ++}
1403 ++
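To make the distance computation in bfq_choose_req() above concrete, here is a small sketch using the default bfq_back_max and bfq_back_penalty values from this file; the helper is a stripped-down restatement for illustration, not the scheduler's code path.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Defaults: bfq_back_max = 16 * 1024 KiB (1 KiB = 2 sectors), penalty 2. */
static const sector_t back_max = 16 * 1024 * 2;
static const sector_t back_penalty = 2;

static sector_t effective_distance(sector_t s, sector_t last, int *wraps)
{
	*wraps = 0;
	if (s >= last)
		return s - last;		  /* forward seek: plain distance */
	if (s + back_max >= last)
		return (last - s) * back_penalty; /* short backward seek: doubled */
	*wraps = 1;				  /* too far behind: treated as a wrap */
	return 0;
}

int main(void)
{
	int w1, w2;
	sector_t last = 100000;
	/* rq1 is 1000 sectors ahead, rq2 is 400 sectors behind the head. */
	sector_t d1 = effective_distance(101000, last, &w1);
	sector_t d2 = effective_distance(99600, last, &w2);

	/* d1 = 1000, d2 = 800: the backward request still wins here because
	 * its doubled distance remains smaller than the forward one. */
	printf("d1=%llu (wrap %d), d2=%llu (wrap %d)\n", d1, w1, d2, w2);
	return 0;
}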
1404 ++static struct bfq_queue *
1405 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1406 ++ sector_t sector, struct rb_node **ret_parent,
1407 ++ struct rb_node ***rb_link)
1408 ++{
1409 ++ struct rb_node **p, *parent;
1410 ++ struct bfq_queue *bfqq = NULL;
1411 ++
1412 ++ parent = NULL;
1413 ++ p = &root->rb_node;
1414 ++ while (*p) {
1415 ++ struct rb_node **n;
1416 ++
1417 ++ parent = *p;
1418 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1419 ++
1420 ++ /*
1421 ++ * Sort strictly based on sector. Smallest to the left,
1422 ++ * largest to the right.
1423 ++ */
1424 ++ if (sector > blk_rq_pos(bfqq->next_rq))
1425 ++ n = &(*p)->rb_right;
1426 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
1427 ++ n = &(*p)->rb_left;
1428 ++ else
1429 ++ break;
1430 ++ p = n;
1431 ++ bfqq = NULL;
1432 ++ }
1433 ++
1434 ++ *ret_parent = parent;
1435 ++ if (rb_link)
1436 ++ *rb_link = p;
1437 ++
1438 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1439 ++ (long long unsigned)sector,
1440 ++ bfqq != NULL ? bfqq->pid : 0);
1441 ++
1442 ++ return bfqq;
1443 ++}
1444 ++
1445 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1446 ++{
1447 ++ struct rb_node **p, *parent;
1448 ++ struct bfq_queue *__bfqq;
1449 ++
1450 ++ if (bfqq->pos_root != NULL) {
1451 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1452 ++ bfqq->pos_root = NULL;
1453 ++ }
1454 ++
1455 ++ if (bfq_class_idle(bfqq))
1456 ++ return;
1457 ++ if (!bfqq->next_rq)
1458 ++ return;
1459 ++
1460 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
1461 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1462 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
1463 ++ if (__bfqq == NULL) {
1464 ++ rb_link_node(&bfqq->pos_node, parent, p);
1465 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1466 ++ } else
1467 ++ bfqq->pos_root = NULL;
1468 ++}
1469 ++
1470 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1471 ++ struct bfq_queue *bfqq,
1472 ++ struct request *last)
1473 ++{
1474 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
1475 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
1476 ++ struct request *next = NULL, *prev = NULL;
1477 ++
1478 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1479 ++
1480 ++ if (rbprev != NULL)
1481 ++ prev = rb_entry_rq(rbprev);
1482 ++
1483 ++ if (rbnext != NULL)
1484 ++ next = rb_entry_rq(rbnext);
1485 ++ else {
1486 ++ rbnext = rb_first(&bfqq->sort_list);
1487 ++ if (rbnext && rbnext != &last->rb_node)
1488 ++ next = rb_entry_rq(rbnext);
1489 ++ }
1490 ++
1491 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1492 ++}
1493 ++
1494 ++static void bfq_del_rq_rb(struct request *rq)
1495 ++{
1496 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1497 ++ struct bfq_data *bfqd = bfqq->bfqd;
1498 ++ const int sync = rq_is_sync(rq);
1499 ++
1500 ++ BUG_ON(bfqq->queued[sync] == 0);
1501 ++ bfqq->queued[sync]--;
1502 ++ bfqd->queued--;
1503 ++
1504 ++ elv_rb_del(&bfqq->sort_list, rq);
1505 ++
1506 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1507 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
1508 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1509 ++ /*
1510 ++ * Remove queue from request-position tree as it is empty.
1511 ++ */
1512 ++ if (bfqq->pos_root != NULL) {
1513 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1514 ++ bfqq->pos_root = NULL;
1515 ++ }
1516 ++ }
1517 ++}
1518 ++
1519 ++/* see the definition of bfq_async_charge_factor for details */
1520 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
1521 ++ struct bfq_queue *bfqq)
1522 ++{
1523 ++ return blk_rq_sectors(rq) *
1524 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1525 ++ bfq_async_charge_factor));
1526 ++}
1527 ++
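A worked example of the charging rule implemented by bfq_serv_to_charge() above, using the default bfq_async_charge_factor of 10; the standalone helper restates the same expression outside the kernel types for illustration.

#include <stdio.h>

/* Synchronous requests are charged their real size; asynchronous requests
 * from a queue that is not being weight-raised are charged 11x their size. */
static const int async_charge_factor = 10;

static unsigned long serv_to_charge(unsigned long sectors, int sync,
				    int raising_coeff)
{
	return sectors * (1 + (!sync) * (raising_coeff == 1) *
			  async_charge_factor);
}

int main(void)
{
	/* An 8-sector (4 KiB) request: */
	printf("sync:                %lu sectors charged\n",
	       serv_to_charge(8, 1, 1));	/* 8 */
	printf("async, not raised:   %lu sectors charged\n",
	       serv_to_charge(8, 0, 1));	/* 88 */
	printf("async, while raised: %lu sectors charged\n",
	       serv_to_charge(8, 0, 3));	/* 8: raising_coeff != 1 */
	return 0;
}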
1528 ++/**
1529 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
1530 ++ * @bfqd: the device data the queue belongs to.
1531 ++ * @bfqq: the queue to update.
1532 ++ *
1533 ++ * If the first request of a queue changes we make sure that the queue
1534 ++ * has enough budget to serve at least its first request (if the
1535 ++ * request has grown). We do this because if the queue has not enough
1536 ++ * budget for its first request, it has to go through two dispatch
1537 ++ * rounds to actually get it dispatched.
1538 ++ */
1539 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
1540 ++ struct bfq_queue *bfqq)
1541 ++{
1542 ++ struct bfq_entity *entity = &bfqq->entity;
1543 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1544 ++ struct request *next_rq = bfqq->next_rq;
1545 ++ unsigned long new_budget;
1546 ++
1547 ++ if (next_rq == NULL)
1548 ++ return;
1549 ++
1550 ++ if (bfqq == bfqd->active_queue)
1551 ++ /*
1552 ++ * In order not to break guarantees, budgets cannot be
1553 ++ * changed after an entity has been selected.
1554 ++ */
1555 ++ return;
1556 ++
1557 ++ BUG_ON(entity->tree != &st->active);
1558 ++ BUG_ON(entity == entity->sched_data->active_entity);
1559 ++
1560 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
1561 ++ bfq_serv_to_charge(next_rq, bfqq));
1562 ++ entity->budget = new_budget;
1563 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1564 ++ bfq_activate_bfqq(bfqd, bfqq);
1565 ++}
1566 ++
1567 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1568 ++{
1569 ++ u64 dur;
1570 ++
1571 ++ if (bfqd->bfq_raising_max_time > 0)
1572 ++ return bfqd->bfq_raising_max_time;
1573 ++
1574 ++ dur = bfqd->RT_prod;
1575 ++ do_div(dur, bfqd->peak_rate);
1576 ++
1577 ++ return dur;
1578 ++}
1579 ++
1580 ++static void bfq_add_rq_rb(struct request *rq)
1581 ++{
1582 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1583 ++ struct bfq_entity *entity = &bfqq->entity;
1584 ++ struct bfq_data *bfqd = bfqq->bfqd;
1585 ++ struct request *next_rq, *prev;
1586 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
1587 ++ int idle_for_long_time = bfqq->budget_timeout +
1588 ++ bfqd->bfq_raising_min_idle_time < jiffies;
1589 ++
1590 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1591 ++ bfqq->queued[rq_is_sync(rq)]++;
1592 ++ bfqd->queued++;
1593 ++
1594 ++ elv_rb_add(&bfqq->sort_list, rq);
1595 ++
1596 ++ /*
1597 ++ * Check if this request is a better next-serve candidate.
1598 ++ */
1599 ++ prev = bfqq->next_rq;
1600 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1601 ++ BUG_ON(next_rq == NULL);
1602 ++ bfqq->next_rq = next_rq;
1603 ++
1604 ++ /*
1605 ++ * Adjust priority tree position, if next_rq changes.
1606 ++ */
1607 ++ if (prev != bfqq->next_rq)
1608 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
1609 ++
1610 ++ if (!bfq_bfqq_busy(bfqq)) {
1611 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1612 ++ bfqq->soft_rt_next_start < jiffies;
1613 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
1614 ++ bfq_serv_to_charge(next_rq, bfqq));
1615 ++
1616 ++ if (!bfqd->low_latency)
1617 ++ goto add_bfqq_busy;
1618 ++
1619 ++ /*
1620 ++ * If the queue is not being boosted and has been idle
1621 ++ * for enough time, start a weight-raising period
1622 ++ */
1623 ++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
1624 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1625 ++ if (idle_for_long_time)
1626 ++ bfqq->raising_cur_max_time =
1627 ++ bfq_wrais_duration(bfqd);
1628 ++ else
1629 ++ bfqq->raising_cur_max_time =
1630 ++ bfqd->bfq_raising_rt_max_time;
1631 ++ bfq_log_bfqq(bfqd, bfqq,
1632 ++ "wrais starting at %llu msec, "
1633 ++ "rais_max_time %u",
1634 ++ bfqq->last_rais_start_finish,
1635 ++ jiffies_to_msecs(bfqq->
1636 ++ raising_cur_max_time));
1637 ++ } else if (old_raising_coeff > 1) {
1638 ++ if (idle_for_long_time)
1639 ++ bfqq->raising_cur_max_time =
1640 ++ bfq_wrais_duration(bfqd);
1641 ++ else if (bfqq->raising_cur_max_time ==
1642 ++ bfqd->bfq_raising_rt_max_time &&
1643 ++ !soft_rt) {
1644 ++ bfqq->raising_coeff = 1;
1645 ++ bfq_log_bfqq(bfqd, bfqq,
1646 ++ "wrais ending at %llu msec, "
1647 ++ "rais_max_time %u",
1648 ++ bfqq->last_rais_start_finish,
1649 ++ jiffies_to_msecs(bfqq->
1650 ++ raising_cur_max_time));
1651 ++ }
1652 ++ }
1653 ++ if (old_raising_coeff != bfqq->raising_coeff)
1654 ++ entity->ioprio_changed = 1;
1655 ++add_bfqq_busy:
1656 ++ bfq_add_bfqq_busy(bfqd, bfqq);
1657 ++ } else {
1658 ++ if (bfqd->low_latency && old_raising_coeff == 1 &&
1659 ++ !rq_is_sync(rq) &&
1660 ++ bfqq->last_rais_start_finish +
1661 ++ bfqd->bfq_raising_min_inter_arr_async < jiffies) {
1662 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1663 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1664 ++
1665 ++ entity->ioprio_changed = 1;
1666 ++ bfq_log_bfqq(bfqd, bfqq,
1667 ++ "non-idle wrais starting at %llu msec, "
1668 ++ "rais_max_time %u",
1669 ++ bfqq->last_rais_start_finish,
1670 ++ jiffies_to_msecs(bfqq->
1671 ++ raising_cur_max_time));
1672 ++ }
1673 ++ bfq_updated_next_req(bfqd, bfqq);
1674 ++ }
1675 ++
1676 ++ if (bfqd->low_latency &&
1677 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1678 ++ idle_for_long_time))
1679 ++ bfqq->last_rais_start_finish = jiffies;
1680 ++}
1681 ++
1682 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1683 ++{
1684 ++ elv_rb_del(&bfqq->sort_list, rq);
1685 ++ bfqq->queued[rq_is_sync(rq)]--;
1686 ++ bfqq->bfqd->queued--;
1687 ++ bfq_add_rq_rb(rq);
1688 ++}
1689 ++
1690 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1691 ++ struct bio *bio)
1692 ++{
1693 ++ struct task_struct *tsk = current;
1694 ++ struct bfq_io_cq *bic;
1695 ++ struct bfq_queue *bfqq;
1696 ++
1697 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1698 ++ if (bic == NULL)
1699 ++ return NULL;
1700 ++
1701 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1702 ++ if (bfqq != NULL) {
1703 ++ sector_t sector = bio->bi_sector + bio_sectors(bio);
1704 ++
1705 ++ return elv_rb_find(&bfqq->sort_list, sector);
1706 ++ }
1707 ++
1708 ++ return NULL;
1709 ++}
1710 ++
1711 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
1712 ++{
1713 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1714 ++
1715 ++ bfqd->rq_in_driver++;
1716 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1717 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1718 ++ (long long unsigned)bfqd->last_position);
1719 ++}
1720 ++
1721 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1722 ++{
1723 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1724 ++
1725 ++ WARN_ON(bfqd->rq_in_driver == 0);
1726 ++ bfqd->rq_in_driver--;
1727 ++}
1728 ++
1729 ++static void bfq_remove_request(struct request *rq)
1730 ++{
1731 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1732 ++ struct bfq_data *bfqd = bfqq->bfqd;
1733 ++
1734 ++ if (bfqq->next_rq == rq) {
1735 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1736 ++ bfq_updated_next_req(bfqd, bfqq);
1737 ++ }
1738 ++
1739 ++ list_del_init(&rq->queuelist);
1740 ++ bfq_del_rq_rb(rq);
1741 ++
1742 ++ if (rq->cmd_flags & REQ_META) {
1743 ++ WARN_ON(bfqq->meta_pending == 0);
1744 ++ bfqq->meta_pending--;
1745 ++ }
1746 ++}
1747 ++
1748 ++static int bfq_merge(struct request_queue *q, struct request **req,
1749 ++ struct bio *bio)
1750 ++{
1751 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1752 ++ struct request *__rq;
1753 ++
1754 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
1755 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1756 ++ *req = __rq;
1757 ++ return ELEVATOR_FRONT_MERGE;
1758 ++ }
1759 ++
1760 ++ return ELEVATOR_NO_MERGE;
1761 ++}
1762 ++
1763 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
1764 ++ int type)
1765 ++{
1766 ++ if (type == ELEVATOR_FRONT_MERGE) {
1767 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
1768 ++
1769 ++ bfq_reposition_rq_rb(bfqq, req);
1770 ++ }
1771 ++}
1772 ++
1773 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1774 ++ struct request *next)
1775 ++{
1776 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1777 ++
1778 ++ /*
1779 ++ * Reposition in fifo if next is older than rq.
1780 ++ */
1781 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1782 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1783 ++ list_move(&rq->queuelist, &next->queuelist);
1784 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
1785 ++ }
1786 ++
1787 ++ if (bfqq->next_rq == next)
1788 ++ bfqq->next_rq = rq;
1789 ++
1790 ++ bfq_remove_request(next);
1791 ++}
1792 ++
1793 ++/* Must be called with bfqq != NULL */
1794 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
1795 ++{
1796 ++ BUG_ON(bfqq == NULL);
1797 ++ bfqq->raising_coeff = 1;
1798 ++ bfqq->raising_cur_max_time = 0;
1799 ++ /* Trigger a weight change on the next activation of the queue */
1800 ++ bfqq->entity.ioprio_changed = 1;
1801 ++}
1802 ++
1803 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
1804 ++ struct bfq_group *bfqg)
1805 ++{
1806 ++ int i, j;
1807 ++
1808 ++ for (i = 0; i < 2; i++)
1809 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
1810 ++ if (bfqg->async_bfqq[i][j] != NULL)
1811 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
1812 ++ if (bfqg->async_idle_bfqq != NULL)
1813 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
1814 ++}
1815 ++
1816 ++static void bfq_end_raising(struct bfq_data *bfqd)
1817 ++{
1818 ++ struct bfq_queue *bfqq;
1819 ++
1820 ++ spin_lock_irq(bfqd->queue->queue_lock);
1821 ++
1822 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
1823 ++ bfq_bfqq_end_raising(bfqq);
1824 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
1825 ++ bfq_bfqq_end_raising(bfqq);
1826 ++ bfq_end_raising_async(bfqd);
1827 ++
1828 ++ spin_unlock_irq(bfqd->queue->queue_lock);
1829 ++}
1830 ++
1831 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1832 ++ struct bio *bio)
1833 ++{
1834 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1835 ++ struct bfq_io_cq *bic;
1836 ++ struct bfq_queue *bfqq;
1837 ++
1838 ++ /*
1839 ++ * Disallow merge of a sync bio into an async request.
1840 ++ */
1841 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1842 ++ return 0;
1843 ++
1844 ++ /*
1845 ++ * Lookup the bfqq that this bio will be queued with. Allow
1846 ++ * merge only if rq is queued there.
1847 ++ * Queue lock is held here.
1848 ++ */
1849 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
1850 ++ if (bic == NULL)
1851 ++ return 0;
1852 ++
1853 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1854 ++ return bfqq == RQ_BFQQ(rq);
1855 ++}
1856 ++
1857 ++static void __bfq_set_active_queue(struct bfq_data *bfqd,
1858 ++ struct bfq_queue *bfqq)
1859 ++{
1860 ++ if (bfqq != NULL) {
1861 ++ bfq_mark_bfqq_must_alloc(bfqq);
1862 ++ bfq_mark_bfqq_budget_new(bfqq);
1863 ++ bfq_clear_bfqq_fifo_expire(bfqq);
1864 ++
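++ /*
++ * budgets_assigned is a fixed-point counter that converges to its
++ * saturation value 256 with weight 7/8 for the old value; the 194
++ * threshold checked in bfq_max_budget() is therefore crossed after
++ * roughly a dozen budget assignments.
++ */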
1865 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
1866 ++
1867 ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
1868 ++ bfqq->entity.budget);
1869 ++ }
1870 ++
1871 ++ bfqd->active_queue = bfqq;
1872 ++}
1873 ++
1874 ++/*
1875 ++ * Get and set a new active queue for service.
1876 ++ */
1877 ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
1878 ++ struct bfq_queue *bfqq)
1879 ++{
1880 ++ if (!bfqq)
1881 ++ bfqq = bfq_get_next_queue(bfqd);
1882 ++ else
1883 ++ bfq_get_next_queue_forced(bfqd, bfqq);
1884 ++
1885 ++ __bfq_set_active_queue(bfqd, bfqq);
1886 ++ return bfqq;
1887 ++}
1888 ++
1889 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
1890 ++ struct request *rq)
1891 ++{
1892 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
1893 ++ return blk_rq_pos(rq) - bfqd->last_position;
1894 ++ else
1895 ++ return bfqd->last_position - blk_rq_pos(rq);
1896 ++}
1897 ++
1898 ++/*
1899 ++ * Return true if bfqq has no request pending and rq is close enough to
1900 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
1901 ++ * bfqq->next_rq
1902 ++ */
1903 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
1904 ++{
1905 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
1906 ++}
1907 ++
1908 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
1909 ++{
1910 ++ struct rb_root *root = &bfqd->rq_pos_tree;
1911 ++ struct rb_node *parent, *node;
1912 ++ struct bfq_queue *__bfqq;
1913 ++ sector_t sector = bfqd->last_position;
1914 ++
1915 ++ if (RB_EMPTY_ROOT(root))
1916 ++ return NULL;
1917 ++
1918 ++ /*
1919 ++ * First, if we find a request starting at the end of the last
1920 ++ * request, choose it.
1921 ++ */
1922 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
1923 ++ if (__bfqq != NULL)
1924 ++ return __bfqq;
1925 ++
1926 ++ /*
1927 ++ * If the exact sector wasn't found, the parent of the NULL leaf
1928 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
1929 ++ * position).
1930 ++ */
1931 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1932 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
1933 ++ return __bfqq;
1934 ++
1935 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
1936 ++ node = rb_next(&__bfqq->pos_node);
1937 ++ else
1938 ++ node = rb_prev(&__bfqq->pos_node);
1939 ++ if (node == NULL)
1940 ++ return NULL;
1941 ++
1942 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
1943 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
1944 ++ return __bfqq;
1945 ++
1946 ++ return NULL;
1947 ++}
1948 ++
1949 ++/*
1950 ++ * bfqd - obvious
1951 ++ * cur_bfqq - passed in so that we don't decide that the current queue
1952 ++ * is closely cooperating with itself.
1953 ++ *
1954 ++ * We are assuming that cur_bfqq has dispatched at least one request,
1955 ++ * and that bfqd->last_position reflects a position on the disk associated
1956 ++ * with the I/O issued by cur_bfqq.
1957 ++ */
1958 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
1959 ++ struct bfq_queue *cur_bfqq)
1960 ++{
1961 ++ struct bfq_queue *bfqq;
1962 ++
1963 ++ if (bfq_class_idle(cur_bfqq))
1964 ++ return NULL;
1965 ++ if (!bfq_bfqq_sync(cur_bfqq))
1966 ++ return NULL;
1967 ++ if (BFQQ_SEEKY(cur_bfqq))
1968 ++ return NULL;
1969 ++
1970 ++ /* If device has only one backlogged bfq_queue, don't search. */
1971 ++ if (bfqd->busy_queues == 1)
1972 ++ return NULL;
1973 ++
1974 ++ /*
1975 ++ * We should notice if some of the queues are cooperating, e.g.
1976 ++ * working closely on the same area of the disk. In that case,
1977 ++ * we can group them together and don't waste time idling.
1978 ++ */
1979 ++ bfqq = bfqq_close(bfqd);
1980 ++ if (bfqq == NULL || bfqq == cur_bfqq)
1981 ++ return NULL;
1982 ++
1983 ++ /*
1984 ++ * Do not merge queues from different bfq_groups.
1985 ++ */
1986 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
1987 ++ return NULL;
1988 ++
1989 ++ /*
1990 ++ * It only makes sense to merge sync queues.
1991 ++ */
1992 ++ if (!bfq_bfqq_sync(bfqq))
1993 ++ return NULL;
1994 ++ if (BFQQ_SEEKY(bfqq))
1995 ++ return NULL;
1996 ++
1997 ++ /*
1998 ++ * Do not merge queues of different priority classes.
1999 ++ */
2000 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2001 ++ return NULL;
2002 ++
2003 ++ return bfqq;
2004 ++}
2005 ++
2006 ++/*
2007 ++ * If enough samples have been computed, return the current max budget
2008 ++ * stored in bfqd, which is dynamically updated according to the
2009 ++ * estimated disk peak rate; otherwise return the default max budget
2010 ++ */
2011 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2012 ++{
2013 ++ if (bfqd->budgets_assigned < 194)
2014 ++ return bfq_default_max_budget;
2015 ++ else
2016 ++ return bfqd->bfq_max_budget;
2017 ++}
2018 ++
2019 ++/*
2020 ++ * Return min budget, which is a fraction of the current or default
2021 ++ * max budget (trying with 1/32)
2022 ++ */
2023 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2024 ++{
2025 ++ if (bfqd->budgets_assigned < 194)
2026 ++ return bfq_default_max_budget / 32;
2027 ++ else
2028 ++ return bfqd->bfq_max_budget / 32;
2029 ++}
2030 ++
2031 ++/*
2032 ++ * Decides whether idling should be done for given device and
2033 ++ * given active queue.
2034 ++ */
2035 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2036 ++ struct bfq_queue *active_bfqq)
2037 ++{
2038 ++ if (active_bfqq == NULL)
2039 ++ return false;
2040 ++ /*
2041 ++ * If device is SSD it has no seek penalty, disable idling; but
2042 ++ * do so only if:
2043 ++ * - device does not support queuing, otherwise we still have
2044 ++ * a problem with sync vs async workloads;
2045 ++ * - the queue is not weight-raised, to preserve guarantees.
2046 ++ */
2047 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2048 ++ active_bfqq->raising_coeff == 1);
2049 ++}
2050 ++
2051 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2052 ++{
2053 ++ struct bfq_queue *bfqq = bfqd->active_queue;
2054 ++ struct bfq_io_cq *bic;
2055 ++ unsigned long sl;
2056 ++
2057 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2058 ++
2059 ++ /* Tasks have exited, don't wait. */
2060 ++ bic = bfqd->active_bic;
2061 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2062 ++ return;
2063 ++
2064 ++ bfq_mark_bfqq_wait_request(bfqq);
2065 ++
2066 ++ /*
2067 ++ * We don't want to idle for seeks, but we do want to allow
2068 ++ * fair distribution of slice time for a process doing back-to-back
2069 ++ * seeks. So allow a little bit of time for it to submit a new rq.
2070 ++ *
2071 ++ * To prevent processes with (partly) seeky workloads from
2072 ++ * being too ill-treated, grant them a small fraction of the
2073 ++ * assigned budget before reducing the waiting time to
2074 ++ * BFQ_MIN_TT. In practice this helps reduce latency.
2075 ++ */
2076 ++ sl = bfqd->bfq_slice_idle;
2077 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2078 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2079 ++ bfqq->raising_coeff == 1)
2080 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2081 ++ else if (bfqq->raising_coeff > 1)
2082 ++ sl = sl * 3;
2083 ++ bfqd->last_idling_start = ktime_get();
2084 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2085 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
2086 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2087 ++}
2088 ++
2089 ++/*
2090 ++ * Set the maximum time for the active queue to consume its
2091 ++ * budget. This prevents seeky processes from lowering the disk
2092 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
2093 ++ */
2094 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2095 ++{
2096 ++ struct bfq_queue *bfqq = bfqd->active_queue;
2097 ++ unsigned int timeout_coeff;
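++ /*
++ * The budget timeout is stretched in proportion to the queue weight
++ * (weight / orig_weight, i.e., the raising coefficient for a
++ * weight-raised queue), except for soft real-time raising, where it
++ * is left unchanged.
++ */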
2098 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2099 ++ timeout_coeff = 1;
2100 ++ else
2101 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2102 ++
2103 ++ bfqd->last_budget_start = ktime_get();
2104 ++
2105 ++ bfq_clear_bfqq_budget_new(bfqq);
2106 ++ bfqq->budget_timeout = jiffies +
2107 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2108 ++
2109 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2110 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2111 ++ timeout_coeff));
2112 ++}
2113 ++
2114 ++/*
2115 ++ * Move request from internal lists to the request queue dispatch list.
2116 ++ */
2117 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2118 ++{
2119 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2120 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2121 ++
2122 ++ bfq_remove_request(rq);
2123 ++ bfqq->dispatched++;
2124 ++ elv_dispatch_sort(q, rq);
2125 ++
2126 ++ if (bfq_bfqq_sync(bfqq))
2127 ++ bfqd->sync_flight++;
2128 ++}
2129 ++
2130 ++/*
2131 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
2132 ++ */
2133 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2134 ++{
2135 ++ struct request *rq = NULL;
2136 ++
2137 ++ if (bfq_bfqq_fifo_expire(bfqq))
2138 ++ return NULL;
2139 ++
2140 ++ bfq_mark_bfqq_fifo_expire(bfqq);
2141 ++
2142 ++ if (list_empty(&bfqq->fifo))
2143 ++ return NULL;
2144 ++
2145 ++ rq = rq_entry_fifo(bfqq->fifo.next);
2146 ++
2147 ++ if (time_before(jiffies, rq_fifo_time(rq)))
2148 ++ return NULL;
2149 ++
2150 ++ return rq;
2151 ++}
2152 ++
2153 ++/*
2154 ++ * Must be called with the queue_lock held.
2155 ++ */
2156 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
2157 ++{
2158 ++ int process_refs, io_refs;
2159 ++
2160 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2161 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2162 ++ BUG_ON(process_refs < 0);
2163 ++ return process_refs;
2164 ++}
2165 ++
2166 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2167 ++{
2168 ++ int process_refs, new_process_refs;
2169 ++ struct bfq_queue *__bfqq;
2170 ++
2171 ++ /*
2172 ++ * If there are no process references on the new_bfqq, then it is
2173 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2174 ++ * may have dropped their last reference (not just their last process
2175 ++ * reference).
2176 ++ */
2177 ++ if (!bfqq_process_refs(new_bfqq))
2178 ++ return;
2179 ++
2180 ++ /* Avoid a circular list and skip interim queue merges. */
2181 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
2182 ++ if (__bfqq == bfqq)
2183 ++ return;
2184 ++ new_bfqq = __bfqq;
2185 ++ }
2186 ++
2187 ++ process_refs = bfqq_process_refs(bfqq);
2188 ++ new_process_refs = bfqq_process_refs(new_bfqq);
2189 ++ /*
2190 ++ * If the process for the bfqq has gone away, there is no
2191 ++ * sense in merging the queues.
2192 ++ */
2193 ++ if (process_refs == 0 || new_process_refs == 0)
2194 ++ return;
2195 ++
2196 ++ /*
2197 ++ * Merge in the direction of the lesser amount of work.
2198 ++ */
2199 ++ if (new_process_refs >= process_refs) {
2200 ++ bfqq->new_bfqq = new_bfqq;
2201 ++ atomic_add(process_refs, &new_bfqq->ref);
2202 ++ } else {
2203 ++ new_bfqq->new_bfqq = bfqq;
2204 ++ atomic_add(new_process_refs, &bfqq->ref);
2205 ++ }
2206 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2207 ++ new_bfqq->pid);
2208 ++}
2209 ++
2210 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2211 ++{
2212 ++ struct bfq_entity *entity = &bfqq->entity;
2213 ++ return entity->budget - entity->service;
2214 ++}
2215 ++
2216 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2217 ++{
2218 ++ BUG_ON(bfqq != bfqd->active_queue);
2219 ++
2220 ++ __bfq_bfqd_reset_active(bfqd);
2221 ++
2222 ++ /*
2223 ++ * If this bfqq is shared between multiple processes, check
2224 ++ * to make sure that those processes are still issuing I/Os
2225 ++ * within the mean seek distance. If not, it may be time to
2226 ++ * break the queues apart again.
2227 ++ */
2228 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2229 ++ bfq_mark_bfqq_split_coop(bfqq);
2230 ++
2231 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2232 ++ /*
2233 ++ * Overload the budget_timeout field to record the time at
2234 ++ * which the queue became empty (no backlog); this timestamp
2235 ++ * is used by the weight-raising mechanism.
2236 ++ */
2237 ++ bfqq->budget_timeout = jiffies;
2238 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2239 ++ } else {
2240 ++ bfq_activate_bfqq(bfqd, bfqq);
2241 ++ /*
2242 ++ * Resort priority tree of potential close cooperators.
2243 ++ */
2244 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2245 ++ }
2246 ++}
2247 ++
2248 ++/**
2249 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2250 ++ * @bfqd: device data.
2251 ++ * @bfqq: queue to update.
2252 ++ * @reason: reason for expiration.
2253 ++ *
2254 ++ * Handle the feedback on @bfqq budget. See the body for detailed
2255 ++ * comments.
2256 ++ */
2257 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2258 ++ struct bfq_queue *bfqq,
2259 ++ enum bfqq_expiration reason)
2260 ++{
2261 ++ struct request *next_rq;
2262 ++ unsigned long budget, min_budget;
2263 ++
2264 ++ budget = bfqq->max_budget;
2265 ++ min_budget = bfq_min_budget(bfqd);
2266 ++
2267 ++ BUG_ON(bfqq != bfqd->active_queue);
2268 ++
2269 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2270 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2271 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2272 ++ budget, bfq_min_budget(bfqd));
2273 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2274 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
2275 ++
2276 ++ if (bfq_bfqq_sync(bfqq)) {
2277 ++ switch (reason) {
2278 ++ /*
2279 ++ * Caveat: in all the following cases we trade latency
2280 ++ * for throughput.
2281 ++ */
2282 ++ case BFQ_BFQQ_TOO_IDLE:
2283 ++ /*
2284 ++ * This is the only case where we may reduce
2285 ++ * the budget: if there is no request of the
2286 ++ * process still waiting for completion, then
2287 ++ * we assume (tentatively) that the timer has
2288 ++ * expired because the batch of requests of
2289 ++ * the process could have been served with a
2290 ++ * smaller budget. Hence, betting that the
2291 ++ * process will behave in the same way when it
2292 ++ * becomes backlogged again, we reduce its
2293 ++ * next budget. As long as we guess right,
2294 ++ * this budget cut reduces the latency
2295 ++ * experienced by the process.
2296 ++ *
2297 ++ * However, if there are still outstanding
2298 ++ * requests, then the process may have not yet
2299 ++ * issued its next request just because it is
2300 ++ * still waiting for the completion of some of
2301 ++ * the still outstanding ones. So in this
2302 ++ * subcase we do not reduce its budget, on the
2303 ++ * contrary we increase it to possibly boost
2304 ++ * the throughput, as discussed in the
2305 ++ * comments to the BUDGET_TIMEOUT case.
2306 ++ */
2307 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
2308 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2309 ++ else {
2310 ++ if (budget > 5 * min_budget)
2311 ++ budget -= 4 * min_budget;
2312 ++ else
2313 ++ budget = min_budget;
2314 ++ }
2315 ++ break;
2316 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
2317 ++ /*
2318 ++ * We double the budget here because: 1) it
2319 ++ * gives the chance to boost the throughput if
2320 ++ * this is not a seeky process (which may have
2321 ++ * bumped into this timeout because of, e.g.,
2322 ++ * ZBR), 2) together with charge_full_budget
2323 ++ * it helps give seeky processes higher
2324 ++ * timestamps, and hence be served less
2325 ++ * frequently.
2326 ++ */
2327 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2328 ++ break;
2329 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2330 ++ /*
2331 ++ * The process still has backlog, and did not
2332 ++ * let either the budget timeout or the disk
2333 ++ * idling timeout expire. Hence it is not
2334 ++ * seeky, has a short thinktime and may be
2335 ++ * happy with a higher budget too. So
2336 ++ * definitely increase the budget of this good
2337 ++ * candidate to boost the disk throughput.
2338 ++ */
2339 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
2340 ++ break;
2341 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
2342 ++ /*
2343 ++ * Leave the budget unchanged.
2344 ++ */
2345 ++ default:
2346 ++ return;
2347 ++ }
2348 ++ } else /* async queue */
2349 ++ /* async queues get always the maximum possible budget
2350 ++ * (their ability to dispatch is limited by
2351 ++ * @bfqd->bfq_max_budget_async_rq).
2352 ++ */
2353 ++ budget = bfqd->bfq_max_budget;
2354 ++
2355 ++ bfqq->max_budget = budget;
2356 ++
2357 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2358 ++ bfqq->max_budget > bfqd->bfq_max_budget)
2359 ++ bfqq->max_budget = bfqd->bfq_max_budget;
2360 ++
2361 ++ /*
2362 ++ * Make sure that we have enough budget for the next request.
2363 ++ * Since the finish time of the bfqq must be kept in sync with
2364 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
2365 ++ * update.
2366 ++ */
2367 ++ next_rq = bfqq->next_rq;
2368 ++ if (next_rq != NULL)
2369 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2370 ++ bfq_serv_to_charge(next_rq, bfqq));
2371 ++ else
2372 ++ bfqq->entity.budget = bfqq->max_budget;
2373 ++
2374 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2375 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2376 ++ bfqq->entity.budget);
2377 ++}
2378 ++
2379 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2380 ++{
2381 ++ unsigned long max_budget;
2382 ++
2383 ++ /*
2384 ++ * The max_budget calculated when autotuning is equal to the
2385 ++ * number of sectors transferred in timeout_sync at the
2386 ++ * estimated peak rate.
2387 ++ */
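++ /*
++ * peak_rate is in sectors/usec, left-shifted by BFQ_RATE_SHIFT for
++ * fixed-point precision, and timeout is in ms: multiplying by 1000
++ * converts ms to usec, and the final right shift removes the
++ * fixed-point scaling, leaving max_budget in sectors.
++ */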
2388 ++ max_budget = (unsigned long)(peak_rate * 1000 *
2389 ++ timeout >> BFQ_RATE_SHIFT);
2390 ++
2391 ++ return max_budget;
2392 ++}
2393 ++
2394 ++/*
2395 ++ * In addition to updating the peak rate, checks whether the process
2396 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
2397 ++ * to the budget timeout, to reduce the amount of service provided to
2398 ++ * seeky processes, and hence reduce their chances of lowering the
2399 ++ * throughput. See the code for more details.
2400 ++ */
2401 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2402 ++ int compensate, enum bfqq_expiration reason)
2403 ++{
2404 ++ u64 bw, usecs, expected, timeout;
2405 ++ ktime_t delta;
2406 ++ int update = 0;
2407 ++
2408 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2409 ++ return 0;
2410 ++
2411 ++ if (compensate)
2412 ++ delta = bfqd->last_idling_start;
2413 ++ else
2414 ++ delta = ktime_get();
2415 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
2416 ++ usecs = ktime_to_us(delta);
2417 ++
2418 ++ /* Don't trust short/unrealistic values. */
2419 ++ if (usecs < 100 || usecs >= LONG_MAX)
2420 ++ return 0;
2421 ++
2422 ++ /*
2423 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
2424 ++ * value to store the peak rate, in sectors per usec in fixed
2425 ++ * point math. We do so to have enough precision in the estimate
2426 ++ * and to avoid overflows.
2427 ++ */
2428 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2429 ++ do_div(bw, (unsigned long)usecs);
2430 ++
2431 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2432 ++
2433 ++ /*
2434 ++ * Use only long (> 20ms) intervals to filter out spikes for
2435 ++ * the peak rate estimation.
2436 ++ */
2437 ++ if (usecs > 20000) {
2438 ++ if (bw > bfqd->peak_rate ||
2439 ++ (!BFQQ_SEEKY(bfqq) &&
2440 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2441 ++ bfq_log(bfqd, "measured bw =%llu", bw);
2442 ++ /*
2443 ++ * To smooth oscillations use a low-pass filter with
2444 ++ * alpha=7/8, i.e.,
2445 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
2446 ++ */
2447 ++ do_div(bw, 8);
2448 ++ if (bw == 0)
2449 ++ return 0;
2450 ++ bfqd->peak_rate *= 7;
2451 ++ do_div(bfqd->peak_rate, 8);
2452 ++ bfqd->peak_rate += bw;
2453 ++ update = 1;
2454 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2455 ++ }
2456 ++
2457 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2458 ++
2459 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2460 ++ bfqd->peak_rate_samples++;
2461 ++
2462 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2463 ++ update && bfqd->bfq_user_max_budget == 0) {
2464 ++ bfqd->bfq_max_budget =
2465 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2466 ++ bfq_log(bfqd, "new max_budget=%lu",
2467 ++ bfqd->bfq_max_budget);
2468 ++ }
2469 ++ }
2470 ++
2471 ++ /*
2472 ++ * If the process has been served for a too short time
2473 ++ * interval to let its possible sequential accesses prevail on
2474 ++ * the initial seek time needed to move the disk head on the
2475 ++ * first sector it requested, then give the process a chance
2476 ++ * and for the moment return false.
2477 ++ */
2478 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2479 ++ return 0;
2480 ++
2481 ++ /*
2482 ++ * A process is considered ``slow'' (i.e., seeky, so that we
2483 ++ * cannot treat it fairly in the service domain, as it would
2484 ++ * slow down the other processes too much) if, when a slice
2485 ++ * ends for whatever reason, it has received service at a
2486 ++ * rate that would not be high enough to complete the budget
2487 ++ * before the budget timeout expiration.
2488 ++ */
2489 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2490 ++
2491 ++ /*
2492 ++ * Caveat: processes doing IO in the slower disk zones will
2493 ++ * tend to be slow(er) even if not seeky. And the estimated
2494 ++ * peak rate will actually be an average over the disk
2495 ++ * surface. Hence, to not be too harsh with unlucky processes,
2496 ++ * we keep a budget/3 margin of safety before declaring a
2497 ++ * process slow.
2498 ++ */
2499 ++ return expected > (4 * bfqq->entity.budget) / 3;
2500 ++}
2501 ++
2502 ++/**
2503 ++ * bfq_bfqq_expire - expire a queue.
2504 ++ * @bfqd: device owning the queue.
2505 ++ * @bfqq: the queue to expire.
2506 ++ * @compensate: if true, compensate for the time spent idling.
2507 ++ * @reason: the reason causing the expiration.
2508 ++ *
2510 ++ * If the process associated to the queue is slow (i.e., seeky), or in
2511 ++ * case of budget timeout, or, finally, if it is async, we
2512 ++ * artificially charge it an entire budget (independently of the
2513 ++ * actual service it received). As a consequence, the queue will get
2514 ++ * higher timestamps than the correct ones upon reactivation, and
2515 ++ * hence it will be rescheduled as if it had received more service
2516 ++ * than what it actually received. In the end, this class of processes
2517 ++ * will receive less service in proportion to how slowly they consume
2518 ++ * their budgets (and hence how seriously they tend to lower the
2519 ++ * throughput).
2520 ++ *
2521 ++ * In contrast, when a queue expires because it has been idling for
2522 ++ * too long or because it exhausted its budget, we do not touch the
2523 ++ * amount of service it has received. Hence when the queue will be
2524 ++ * reactivated and its timestamps updated, the latter will be in sync
2525 ++ * with the actual service received by the queue until expiration.
2526 ++ *
2527 ++ * Charging a full budget to the first type of queues and the exact
2528 ++ * service to the others has the effect of using the WF2Q+ policy to
2529 ++ * schedule the former on a timeslice basis, without violating the
2530 ++ * service domain guarantees of the latter.
2531 ++ */
2532 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
2533 ++ struct bfq_queue *bfqq,
2534 ++ int compensate,
2535 ++ enum bfqq_expiration reason)
2536 ++{
2537 ++ int slow;
2538 ++ BUG_ON(bfqq != bfqd->active_queue);
2539 ++
2540 ++ /* Update disk peak rate for autotuning and check whether the
2541 ++ * process is slow (see bfq_update_peak_rate).
2542 ++ */
2543 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2544 ++
2545 ++ /*
2546 ++ * As above explained, 'punish' slow (i.e., seeky), timed-out
2547 ++ * and async queues, to favor sequential sync workloads.
2548 ++ *
2549 ++ * Processes doing IO in the slower disk zones will tend to be
2550 ++ * slow(er) even if not seeky. Hence, since the estimated peak
2551 ++ * rate is actually an average over the disk surface, these
2552 ++ * processes may timeout just for bad luck. To avoid punishing
2553 ++ * them we do not charge a full budget to a process that
2554 ++ * succeeded in consuming at least 2/3 of its budget.
2555 ++ */
2556 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2557 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2558 ++ bfq_bfqq_charge_full_budget(bfqq);
2559 ++
2560 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2561 ++ bfqq->last_rais_start_finish = jiffies;
2562 ++
2563 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
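++ /*
++ * Having just received entity.service sectors, a queue that is to
++ * stay below the bfq_raising_max_softrt_rate limit (sectors/sec)
++ * cannot qualify as soft real-time again before service / rate
++ * seconds, i.e., HZ * service / rate jiffies from now.
++ */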
2564 ++ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
2565 ++ bfqq->soft_rt_next_start =
2566 ++ jiffies +
2567 ++ HZ * bfqq->entity.service /
2568 ++ bfqd->bfq_raising_max_softrt_rate;
2569 ++ else
2570 ++ bfqq->soft_rt_next_start = -1; /* infinity */
2571 ++ }
2572 ++ bfq_log_bfqq(bfqd, bfqq,
2573 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2574 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2575 ++
2576 ++ /* Increase, decrease or leave budget unchanged according to reason */
2577 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2578 ++ __bfq_bfqq_expire(bfqd, bfqq);
2579 ++}
2580 ++
2581 ++/*
2582 ++ * Budget timeout is not implemented through a dedicated timer, but
2583 ++ * just checked on request arrivals and completions, as well as on
2584 ++ * idle timer expirations.
2585 ++ */
2586 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2587 ++{
2588 ++ if (bfq_bfqq_budget_new(bfqq))
2589 ++ return 0;
2590 ++
2591 ++ if (time_before(jiffies, bfqq->budget_timeout))
2592 ++ return 0;
2593 ++
2594 ++ return 1;
2595 ++}
2596 ++
2597 ++/*
2598 ++ * If we expire a queue that is waiting for the arrival of a new
2599 ++ * request, we may prevent the fictitious timestamp backshifting that
2600 ++ * allows the guarantees of the queue to be preserved (see [1] for
2601 ++ * this tricky aspect). Hence we return true only if this condition
2602 ++ * does not hold, or if the queue is slow enough to deserve only to be
2603 ++ * kicked off for preserving a high throughput.
2604 ++*/
2605 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2606 ++{
2607 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
2608 ++ "may_budget_timeout: wr %d left %d timeout %d",
2609 ++ bfq_bfqq_wait_request(bfqq),
2610 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2611 ++ bfq_bfqq_budget_timeout(bfqq));
2612 ++
2613 ++ return (!bfq_bfqq_wait_request(bfqq) ||
2614 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2615 ++ &&
2616 ++ bfq_bfqq_budget_timeout(bfqq);
2617 ++}
2618 ++
2619 ++/*
2620 ++ * If the active queue is empty, but it is sync and either of the following
2621 ++ * conditions holds, then: 1) the queue must remain active and cannot be
2622 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
2623 ++ * of a new request for the queue. The conditions are:
2624 ++ * - the device is rotational and not performing NCQ, and the queue has its
2625 ++ * idle window set (in this case, waiting for a new request for the queue
2626 ++ * is likely to boost the disk throughput);
2627 ++ * - the queue is weight-raised (waiting for the request is necessary for
2628 ++ * providing the queue with fairness and latency guarantees).
2629 ++ */
2630 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
2631 ++ int budg_timeout)
2632 ++{
2633 ++ struct bfq_data *bfqd = bfqq->bfqd;
2634 ++
2635 ++ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
2636 ++ bfqd->bfq_slice_idle != 0 &&
2637 ++ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
2638 ++ !blk_queue_nonrot(bfqd->queue))
2639 ++ || bfqq->raising_coeff > 1) &&
2640 ++ (bfqd->rq_in_driver == 0 ||
2641 ++ budg_timeout ||
2642 ++ bfqq->raising_coeff > 1) &&
2643 ++ !bfq_close_cooperator(bfqd, bfqq) &&
2644 ++ (!bfq_bfqq_coop(bfqq) ||
2645 ++ !bfq_bfqq_some_coop_idle(bfqq)) &&
2646 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
2647 ++}
2648 ++
2649 ++/*
2650 ++ * Select a queue for service. If we have a current active queue,
2651 ++ * check whether to continue servicing it, or retrieve and set a new one.
2652 ++ */
2653 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2654 ++{
2655 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
2656 ++ struct request *next_rq;
2657 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2658 ++ int budg_timeout;
2659 ++
2660 ++ bfqq = bfqd->active_queue;
2661 ++ if (bfqq == NULL)
2662 ++ goto new_queue;
2663 ++
2664 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
2665 ++
2666 ++ /*
2667 ++ * If another queue has a request waiting within our mean seek
2668 ++ * distance, let it run. The expire code will check for close
2669 ++ * cooperators and put the close queue at the front of the
2670 ++ * service tree. If possible, merge the expiring queue with the
2671 ++ * new bfqq.
2672 ++ */
2673 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
2674 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
2675 ++ bfq_setup_merge(bfqq, new_bfqq);
2676 ++
2677 ++ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
2678 ++ if (budg_timeout &&
2679 ++ !bfq_bfqq_must_idle(bfqq, budg_timeout))
2680 ++ goto expire;
2681 ++
2682 ++ next_rq = bfqq->next_rq;
2683 ++ /*
2684 ++ * If bfqq has requests queued and it has enough budget left to
2685 ++ * serve them, keep the queue, otherwise expire it.
2686 ++ */
2687 ++ if (next_rq != NULL) {
2688 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
2689 ++ bfq_bfqq_budget_left(bfqq)) {
2690 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2691 ++ goto expire;
2692 ++ } else {
2693 ++ /*
2694 ++ * The idle timer may be pending because we may not
2695 ++ * disable disk idling even when a new request arrives
2696 ++ */
2697 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
2698 ++ /*
2699 ++ * If we get here: 1) at least a new request
2700 ++ * has arrived but we have not disabled the
2701 ++ * timer because the request was too small,
2702 ++ * 2) then the block layer has unplugged the
2703 ++ * device, causing the dispatch to be invoked.
2704 ++ *
2705 ++ * Since the device is unplugged, now the
2706 ++ * requests are probably large enough to
2707 ++ * provide a reasonable throughput.
2708 ++ * So we disable idling.
2709 ++ */
2710 ++ bfq_clear_bfqq_wait_request(bfqq);
2711 ++ del_timer(&bfqd->idle_slice_timer);
2712 ++ }
2713 ++ if (new_bfqq == NULL)
2714 ++ goto keep_queue;
2715 ++ else
2716 ++ goto expire;
2717 ++ }
2718 ++ }
2719 ++
2720 ++ /*
2721 ++ * No requests pending. If there is no cooperator, and the active
2722 ++ * queue still has requests in flight or is idling for a new request,
2723 ++ * then keep it.
2724 ++ */
2725 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
2726 ++ (bfqq->dispatched != 0 &&
2727 ++ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
2728 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
2729 ++ bfqq = NULL;
2730 ++ goto keep_queue;
2731 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
2732 ++ /*
2733 ++ * Expiring the queue because there is a close cooperator,
2734 ++ * cancel timer.
2735 ++ */
2736 ++ bfq_clear_bfqq_wait_request(bfqq);
2737 ++ del_timer(&bfqd->idle_slice_timer);
2738 ++ }
2739 ++
2740 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2741 ++expire:
2742 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2743 ++new_queue:
2744 ++ bfqq = bfq_set_active_queue(bfqd, new_bfqq);
2745 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
2746 ++ bfqq != NULL ? bfqq->pid : 0);
2747 ++keep_queue:
2748 ++ return bfqq;
2749 ++}
2750 ++
2751 ++static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2752 ++{
2753 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
2754 ++ struct bfq_entity *entity = &bfqq->entity;
2755 ++
2756 ++ bfq_log_bfqq(bfqd, bfqq,
2757 ++ "raising period dur %u/%u msec, "
2758 ++ "old raising coeff %u, w %d(%d)",
2759 ++ jiffies_to_msecs(jiffies -
2760 ++ bfqq->last_rais_start_finish),
2761 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
2762 ++ bfqq->raising_coeff,
2763 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
2764 ++
2765 ++ BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
2766 ++ entity->orig_weight * bfqq->raising_coeff);
2767 ++ if (entity->ioprio_changed)
2768 ++ bfq_log_bfqq(bfqd, bfqq,
2769 ++ "WARN: pending prio change");
2770 ++ /*
2771 ++ * If too much time has elapsed from the beginning
2772 ++ * of this weight-raising period and process is not soft
2773 ++ * of this weight-raising period and the process is not soft
2774 ++ * real-time, stop it.
2775 ++ if (jiffies - bfqq->last_rais_start_finish >
2776 ++ bfqq->raising_cur_max_time) {
2777 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
2778 ++ bfqq->soft_rt_next_start < jiffies;
2779 ++
2780 ++ bfqq->last_rais_start_finish = jiffies;
2781 ++ if (soft_rt)
2782 ++ bfqq->raising_cur_max_time =
2783 ++ bfqd->bfq_raising_rt_max_time;
2784 ++ else {
2785 ++ bfq_log_bfqq(bfqd, bfqq,
2786 ++ "wrais ending at %llu msec, "
2787 ++ "rais_max_time %u",
2788 ++ bfqq->last_rais_start_finish,
2789 ++ jiffies_to_msecs(bfqq->
2790 ++ raising_cur_max_time));
2791 ++ bfq_bfqq_end_raising(bfqq);
2792 ++ __bfq_entity_update_weight_prio(
2793 ++ bfq_entity_service_tree(entity),
2794 ++ entity);
2795 ++ }
2796 ++ }
2797 ++ }
2798 ++}
2799 ++
2800 ++/*
2801 ++ * Dispatch one request from bfqq, moving it to the request queue
2802 ++ * dispatch list.
2803 ++ */
2804 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
2805 ++ struct bfq_queue *bfqq)
2806 ++{
2807 ++ int dispatched = 0;
2808 ++ struct request *rq;
2809 ++ unsigned long service_to_charge;
2810 ++
2811 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2812 ++
2813 ++ /* Follow expired path, else get first next available. */
2814 ++ rq = bfq_check_fifo(bfqq);
2815 ++ if (rq == NULL)
2816 ++ rq = bfqq->next_rq;
2817 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
2818 ++
2819 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
2820 ++ /*
2821 ++ * This may happen if the next rq is chosen
2822 ++ * in fifo order instead of sector order.
2823 ++ * The budget is properly dimensioned
2824 ++ * to be always sufficient to serve the next request
2825 ++ * only if it is chosen in sector order. The reason is
2826 ++ * that it would be quite inefficient and of little use
2827 ++ * to always make sure that the budget is large enough
2828 ++ * to serve even the possible next rq in fifo order.
2829 ++ * In fact, requests are seldom served in fifo order.
2830 ++ *
2831 ++ * Expire the queue for budget exhaustion, and
2832 ++ * make sure that the next act_budget is enough
2833 ++ * to serve the next request, even if it comes
2834 ++ * from the fifo expired path.
2835 ++ */
2836 ++ bfqq->next_rq = rq;
2837 ++ /*
2838 ++ * Since this dispatch failed, make sure that
2839 ++ * a new one will be performed
2840 ++ */
2841 ++ if (!bfqd->rq_in_driver)
2842 ++ bfq_schedule_dispatch(bfqd);
2843 ++ goto expire;
2844 ++ }
2845 ++
2846 ++ /* Finally, insert request into driver dispatch list. */
2847 ++ bfq_bfqq_served(bfqq, service_to_charge);
2848 ++ bfq_dispatch_insert(bfqd->queue, rq);
2849 ++
2850 ++ update_raising_data(bfqd, bfqq);
2851 ++
2852 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
2853 ++ "budg left %lu",
2854 ++ blk_rq_sectors(rq),
2855 ++ (long long unsigned)blk_rq_pos(rq),
2856 ++ bfq_bfqq_budget_left(bfqq));
2857 ++
2858 ++ dispatched++;
2859 ++
2860 ++ if (bfqd->active_bic == NULL) {
2861 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
2862 ++ bfqd->active_bic = RQ_BIC(rq);
2863 ++ }
2864 ++
2865 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
2866 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
2867 ++ bfq_class_idle(bfqq)))
2868 ++ goto expire;
2869 ++
2870 ++ return dispatched;
2871 ++
2872 ++expire:
2873 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
2874 ++ return dispatched;
2875 ++}
2876 ++
2877 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
2878 ++{
2879 ++ int dispatched = 0;
2880 ++
2881 ++ while (bfqq->next_rq != NULL) {
2882 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
2883 ++ dispatched++;
2884 ++ }
2885 ++
2886 ++ BUG_ON(!list_empty(&bfqq->fifo));
2887 ++ return dispatched;
2888 ++}
2889 ++
2890 ++/*
2891 ++ * Drain our current requests. Used for barriers and when switching
2892 ++ * io schedulers on-the-fly.
2893 ++ */
2894 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
2895 ++{
2896 ++ struct bfq_queue *bfqq, *n;
2897 ++ struct bfq_service_tree *st;
2898 ++ int dispatched = 0;
2899 ++
2900 ++ bfqq = bfqd->active_queue;
2901 ++ if (bfqq != NULL)
2902 ++ __bfq_bfqq_expire(bfqd, bfqq);
2903 ++
2904 ++ /*
2905 ++ * Loop through classes, and be careful to leave the scheduler
2906 ++ * in a consistent state, as feedback mechanisms and vtime
2907 ++ * updates cannot be disabled during the process.
2908 ++ */
2909 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
2910 ++ st = bfq_entity_service_tree(&bfqq->entity);
2911 ++
2912 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
2913 ++ bfqq->max_budget = bfq_max_budget(bfqd);
2914 ++
2915 ++ bfq_forget_idle(st);
2916 ++ }
2917 ++
2918 ++ BUG_ON(bfqd->busy_queues != 0);
2919 ++
2920 ++ return dispatched;
2921 ++}
2922 ++
2923 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
2924 ++{
2925 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2926 ++ struct bfq_queue *bfqq;
2927 ++ int max_dispatch;
2928 ++
2929 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
2930 ++ if (bfqd->busy_queues == 0)
2931 ++ return 0;
2932 ++
2933 ++ if (unlikely(force))
2934 ++ return bfq_forced_dispatch(bfqd);
2935 ++
2936 ++ if ((bfqq = bfq_select_queue(bfqd)) == NULL)
2937 ++ return 0;
2938 ++
2939 ++ max_dispatch = bfqd->bfq_quantum;
2940 ++ if (bfq_class_idle(bfqq))
2941 ++ max_dispatch = 1;
2942 ++
2943 ++ if (!bfq_bfqq_sync(bfqq))
2944 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
2945 ++
2946 ++ if (bfqq->dispatched >= max_dispatch) {
2947 ++ if (bfqd->busy_queues > 1)
2948 ++ return 0;
2949 ++ if (bfqq->dispatched >= 4 * max_dispatch)
2950 ++ return 0;
2951 ++ }
2952 ++
2953 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
2954 ++ return 0;
2955 ++
2956 ++ bfq_clear_bfqq_wait_request(bfqq);
2957 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2958 ++
2959 ++ if (!bfq_dispatch_request(bfqd, bfqq))
2960 ++ return 0;
2961 ++
2962 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d "
2963 ++ "(max_disp %d)", bfqq->pid, max_dispatch);
2964 ++
2965 ++ return 1;
2966 ++}
2967 ++
2968 ++/*
2969 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
2970 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
2971 ++ *
2972 ++ * Queue lock must be held here.
2973 ++ */
2974 ++static void bfq_put_queue(struct bfq_queue *bfqq)
2975 ++{
2976 ++ struct bfq_data *bfqd = bfqq->bfqd;
2977 ++
2978 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
2979 ++
2980 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
2981 ++ atomic_read(&bfqq->ref));
2982 ++ if (!atomic_dec_and_test(&bfqq->ref))
2983 ++ return;
2984 ++
2985 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
2986 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
2987 ++ BUG_ON(bfqq->entity.tree != NULL);
2988 ++ BUG_ON(bfq_bfqq_busy(bfqq));
2989 ++ BUG_ON(bfqd->active_queue == bfqq);
2990 ++
2991 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
2992 ++
2993 ++ kmem_cache_free(bfq_pool, bfqq);
2994 ++}
2995 ++
2996 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
2997 ++{
2998 ++ struct bfq_queue *__bfqq, *next;
2999 ++
3000 ++ /*
3001 ++ * If this queue was scheduled to merge with another queue, be
3002 ++ * sure to drop the reference taken on that queue (and others in
3003 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3004 ++ */
3005 ++ __bfqq = bfqq->new_bfqq;
3006 ++ while (__bfqq) {
3007 ++ if (__bfqq == bfqq) {
3008 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
3009 ++ break;
3010 ++ }
3011 ++ next = __bfqq->new_bfqq;
3012 ++ bfq_put_queue(__bfqq);
3013 ++ __bfqq = next;
3014 ++ }
3015 ++}
3016 ++
3017 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3018 ++{
3019 ++ if (bfqq == bfqd->active_queue) {
3020 ++ __bfq_bfqq_expire(bfqd, bfqq);
3021 ++ bfq_schedule_dispatch(bfqd);
3022 ++ }
3023 ++
3024 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3025 ++ atomic_read(&bfqq->ref));
3026 ++
3027 ++ bfq_put_cooperator(bfqq);
3028 ++
3029 ++ bfq_put_queue(bfqq);
3030 ++}
3031 ++
3032 ++static void bfq_init_icq(struct io_cq *icq)
3033 ++{
3034 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3035 ++
3036 ++ bic->ttime.last_end_request = jiffies;
3037 ++}
3038 ++
3039 ++static void bfq_exit_icq(struct io_cq *icq)
3040 ++{
3041 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3042 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
3043 ++
3044 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
3045 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3046 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
3047 ++ }
3048 ++
3049 ++ if (bic->bfqq[BLK_RW_SYNC]) {
3050 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3051 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
3052 ++ }
3053 ++}
3054 ++
3055 ++/*
3056 ++ * Update the entity prio values; note that the new values will not
3057 ++ * be used until the next (re)activation.
3058 ++ */
3059 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3060 ++{
3061 ++ struct task_struct *tsk = current;
3062 ++ int ioprio_class;
3063 ++
3064 ++ if (!bfq_bfqq_prio_changed(bfqq))
3065 ++ return;
3066 ++
3067 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3068 ++ switch (ioprio_class) {
3069 ++ default:
3070 ++ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
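++ /* no break: fall through to IOPRIO_CLASS_NONE */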
3071 ++ case IOPRIO_CLASS_NONE:
3072 ++ /*
3073 ++ * No prio set, inherit CPU scheduling settings.
3074 ++ */
3075 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3076 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3077 ++ break;
3078 ++ case IOPRIO_CLASS_RT:
3079 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3080 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3081 ++ break;
3082 ++ case IOPRIO_CLASS_BE:
3083 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3084 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3085 ++ break;
3086 ++ case IOPRIO_CLASS_IDLE:
3087 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3088 ++ bfqq->entity.new_ioprio = 7;
3089 ++ bfq_clear_bfqq_idle_window(bfqq);
3090 ++ break;
3091 ++ }
3092 ++
3093 ++ bfqq->entity.ioprio_changed = 1;
3094 ++
3095 ++ /*
3096 ++ * Keep track of original prio settings in case we have to temporarily
3097 ++ * elevate the priority of this queue.
3098 ++ */
3099 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3100 ++ bfq_clear_bfqq_prio_changed(bfqq);
3101 ++}
3102 ++
3103 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3104 ++{
3105 ++ struct bfq_data *bfqd;
3106 ++ struct bfq_queue *bfqq, *new_bfqq;
3107 ++ struct bfq_group *bfqg;
3108 ++ unsigned long uninitialized_var(flags);
3109 ++ int ioprio = bic->icq.ioc->ioprio;
3110 ++
3111 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
3112 ++ /*
3113 ++ * This condition may trigger on a newly created bic; be sure to drop the
3114 ++ * lock before returning.
3115 ++ */
3116 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3117 ++ goto out;
3118 ++
3119 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
3120 ++ if (bfqq != NULL) {
3121 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3122 ++ sched_data);
3123 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3124 ++ GFP_ATOMIC);
3125 ++ if (new_bfqq != NULL) {
3126 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3127 ++ bfq_log_bfqq(bfqd, bfqq,
3128 ++ "changed_ioprio: bfqq %p %d",
3129 ++ bfqq, atomic_read(&bfqq->ref));
3130 ++ bfq_put_queue(bfqq);
3131 ++ }
3132 ++ }
3133 ++
3134 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
3135 ++ if (bfqq != NULL)
3136 ++ bfq_mark_bfqq_prio_changed(bfqq);
3137 ++
3138 ++ bic->ioprio = ioprio;
3139 ++
3140 ++out:
3141 ++ bfq_put_bfqd_unlock(bfqd, &flags);
3142 ++}
3143 ++
3144 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3145 ++ pid_t pid, int is_sync)
3146 ++{
3147 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3148 ++ INIT_LIST_HEAD(&bfqq->fifo);
3149 ++
3150 ++ atomic_set(&bfqq->ref, 0);
3151 ++ bfqq->bfqd = bfqd;
3152 ++
3153 ++ bfq_mark_bfqq_prio_changed(bfqq);
3154 ++
3155 ++ if (is_sync) {
3156 ++ if (!bfq_class_idle(bfqq))
3157 ++ bfq_mark_bfqq_idle_window(bfqq);
3158 ++ bfq_mark_bfqq_sync(bfqq);
3159 ++ }
3160 ++
3161 ++ /* Tentative initial value to trade off between throughput and latency */
3162 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3163 ++ bfqq->pid = pid;
3164 ++
3165 ++ bfqq->raising_coeff = 1;
3166 ++ bfqq->last_rais_start_finish = 0;
3167 ++ bfqq->soft_rt_next_start = -1;
3168 ++}
3169 ++
3170 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3171 ++ struct bfq_group *bfqg,
3172 ++ int is_sync,
3173 ++ struct bfq_io_cq *bic,
3174 ++ gfp_t gfp_mask)
3175 ++{
3176 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
3177 ++
3178 ++retry:
3179 ++ /* bic always exists here */
3180 ++ bfqq = bic_to_bfqq(bic, is_sync);
3181 ++
3182 ++ /*
3183 ++ * Always try a new alloc if we fall back to the OOM bfqq
3184 ++ * originally, since it should just be a temporary situation.
3185 ++ */
3186 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3187 ++ bfqq = NULL;
3188 ++ if (new_bfqq != NULL) {
3189 ++ bfqq = new_bfqq;
3190 ++ new_bfqq = NULL;
3191 ++ } else if (gfp_mask & __GFP_WAIT) {
3192 ++ spin_unlock_irq(bfqd->queue->queue_lock);
3193 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3194 ++ gfp_mask | __GFP_ZERO,
3195 ++ bfqd->queue->node);
3196 ++ spin_lock_irq(bfqd->queue->queue_lock);
3197 ++ if (new_bfqq != NULL)
3198 ++ goto retry;
3199 ++ } else {
3200 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
3201 ++ gfp_mask | __GFP_ZERO,
3202 ++ bfqd->queue->node);
3203 ++ }
3204 ++
3205 ++ if (bfqq != NULL) {
3206 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3207 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
3208 ++ } else {
3209 ++ bfqq = &bfqd->oom_bfqq;
3210 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3211 ++ }
3212 ++
3213 ++ bfq_init_prio_data(bfqq, bic);
3214 ++ bfq_init_entity(&bfqq->entity, bfqg);
3215 ++ }
3216 ++
3217 ++ if (new_bfqq != NULL)
3218 ++ kmem_cache_free(bfq_pool, new_bfqq);
3219 ++
3220 ++ return bfqq;
3221 ++}
3222 ++
3223 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3224 ++ struct bfq_group *bfqg,
3225 ++ int ioprio_class, int ioprio)
3226 ++{
3227 ++ switch (ioprio_class) {
3228 ++ case IOPRIO_CLASS_RT:
3229 ++ return &bfqg->async_bfqq[0][ioprio];
3230 ++ case IOPRIO_CLASS_NONE:
3231 ++ ioprio = IOPRIO_NORM;
3232 ++ /* fall through */
3233 ++ case IOPRIO_CLASS_BE:
3234 ++ return &bfqg->async_bfqq[1][ioprio];
3235 ++ case IOPRIO_CLASS_IDLE:
3236 ++ return &bfqg->async_idle_bfqq;
3237 ++ default:
3238 ++ BUG();
3239 ++ }
3240 ++}
3241 ++
3242 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3243 ++ struct bfq_group *bfqg, int is_sync,
3244 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
3245 ++{
3246 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3247 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3248 ++ struct bfq_queue **async_bfqq = NULL;
3249 ++ struct bfq_queue *bfqq = NULL;
3250 ++
3251 ++ if (!is_sync) {
3252 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3253 ++ ioprio);
3254 ++ bfqq = *async_bfqq;
3255 ++ }
3256 ++
3257 ++ if (bfqq == NULL)
3258 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3259 ++
3260 ++ /*
3261 ++ * Pin the queue now that it's allocated, scheduler exit will prune it.
3262 ++ */
3263 ++ if (!is_sync && *async_bfqq == NULL) {
3264 ++ atomic_inc(&bfqq->ref);
3265 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3266 ++ bfqq, atomic_read(&bfqq->ref));
3267 ++ *async_bfqq = bfqq;
3268 ++ }
3269 ++
3270 ++ atomic_inc(&bfqq->ref);
3271 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3272 ++ atomic_read(&bfqq->ref));
3273 ++ return bfqq;
3274 ++}
3275 ++
3276 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3277 ++ struct bfq_io_cq *bic)
3278 ++{
3279 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3280 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3281 ++
3282 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3283 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3284 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
3285 ++}
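/*
 * Illustrative aside: bfq_update_io_thinktime() above keeps a fixed-point
 * exponential moving average in which each new sample carries 1/8 of the
 * weight and values are scaled by 256 to retain fractional precision in
 * integer arithmetic. A minimal standalone sketch of the same update rule
 * (the helper name and parameters are hypothetical, not BFQ symbols):
 */
static inline void ewma_update(unsigned long *samples, unsigned long *total,
			       unsigned long *mean, unsigned long sample)
{
	*samples = (7 * *samples + 256) / 8;	  /* converges towards 256 */
	*total = (7 * *total + 256 * sample) / 8; /* scaled running total */
	*mean = (*total + 128) / *samples;	  /* rounded fixed-point mean */
}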
3286 ++
3287 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
3288 ++ struct bfq_queue *bfqq,
3289 ++ struct request *rq)
3290 ++{
3291 ++ sector_t sdist;
3292 ++ u64 total;
3293 ++
3294 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
3295 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3296 ++ else
3297 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3298 ++
3299 ++ /*
3300 ++ * Don't allow the seek distance to get too large from the
3301 ++ * odd fragment, pagein, etc.
3302 ++ */
3303 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3304 ++ sdist = 0;
3305 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
3306 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3307 ++ else
3308 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3309 ++
3310 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3311 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3312 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
3313 ++ do_div(total, bfqq->seek_samples);
3314 ++ if (bfq_bfqq_coop(bfqq)) {
3315 ++ /*
3316 ++ * If the mean seektime increases for a (non-seeky) shared
3317 ++ * queue, some cooperator is likely to be idling too much.
3318 ++		 * Conversely, if it decreases, some cooperator has
3319 ++		 * probably woken up.
3320 ++ *
3321 ++ */
3322 ++ if ((sector_t)total < bfqq->seek_mean)
3323 ++			bfq_mark_bfqq_some_coop_idle(bfqq);
3324 ++ else if ((sector_t)total > bfqq->seek_mean)
3325 ++			bfq_clear_bfqq_some_coop_idle(bfqq);
3326 ++ }
3327 ++ bfqq->seek_mean = (sector_t)total;
3328 ++
3329 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3330 ++ (u64)bfqq->seek_mean);
3331 ++}
3332 ++
3333 ++/*
3334 ++ * Disable idle window if the process thinks too long or seeks so much that
3335 ++ * it doesn't matter.
3336 ++ */
3337 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
3338 ++ struct bfq_queue *bfqq,
3339 ++ struct bfq_io_cq *bic)
3340 ++{
3341 ++ int enable_idle;
3342 ++
3343 ++ /* Don't idle for async or idle io prio class. */
3344 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3345 ++ return;
3346 ++
3347 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
3348 ++
3349 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3350 ++ bfqd->bfq_slice_idle == 0 ||
3351 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3352 ++ bfqq->raising_coeff == 1))
3353 ++ enable_idle = 0;
3354 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3355 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3356 ++ bfqq->raising_coeff == 1)
3357 ++ enable_idle = 0;
3358 ++ else
3359 ++ enable_idle = 1;
3360 ++ }
3361 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3362 ++ enable_idle);
3363 ++
3364 ++ if (enable_idle)
3365 ++ bfq_mark_bfqq_idle_window(bfqq);
3366 ++ else
3367 ++ bfq_clear_bfqq_idle_window(bfqq);
3368 ++}
3369 ++
3370 ++/*
3371 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
3372 ++ * something we should do about it.
3373 ++ */
3374 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3375 ++ struct request *rq)
3376 ++{
3377 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
3378 ++
3379 ++ if (rq->cmd_flags & REQ_META)
3380 ++ bfqq->meta_pending++;
3381 ++
3382 ++ bfq_update_io_thinktime(bfqd, bic);
3383 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
3384 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3385 ++ !BFQQ_SEEKY(bfqq))
3386 ++ bfq_update_idle_window(bfqd, bfqq, bic);
3387 ++
3388 ++ bfq_log_bfqq(bfqd, bfqq,
3389 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3390 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3391 ++ (long long unsigned)bfqq->seek_mean);
3392 ++
3393 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3394 ++
3395 ++ if (bfqq == bfqd->active_queue) {
3396 ++ /*
3397 ++ * If there is just this request queued and the request
3398 ++ * is small, just exit.
3399 ++ * In this way, if the disk is being idled to wait for a new
3400 ++ * request from the active queue, we avoid unplugging the
3401 ++ * device now.
3402 ++ *
3403 ++		 * By doing so, we spare the disk from being committed
3404 ++		 * to serving just a small request. Instead, we wait for
3405 ++ * the block layer to decide when to unplug the device:
3406 ++ * hopefully, new requests will be merged to this
3407 ++ * one quickly, then the device will be unplugged
3408 ++ * and larger requests will be dispatched.
3409 ++ */
3410 ++ if (bfqq->queued[rq_is_sync(rq)] == 1 &&
3411 ++ blk_rq_sectors(rq) < 32) {
3412 ++ return;
3413 ++ }
3414 ++ if (bfq_bfqq_wait_request(bfqq)) {
3415 ++ /*
3416 ++ * If we are waiting for a request for this queue, let
3417 ++ * it rip immediately and flag that we must not expire
3418 ++ * this queue just now.
3419 ++ */
3420 ++ bfq_clear_bfqq_wait_request(bfqq);
3421 ++ del_timer(&bfqd->idle_slice_timer);
3422 ++ /*
3423 ++ * Here we can safely expire the queue, in
3424 ++ * case of budget timeout, without wasting
3425 ++ * guarantees
3426 ++ */
3427 ++ if (bfq_bfqq_budget_timeout(bfqq))
3428 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
3429 ++ BFQ_BFQQ_BUDGET_TIMEOUT);
3430 ++ __blk_run_queue(bfqd->queue);
3431 ++ }
3432 ++ }
3433 ++}
3434 ++
3435 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
3436 ++{
3437 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3438 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3439 ++
3440 ++ assert_spin_locked(bfqd->queue->queue_lock);
3441 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3442 ++
3443 ++ bfq_add_rq_rb(rq);
3444 ++
3445 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3446 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
3447 ++
3448 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
3449 ++}
3450 ++
3451 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
3452 ++{
3453 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3454 ++ bfqd->rq_in_driver);
3455 ++
3456 ++ if (bfqd->hw_tag == 1)
3457 ++ return;
3458 ++
3459 ++ /*
3460 ++ * This sample is valid if the number of outstanding requests
3461 ++ * is large enough to allow a queueing behavior. Note that the
3462 ++ * sum is not exact, as it's not taking into account deactivated
3463 ++ * requests.
3464 ++ */
3465 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3466 ++ return;
3467 ++
3468 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3469 ++ return;
3470 ++
3471 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3472 ++ bfqd->max_rq_in_driver = 0;
3473 ++ bfqd->hw_tag_samples = 0;
3474 ++}
3475 ++
3476 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
3477 ++{
3478 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3479 ++ struct bfq_data *bfqd = bfqq->bfqd;
3480 ++ const int sync = rq_is_sync(rq);
3481 ++
3482 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3483 ++ blk_rq_sectors(rq), sync);
3484 ++
3485 ++ bfq_update_hw_tag(bfqd);
3486 ++
3487 ++ WARN_ON(!bfqd->rq_in_driver);
3488 ++ WARN_ON(!bfqq->dispatched);
3489 ++ bfqd->rq_in_driver--;
3490 ++ bfqq->dispatched--;
3491 ++
3492 ++ if (bfq_bfqq_sync(bfqq))
3493 ++ bfqd->sync_flight--;
3494 ++
3495 ++ if (sync)
3496 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3497 ++
3498 ++ /*
3499 ++ * If this is the active queue, check if it needs to be expired,
3500 ++ * or if we want to idle in case it has no pending requests.
3501 ++ */
3502 ++ if (bfqd->active_queue == bfqq) {
3503 ++ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
3504 ++ if (bfq_bfqq_budget_new(bfqq))
3505 ++ bfq_set_budget_timeout(bfqd);
3506 ++
3507 ++		/* Idling is also disabled for cooperation reasons:
3508 ++		 * 1) there is a close cooperator for the queue, or
3509 ++		 * 2) the queue is shared and some cooperator is likely
3510 ++		 * to be idle (in this case, by not arming the idle timer,
3511 ++		 * we try to slow down the queue, to prevent the zones
3512 ++		 * of the disk accessed by the active cooperators from
3513 ++		 * becoming too distant from the zone that will be accessed
3514 ++		 * by the currently idle cooperators).
3515 ++ */
3516 ++ if (bfq_bfqq_must_idle(bfqq, budg_timeout))
3517 ++ bfq_arm_slice_timer(bfqd);
3518 ++ else if (budg_timeout)
3519 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3520 ++ }
3521 ++
3522 ++ if (!bfqd->rq_in_driver)
3523 ++ bfq_schedule_dispatch(bfqd);
3524 ++}
3525 ++
3526 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3527 ++{
3528 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3529 ++ bfq_clear_bfqq_must_alloc(bfqq);
3530 ++ return ELV_MQUEUE_MUST;
3531 ++ }
3532 ++
3533 ++ return ELV_MQUEUE_MAY;
3534 ++}
3535 ++
3536 ++static int bfq_may_queue(struct request_queue *q, int rw)
3537 ++{
3538 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3539 ++ struct task_struct *tsk = current;
3540 ++ struct bfq_io_cq *bic;
3541 ++ struct bfq_queue *bfqq;
3542 ++
3543 ++ /*
3544 ++ * Don't force setup of a queue from here, as a call to may_queue
3545 ++ * does not necessarily imply that a request actually will be queued.
3546 ++	 * So just look up a possibly existing queue, or return 'may queue'
3547 ++ * if that fails.
3548 ++ */
3549 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3550 ++ if (bic == NULL)
3551 ++ return ELV_MQUEUE_MAY;
3552 ++
3553 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3554 ++ if (bfqq != NULL) {
3555 ++ bfq_init_prio_data(bfqq, bic);
3556 ++
3557 ++ return __bfq_may_queue(bfqq);
3558 ++ }
3559 ++
3560 ++ return ELV_MQUEUE_MAY;
3561 ++}
3562 ++
3563 ++/*
3564 ++ * Queue lock held here.
3565 ++ */
3566 ++static void bfq_put_request(struct request *rq)
3567 ++{
3568 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3569 ++
3570 ++ if (bfqq != NULL) {
3571 ++ const int rw = rq_data_dir(rq);
3572 ++
3573 ++ BUG_ON(!bfqq->allocated[rw]);
3574 ++ bfqq->allocated[rw]--;
3575 ++
3576 ++ rq->elv.priv[0] = NULL;
3577 ++ rq->elv.priv[1] = NULL;
3578 ++
3579 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3580 ++ bfqq, atomic_read(&bfqq->ref));
3581 ++ bfq_put_queue(bfqq);
3582 ++ }
3583 ++}
3584 ++
3585 ++static struct bfq_queue *
3586 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3587 ++ struct bfq_queue *bfqq)
3588 ++{
3589 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3590 ++ (long unsigned)bfqq->new_bfqq->pid);
3591 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3592 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3593 ++ bfq_put_queue(bfqq);
3594 ++ return bic_to_bfqq(bic, 1);
3595 ++}
3596 ++
3597 ++/*
3598 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3599 ++ * was the last process referring to said bfqq.
3600 ++ */
3601 ++static struct bfq_queue *
3602 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3603 ++{
3604 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3605 ++ if (bfqq_process_refs(bfqq) == 1) {
3606 ++ bfqq->pid = current->pid;
3607 ++ bfq_clear_bfqq_some_coop_idle(bfqq);
3608 ++ bfq_clear_bfqq_coop(bfqq);
3609 ++ bfq_clear_bfqq_split_coop(bfqq);
3610 ++ return bfqq;
3611 ++ }
3612 ++
3613 ++ bic_set_bfqq(bic, NULL, 1);
3614 ++
3615 ++ bfq_put_cooperator(bfqq);
3616 ++
3617 ++ bfq_put_queue(bfqq);
3618 ++ return NULL;
3619 ++}
3620 ++
3621 ++/*
3622 ++ * Allocate bfq data structures associated with this request.
3623 ++ */
3624 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
3625 ++ struct bio *bio, gfp_t gfp_mask)
3626 ++{
3627 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3628 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3629 ++ const int rw = rq_data_dir(rq);
3630 ++ const int is_sync = rq_is_sync(rq);
3631 ++ struct bfq_queue *bfqq;
3632 ++ struct bfq_group *bfqg;
3633 ++ unsigned long flags;
3634 ++
3635 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
3636 ++
3637 ++ bfq_changed_ioprio(bic);
3638 ++
3639 ++ spin_lock_irqsave(q->queue_lock, flags);
3640 ++
3641 ++ if (bic == NULL)
3642 ++ goto queue_fail;
3643 ++
3644 ++ bfqg = bfq_bic_update_cgroup(bic);
3645 ++
3646 ++new_queue:
3647 ++ bfqq = bic_to_bfqq(bic, is_sync);
3648 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3649 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3650 ++ bic_set_bfqq(bic, bfqq, is_sync);
3651 ++ } else {
3652 ++ /*
3653 ++ * If the queue was seeky for too long, break it apart.
3654 ++ */
3655 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
3656 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
3657 ++ bfqq = bfq_split_bfqq(bic, bfqq);
3658 ++ if (!bfqq)
3659 ++ goto new_queue;
3660 ++ }
3661 ++
3662 ++ /*
3663 ++ * Check to see if this queue is scheduled to merge with
3664 ++ * another closely cooperating queue. The merging of queues
3665 ++ * happens here as it must be done in process context.
3666 ++ * The reference on new_bfqq was taken in merge_bfqqs.
3667 ++ */
3668 ++ if (bfqq->new_bfqq != NULL)
3669 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
3670 ++ }
3671 ++
3672 ++ bfqq->allocated[rw]++;
3673 ++ atomic_inc(&bfqq->ref);
3674 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
3675 ++ atomic_read(&bfqq->ref));
3676 ++
3677 ++ rq->elv.priv[0] = bic;
3678 ++ rq->elv.priv[1] = bfqq;
3679 ++
3680 ++ spin_unlock_irqrestore(q->queue_lock, flags);
3681 ++
3682 ++ return 0;
3683 ++
3684 ++queue_fail:
3685 ++ bfq_schedule_dispatch(bfqd);
3686 ++ spin_unlock_irqrestore(q->queue_lock, flags);
3687 ++
3688 ++ return 1;
3689 ++}
3690 ++
3691 ++static void bfq_kick_queue(struct work_struct *work)
3692 ++{
3693 ++ struct bfq_data *bfqd =
3694 ++ container_of(work, struct bfq_data, unplug_work);
3695 ++ struct request_queue *q = bfqd->queue;
3696 ++
3697 ++ spin_lock_irq(q->queue_lock);
3698 ++ __blk_run_queue(q);
3699 ++ spin_unlock_irq(q->queue_lock);
3700 ++}
3701 ++
3702 ++/*
3703 ++ * Handler of the expiration of the timer running if the active_queue
3704 ++ * is idling inside its time slice.
3705 ++ */
3706 ++static void bfq_idle_slice_timer(unsigned long data)
3707 ++{
3708 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
3709 ++ struct bfq_queue *bfqq;
3710 ++ unsigned long flags;
3711 ++ enum bfqq_expiration reason;
3712 ++
3713 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
3714 ++
3715 ++ bfqq = bfqd->active_queue;
3716 ++ /*
3717 ++ * Theoretical race here: active_queue can be NULL or different
3718 ++ * from the queue that was idling if the timer handler spins on
3719 ++ * the queue_lock and a new request arrives for the current
3720 ++ * queue and there is a full dispatch cycle that changes the
3721 ++ * active_queue. This can hardly happen, but in the worst case
3722 ++ * we just expire a queue too early.
3723 ++ */
3724 ++ if (bfqq != NULL) {
3725 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
3726 ++ if (bfq_bfqq_budget_timeout(bfqq))
3727 ++ /*
3728 ++ * Also here the queue can be safely expired
3729 ++ * for budget timeout without wasting
3730 ++ * guarantees
3731 ++ */
3732 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3733 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3734 ++ /*
3735 ++ * The queue may not be empty upon timer expiration,
3736 ++ * because we may not disable the timer when the first
3737 ++ * request of the active queue arrives during
3738 ++ * disk idling
3739 ++ */
3740 ++ reason = BFQ_BFQQ_TOO_IDLE;
3741 ++ else
3742 ++ goto schedule_dispatch;
3743 ++
3744 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
3745 ++ }
3746 ++
3747 ++schedule_dispatch:
3748 ++ bfq_schedule_dispatch(bfqd);
3749 ++
3750 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
3751 ++}
3752 ++
3753 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
3754 ++{
3755 ++ del_timer_sync(&bfqd->idle_slice_timer);
3756 ++ cancel_work_sync(&bfqd->unplug_work);
3757 ++}
3758 ++
3759 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3760 ++ struct bfq_queue **bfqq_ptr)
3761 ++{
3762 ++ struct bfq_group *root_group = bfqd->root_group;
3763 ++ struct bfq_queue *bfqq = *bfqq_ptr;
3764 ++
3765 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3766 ++ if (bfqq != NULL) {
3767 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
3768 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3769 ++ bfqq, atomic_read(&bfqq->ref));
3770 ++ bfq_put_queue(bfqq);
3771 ++ *bfqq_ptr = NULL;
3772 ++ }
3773 ++}
3774 ++
3775 ++/*
3776 ++ * Release all the bfqg references to its async queues. If we are
3777 ++ * deallocating the group these queues may still contain requests, so
3778 ++ * we reparent them to the root cgroup (i.e., the only one that will
3779 ++ * exist for sure until all the requests on a device are gone).
3780 ++ */
3781 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3782 ++{
3783 ++ int i, j;
3784 ++
3785 ++ for (i = 0; i < 2; i++)
3786 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
3787 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3788 ++
3789 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3790 ++}
3791 ++
3792 ++static void bfq_exit_queue(struct elevator_queue *e)
3793 ++{
3794 ++ struct bfq_data *bfqd = e->elevator_data;
3795 ++ struct request_queue *q = bfqd->queue;
3796 ++ struct bfq_queue *bfqq, *n;
3797 ++
3798 ++ bfq_shutdown_timer_wq(bfqd);
3799 ++
3800 ++ spin_lock_irq(q->queue_lock);
3801 ++
3802 ++ BUG_ON(bfqd->active_queue != NULL);
3803 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3804 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
3805 ++
3806 ++ bfq_disconnect_groups(bfqd);
3807 ++ spin_unlock_irq(q->queue_lock);
3808 ++
3809 ++ bfq_shutdown_timer_wq(bfqd);
3810 ++
3811 ++ synchronize_rcu();
3812 ++
3813 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3814 ++
3815 ++ bfq_free_root_group(bfqd);
3816 ++ kfree(bfqd);
3817 ++}
3818 ++
3819 ++static int bfq_init_queue(struct request_queue *q)
3820 ++{
3821 ++ struct bfq_group *bfqg;
3822 ++ struct bfq_data *bfqd;
3823 ++
3824 ++ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3825 ++ if (bfqd == NULL)
3826 ++ return -ENOMEM;
3827 ++
3828 ++ /*
3829 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
3830 ++ * Grab a permanent reference to it, so that the normal code flow
3831 ++ * will not attempt to free it.
3832 ++ */
3833 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
3834 ++ atomic_inc(&bfqd->oom_bfqq.ref);
3835 ++
3836 ++ bfqd->queue = q;
3837 ++ q->elevator->elevator_data = bfqd;
3838 ++
3839 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
3840 ++ if (bfqg == NULL) {
3841 ++ kfree(bfqd);
3842 ++ return -ENOMEM;
3843 ++ }
3844 ++
3845 ++ bfqd->root_group = bfqg;
3846 ++
3847 ++ init_timer(&bfqd->idle_slice_timer);
3848 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
3849 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
3850 ++
3851 ++ bfqd->rq_pos_tree = RB_ROOT;
3852 ++
3853 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
3854 ++
3855 ++ INIT_LIST_HEAD(&bfqd->active_list);
3856 ++ INIT_LIST_HEAD(&bfqd->idle_list);
3857 ++
3858 ++ bfqd->hw_tag = -1;
3859 ++
3860 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
3861 ++
3862 ++ bfqd->bfq_quantum = bfq_quantum;
3863 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
3864 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
3865 ++ bfqd->bfq_back_max = bfq_back_max;
3866 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
3867 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
3868 ++ bfqd->bfq_class_idle_last_service = 0;
3869 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
3870 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
3871 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
3872 ++
3873 ++ bfqd->low_latency = true;
3874 ++
3875 ++ bfqd->bfq_raising_coeff = 20;
3876 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
3877 ++ bfqd->bfq_raising_max_time = 0;
3878 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
3879 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
3880 ++ bfqd->bfq_raising_max_softrt_rate = 7000;
3881 ++
3882 ++ /* Initially estimate the device's peak rate as the reference rate */
3883 ++ if (blk_queue_nonrot(bfqd->queue)) {
3884 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
3885 ++ bfqd->peak_rate = R_nonrot;
3886 ++ } else {
3887 ++ bfqd->RT_prod = R_rot * T_rot;
3888 ++ bfqd->peak_rate = R_rot;
3889 ++ }
3890 ++
3891 ++ return 0;
3892 ++}
3893 ++
3894 ++static void bfq_slab_kill(void)
3895 ++{
3896 ++ if (bfq_pool != NULL)
3897 ++ kmem_cache_destroy(bfq_pool);
3898 ++}
3899 ++
3900 ++static int __init bfq_slab_setup(void)
3901 ++{
3902 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
3903 ++ if (bfq_pool == NULL)
3904 ++ return -ENOMEM;
3905 ++ return 0;
3906 ++}
3907 ++
3908 ++static ssize_t bfq_var_show(unsigned int var, char *page)
3909 ++{
3910 ++ return sprintf(page, "%d\n", var);
3911 ++}
3912 ++
3913 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
3914 ++{
3915 ++ unsigned long new_val;
3916 ++ int ret = strict_strtoul(page, 10, &new_val);
3917 ++
3918 ++ if (ret == 0)
3919 ++ *var = new_val;
3920 ++
3921 ++ return count;
3922 ++}
3923 ++
3924 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
3925 ++{
3926 ++ struct bfq_data *bfqd = e->elevator_data;
3927 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
3928 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
3929 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
3930 ++}
3931 ++
3932 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
3933 ++{
3934 ++ struct bfq_queue *bfqq;
3935 ++ struct bfq_data *bfqd = e->elevator_data;
3936 ++ ssize_t num_char = 0;
3937 ++
3938 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
3939 ++ bfqd->queued);
3940 ++
3941 ++ spin_lock_irq(bfqd->queue->queue_lock);
3942 ++
3943 ++ num_char += sprintf(page + num_char, "Active:\n");
3944 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
3945 ++ num_char += sprintf(page + num_char,
3946 ++ "pid%d: weight %hu, nr_queued %d %d,"
3947 ++ " dur %d/%u\n",
3948 ++ bfqq->pid,
3949 ++ bfqq->entity.weight,
3950 ++ bfqq->queued[0],
3951 ++ bfqq->queued[1],
3952 ++ jiffies_to_msecs(jiffies -
3953 ++ bfqq->last_rais_start_finish),
3954 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
3955 ++ }
3956 ++
3957 ++ num_char += sprintf(page + num_char, "Idle:\n");
3958 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
3959 ++ num_char += sprintf(page + num_char,
3960 ++ "pid%d: weight %hu, dur %d/%u\n",
3961 ++ bfqq->pid,
3962 ++ bfqq->entity.weight,
3963 ++ jiffies_to_msecs(jiffies -
3964 ++ bfqq->last_rais_start_finish),
3965 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
3966 ++ }
3967 ++
3968 ++ spin_unlock_irq(bfqd->queue->queue_lock);
3969 ++
3970 ++ return num_char;
3971 ++}
3972 ++
3973 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
3974 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
3975 ++{ \
3976 ++ struct bfq_data *bfqd = e->elevator_data; \
3977 ++ unsigned int __data = __VAR; \
3978 ++ if (__CONV) \
3979 ++ __data = jiffies_to_msecs(__data); \
3980 ++ return bfq_var_show(__data, (page)); \
3981 ++}
3982 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
3983 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
3984 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
3985 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
3986 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
3987 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
3988 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
3989 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
3990 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
3991 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
3992 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
3993 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
3994 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
3995 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
3996 ++ 1);
3997 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
3998 ++ bfqd->bfq_raising_min_inter_arr_async,
3999 ++ 1);
4000 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4001 ++ bfqd->bfq_raising_max_softrt_rate, 0);
4002 ++#undef SHOW_FUNCTION
4003 ++
4004 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4005 ++static ssize_t \
4006 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4007 ++{ \
4008 ++ struct bfq_data *bfqd = e->elevator_data; \
4009 ++ unsigned long uninitialized_var(__data); \
4010 ++ int ret = bfq_var_store(&__data, (page), count); \
4011 ++ if (__data < (MIN)) \
4012 ++ __data = (MIN); \
4013 ++ else if (__data > (MAX)) \
4014 ++ __data = (MAX); \
4015 ++ if (__CONV) \
4016 ++ *(__PTR) = msecs_to_jiffies(__data); \
4017 ++ else \
4018 ++ *(__PTR) = __data; \
4019 ++ return ret; \
4020 ++}
4021 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4022 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4023 ++ INT_MAX, 1);
4024 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4025 ++ INT_MAX, 1);
4026 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4027 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4028 ++ INT_MAX, 0);
4029 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4030 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4031 ++ 1, INT_MAX, 0);
4032 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4033 ++ INT_MAX, 1);
4034 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4035 ++ INT_MAX, 0);
4036 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4037 ++ INT_MAX, 1);
4038 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4039 ++ INT_MAX, 1);
4040 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
4041 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4042 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4043 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4044 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4045 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4046 ++#undef STORE_FUNCTION
4047 ++
4048 ++/* do nothing for the moment */
4049 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
4050 ++ const char *page, size_t count)
4051 ++{
4052 ++ return count;
4053 ++}
4054 ++
4055 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4056 ++{
4057 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4058 ++
4059 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4060 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4061 ++ else
4062 ++ return bfq_default_max_budget;
4063 ++}
4064 ++
4065 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4066 ++ const char *page, size_t count)
4067 ++{
4068 ++ struct bfq_data *bfqd = e->elevator_data;
4069 ++ unsigned long uninitialized_var(__data);
4070 ++ int ret = bfq_var_store(&__data, (page), count);
4071 ++
4072 ++ if (__data == 0)
4073 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4074 ++ else {
4075 ++ if (__data > INT_MAX)
4076 ++ __data = INT_MAX;
4077 ++ bfqd->bfq_max_budget = __data;
4078 ++ }
4079 ++
4080 ++ bfqd->bfq_user_max_budget = __data;
4081 ++
4082 ++ return ret;
4083 ++}
4084 ++
4085 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4086 ++ const char *page, size_t count)
4087 ++{
4088 ++ struct bfq_data *bfqd = e->elevator_data;
4089 ++ unsigned long uninitialized_var(__data);
4090 ++ int ret = bfq_var_store(&__data, (page), count);
4091 ++
4092 ++ if (__data < 1)
4093 ++ __data = 1;
4094 ++ else if (__data > INT_MAX)
4095 ++ __data = INT_MAX;
4096 ++
4097 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4098 ++ if (bfqd->bfq_user_max_budget == 0)
4099 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4100 ++
4101 ++ return ret;
4102 ++}
4103 ++
4104 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4105 ++ const char *page, size_t count)
4106 ++{
4107 ++ struct bfq_data *bfqd = e->elevator_data;
4108 ++ unsigned long uninitialized_var(__data);
4109 ++ int ret = bfq_var_store(&__data, (page), count);
4110 ++
4111 ++ if (__data > 1)
4112 ++ __data = 1;
4113 ++ if (__data == 0 && bfqd->low_latency != 0)
4114 ++ bfq_end_raising(bfqd);
4115 ++ bfqd->low_latency = __data;
4116 ++
4117 ++ return ret;
4118 ++}
4119 ++
4120 ++#define BFQ_ATTR(name) \
4121 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4122 ++
4123 ++static struct elv_fs_entry bfq_attrs[] = {
4124 ++ BFQ_ATTR(quantum),
4125 ++ BFQ_ATTR(fifo_expire_sync),
4126 ++ BFQ_ATTR(fifo_expire_async),
4127 ++ BFQ_ATTR(back_seek_max),
4128 ++ BFQ_ATTR(back_seek_penalty),
4129 ++ BFQ_ATTR(slice_idle),
4130 ++ BFQ_ATTR(max_budget),
4131 ++ BFQ_ATTR(max_budget_async_rq),
4132 ++ BFQ_ATTR(timeout_sync),
4133 ++ BFQ_ATTR(timeout_async),
4134 ++ BFQ_ATTR(low_latency),
4135 ++ BFQ_ATTR(raising_coeff),
4136 ++ BFQ_ATTR(raising_max_time),
4137 ++ BFQ_ATTR(raising_rt_max_time),
4138 ++ BFQ_ATTR(raising_min_idle_time),
4139 ++ BFQ_ATTR(raising_min_inter_arr_async),
4140 ++ BFQ_ATTR(raising_max_softrt_rate),
4141 ++ BFQ_ATTR(weights),
4142 ++ __ATTR_NULL
4143 ++};
4144 ++
4145 ++static struct elevator_type iosched_bfq = {
4146 ++ .ops = {
4147 ++ .elevator_merge_fn = bfq_merge,
4148 ++ .elevator_merged_fn = bfq_merged_request,
4149 ++ .elevator_merge_req_fn = bfq_merged_requests,
4150 ++ .elevator_allow_merge_fn = bfq_allow_merge,
4151 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
4152 ++ .elevator_add_req_fn = bfq_insert_request,
4153 ++ .elevator_activate_req_fn = bfq_activate_request,
4154 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
4155 ++ .elevator_completed_req_fn = bfq_completed_request,
4156 ++ .elevator_former_req_fn = elv_rb_former_request,
4157 ++ .elevator_latter_req_fn = elv_rb_latter_request,
4158 ++ .elevator_init_icq_fn = bfq_init_icq,
4159 ++ .elevator_exit_icq_fn = bfq_exit_icq,
4160 ++ .elevator_set_req_fn = bfq_set_request,
4161 ++ .elevator_put_req_fn = bfq_put_request,
4162 ++ .elevator_may_queue_fn = bfq_may_queue,
4163 ++ .elevator_init_fn = bfq_init_queue,
4164 ++ .elevator_exit_fn = bfq_exit_queue,
4165 ++ },
4166 ++ .icq_size = sizeof(struct bfq_io_cq),
4167 ++ .icq_align = __alignof__(struct bfq_io_cq),
4168 ++ .elevator_attrs = bfq_attrs,
4169 ++ .elevator_name = "bfq",
4170 ++ .elevator_owner = THIS_MODULE,
4171 ++};
4172 ++
4173 ++static int __init bfq_init(void)
4174 ++{
4175 ++ /*
4176 ++ * Can be 0 on HZ < 1000 setups.
4177 ++ */
4178 ++ if (bfq_slice_idle == 0)
4179 ++ bfq_slice_idle = 1;
4180 ++
4181 ++ if (bfq_timeout_async == 0)
4182 ++ bfq_timeout_async = 1;
4183 ++
4184 ++ if (bfq_slab_setup())
4185 ++ return -ENOMEM;
4186 ++
4187 ++ elv_register(&iosched_bfq);
4188 ++
4189 ++ return 0;
4190 ++}
4191 ++
4192 ++static void __exit bfq_exit(void)
4193 ++{
4194 ++ elv_unregister(&iosched_bfq);
4195 ++ bfq_slab_kill();
4196 ++}
4197 ++
4198 ++module_init(bfq_init);
4199 ++module_exit(bfq_exit);
4200 ++
4201 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4202 ++MODULE_LICENSE("GPL");
4203 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4204 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4205 +new file mode 100644
4206 +index 0000000..03f8061
4207 +--- /dev/null
4208 ++++ b/block/bfq-sched.c
4209 +@@ -0,0 +1,1072 @@
4210 ++/*
4211 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
4212 ++ *
4213 ++ * Based on ideas and code from CFQ:
4214 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4215 ++ *
4216 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4217 ++ * Paolo Valente <paolo.valente@×××××××.it>
4218 ++ *
4219 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4220 ++ */
4221 ++
4222 ++#ifdef CONFIG_CGROUP_BFQIO
4223 ++#define for_each_entity(entity) \
4224 ++ for (; entity != NULL; entity = entity->parent)
4225 ++
4226 ++#define for_each_entity_safe(entity, parent) \
4227 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4228 ++
4229 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4230 ++ int extract,
4231 ++ struct bfq_data *bfqd);
4232 ++
4233 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
4234 ++{
4235 ++ struct bfq_entity *bfqg_entity;
4236 ++ struct bfq_group *bfqg;
4237 ++ struct bfq_sched_data *group_sd;
4238 ++
4239 ++ BUG_ON(next_active == NULL);
4240 ++
4241 ++ group_sd = next_active->sched_data;
4242 ++
4243 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4244 ++ /*
4245 ++ * bfq_group's my_entity field is not NULL only if the group
4246 ++ * is not the root group. We must not touch the root entity
4247 ++ * as it must never become an active entity.
4248 ++ */
4249 ++ bfqg_entity = bfqg->my_entity;
4250 ++ if (bfqg_entity != NULL)
4251 ++ bfqg_entity->budget = next_active->budget;
4252 ++}
4253 ++
4254 ++static int bfq_update_next_active(struct bfq_sched_data *sd)
4255 ++{
4256 ++ struct bfq_entity *next_active;
4257 ++
4258 ++ if (sd->active_entity != NULL)
4259 ++ /* will update/requeue at the end of service */
4260 ++ return 0;
4261 ++
4262 ++ /*
4263 ++ * NOTE: this can be improved in many ways, such as returning
4264 ++ * 1 (and thus propagating upwards the update) only when the
4265 ++ * budget changes, or caching the bfqq that will be scheduled
4266 ++	 * next from this subtree. For now we worry more about
4267 ++ * correctness than about performance...
4268 ++ */
4269 ++ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4270 ++ sd->next_active = next_active;
4271 ++
4272 ++ if (next_active != NULL)
4273 ++ bfq_update_budget(next_active);
4274 ++
4275 ++ return 1;
4276 ++}
4277 ++
4278 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4279 ++ struct bfq_entity *entity)
4280 ++{
4281 ++ BUG_ON(sd->next_active != entity);
4282 ++}
4283 ++#else
4284 ++#define for_each_entity(entity) \
4285 ++ for (; entity != NULL; entity = NULL)
4286 ++
4287 ++#define for_each_entity_safe(entity, parent) \
4288 ++ for (parent = NULL; entity != NULL; entity = parent)
4289 ++
4290 ++static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4291 ++{
4292 ++ return 0;
4293 ++}
4294 ++
4295 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4296 ++ struct bfq_entity *entity)
4297 ++{
4298 ++}
4299 ++
4300 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
4301 ++{
4302 ++}
4303 ++#endif
4304 ++
4305 ++/*
4306 ++ * Shift for timestamp calculations. This actually limits the maximum
4307 ++ * service allowed in one timestamp delta (small shift values increase it),
4308 ++ * the maximum total weight that can be used for the queues in the system
4309 ++ * (big shift values increase it), and the period of virtual time wraparounds.
4310 ++ */
4311 ++#define WFQ_SERVICE_SHIFT 22
4312 ++
4313 ++/**
4314 ++ * bfq_gt - compare two timestamps.
4315 ++ * @a: first ts.
4316 ++ * @b: second ts.
4317 ++ *
4318 ++ * Return @a > @b, dealing with wrapping correctly.
4319 ++ */
4320 ++static inline int bfq_gt(u64 a, u64 b)
4321 ++{
4322 ++ return (s64)(a - b) > 0;
4323 ++}
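/*
 * Illustrative aside: bfq_gt() compares possibly wrapping 64-bit
 * timestamps through the sign of their unsigned difference, the same
 * idiom the kernel uses for jiffies in time_after(). With values chosen
 * only for illustration:
 *
 *	a = 0x0000000000000001ULL   (just after a wraparound)
 *	b = 0xffffffffffffffffULL   (just before the wraparound)
 *
 * a - b is 2 modulo 2^64, so (s64)(a - b) > 0 and bfq_gt(a, b) holds,
 * even though a < b as plain unsigned values. The comparison stays
 * correct as long as the two timestamps are less than 2^63 apart.
 */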
4324 ++
4325 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4326 ++{
4327 ++ struct bfq_queue *bfqq = NULL;
4328 ++
4329 ++ BUG_ON(entity == NULL);
4330 ++
4331 ++ if (entity->my_sched_data == NULL)
4332 ++ bfqq = container_of(entity, struct bfq_queue, entity);
4333 ++
4334 ++ return bfqq;
4335 ++}
4336 ++
4337 ++
4338 ++/**
4339 ++ * bfq_delta - map service into the virtual time domain.
4340 ++ * @service: amount of service.
4341 ++ * @weight: scale factor (weight of an entity or weight sum).
4342 ++ */
4343 ++static inline u64 bfq_delta(unsigned long service,
4344 ++ unsigned long weight)
4345 ++{
4346 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4347 ++
4348 ++ do_div(d, weight);
4349 ++ return d;
4350 ++}
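/*
 * Illustrative aside: bfq_delta() maps service into virtual time as
 * (service << WFQ_SERVICE_SHIFT) / weight. With WFQ_SERVICE_SHIFT == 22,
 * charging 8 sectors to an entity of weight 4 advances its timestamps by
 * (8 << 22) / 4 = 2^23 units, while the same 8 sectors cost an entity of
 * weight 8 only 2^22 units: a larger weight consumes virtual time more
 * slowly and therefore receives a proportionally larger share.
 */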
4351 ++
4352 ++/**
4353 ++ * bfq_calc_finish - assign the finish time to an entity.
4354 ++ * @entity: the entity to act upon.
4355 ++ * @service: the service to be charged to the entity.
4356 ++ */
4357 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
4358 ++ unsigned long service)
4359 ++{
4360 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4361 ++
4362 ++ BUG_ON(entity->weight == 0);
4363 ++
4364 ++ entity->finish = entity->start +
4365 ++ bfq_delta(service, entity->weight);
4366 ++
4367 ++ if (bfqq != NULL) {
4368 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
4369 ++ "calc_finish: serv %lu, w %d",
4370 ++ service, entity->weight);
4371 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
4372 ++ "calc_finish: start %llu, finish %llu, delta %llu",
4373 ++ entity->start, entity->finish,
4374 ++ bfq_delta(service, entity->weight));
4375 ++ }
4376 ++}
4377 ++
4378 ++/**
4379 ++ * bfq_entity_of - get an entity from a node.
4380 ++ * @node: the node field of the entity.
4381 ++ *
4382 ++ * Convert a node pointer to the corresponding entity. This is used only
4383 ++ * to simplify the logic of some functions and not as the generic
4384 ++ * conversion mechanism because, e.g., in the tree walking functions,
4385 ++ * the check for a %NULL value would be redundant.
4386 ++ */
4387 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4388 ++{
4389 ++ struct bfq_entity *entity = NULL;
4390 ++
4391 ++ if (node != NULL)
4392 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
4393 ++
4394 ++ return entity;
4395 ++}
4396 ++
4397 ++/**
4398 ++ * bfq_extract - remove an entity from a tree.
4399 ++ * @root: the tree root.
4400 ++ * @entity: the entity to remove.
4401 ++ */
4402 ++static inline void bfq_extract(struct rb_root *root,
4403 ++ struct bfq_entity *entity)
4404 ++{
4405 ++ BUG_ON(entity->tree != root);
4406 ++
4407 ++ entity->tree = NULL;
4408 ++ rb_erase(&entity->rb_node, root);
4409 ++}
4410 ++
4411 ++/**
4412 ++ * bfq_idle_extract - extract an entity from the idle tree.
4413 ++ * @st: the service tree of the owning @entity.
4414 ++ * @entity: the entity being removed.
4415 ++ */
4416 ++static void bfq_idle_extract(struct bfq_service_tree *st,
4417 ++ struct bfq_entity *entity)
4418 ++{
4419 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4420 ++ struct rb_node *next;
4421 ++
4422 ++ BUG_ON(entity->tree != &st->idle);
4423 ++
4424 ++ if (entity == st->first_idle) {
4425 ++ next = rb_next(&entity->rb_node);
4426 ++ st->first_idle = bfq_entity_of(next);
4427 ++ }
4428 ++
4429 ++ if (entity == st->last_idle) {
4430 ++ next = rb_prev(&entity->rb_node);
4431 ++ st->last_idle = bfq_entity_of(next);
4432 ++ }
4433 ++
4434 ++ bfq_extract(&st->idle, entity);
4435 ++
4436 ++ if (bfqq != NULL)
4437 ++ list_del(&bfqq->bfqq_list);
4438 ++}
4439 ++
4440 ++/**
4441 ++ * bfq_insert - generic tree insertion.
4442 ++ * @root: tree root.
4443 ++ * @entity: entity to insert.
4444 ++ *
4445 ++ * This is used for the idle and the active tree, since they are both
4446 ++ * ordered by finish time.
4447 ++ */
4448 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4449 ++{
4450 ++ struct bfq_entity *entry;
4451 ++ struct rb_node **node = &root->rb_node;
4452 ++ struct rb_node *parent = NULL;
4453 ++
4454 ++ BUG_ON(entity->tree != NULL);
4455 ++
4456 ++ while (*node != NULL) {
4457 ++ parent = *node;
4458 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
4459 ++
4460 ++ if (bfq_gt(entry->finish, entity->finish))
4461 ++ node = &parent->rb_left;
4462 ++ else
4463 ++ node = &parent->rb_right;
4464 ++ }
4465 ++
4466 ++ rb_link_node(&entity->rb_node, parent, node);
4467 ++ rb_insert_color(&entity->rb_node, root);
4468 ++
4469 ++ entity->tree = root;
4470 ++}
4471 ++
4472 ++/**
4473 ++ * bfq_update_min - update the min_start field of an entity.
4474 ++ * @entity: the entity to update.
4475 ++ * @node: one of its children.
4476 ++ *
4477 ++ * This function is called when @entity may store an invalid value for
4478 ++ * min_start due to updates to the active tree. The function assumes
4479 ++ * that the subtree rooted at @node (which may be its left or its right
4480 ++ * child) has a valid min_start value.
4481 ++ */
4482 ++static inline void bfq_update_min(struct bfq_entity *entity,
4483 ++ struct rb_node *node)
4484 ++{
4485 ++ struct bfq_entity *child;
4486 ++
4487 ++ if (node != NULL) {
4488 ++ child = rb_entry(node, struct bfq_entity, rb_node);
4489 ++ if (bfq_gt(entity->min_start, child->min_start))
4490 ++ entity->min_start = child->min_start;
4491 ++ }
4492 ++}
4493 ++
4494 ++/**
4495 ++ * bfq_update_active_node - recalculate min_start.
4496 ++ * @node: the node to update.
4497 ++ *
4498 ++ * @node may have changed position or one of its children may have moved;
4499 ++ * this function updates its min_start value. The left and right subtrees
4500 ++ * are assumed to hold a correct min_start value.
4501 ++ */
4502 ++static inline void bfq_update_active_node(struct rb_node *node)
4503 ++{
4504 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4505 ++
4506 ++ entity->min_start = entity->start;
4507 ++ bfq_update_min(entity, node->rb_right);
4508 ++ bfq_update_min(entity, node->rb_left);
4509 ++}
4510 ++
4511 ++/**
4512 ++ * bfq_update_active_tree - update min_start for the whole active tree.
4513 ++ * @node: the starting node.
4514 ++ *
4515 ++ * @node must be the deepest modified node after an update. This function
4516 ++ * updates its min_start using the values held by its children, assuming
4517 ++ * that they did not change, and then updates all the nodes that may have
4518 ++ * changed in the path to the root. The only nodes that may have changed
4519 ++ * are the ones in the path or their siblings.
4520 ++ */
4521 ++static void bfq_update_active_tree(struct rb_node *node)
4522 ++{
4523 ++ struct rb_node *parent;
4524 ++
4525 ++up:
4526 ++ bfq_update_active_node(node);
4527 ++
4528 ++ parent = rb_parent(node);
4529 ++ if (parent == NULL)
4530 ++ return;
4531 ++
4532 ++ if (node == parent->rb_left && parent->rb_right != NULL)
4533 ++ bfq_update_active_node(parent->rb_right);
4534 ++ else if (parent->rb_left != NULL)
4535 ++ bfq_update_active_node(parent->rb_left);
4536 ++
4537 ++ node = parent;
4538 ++ goto up;
4539 ++}
4540 ++
4541 ++/**
4542 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
4543 ++ * @st: the service tree of the entity.
4544 ++ * @entity: the entity being inserted.
4545 ++ *
4546 ++ * The active tree is ordered by finish time, but an extra key is kept
4547 ++ * in each node, containing the minimum value for the start times of
4548 ++ * its children (and the node itself), so it's possible to search for
4549 ++ * the eligible node with the lowest finish time in logarithmic time.
4550 ++ */
4551 ++static void bfq_active_insert(struct bfq_service_tree *st,
4552 ++ struct bfq_entity *entity)
4553 ++{
4554 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4555 ++ struct rb_node *node = &entity->rb_node;
4556 ++
4557 ++ bfq_insert(&st->active, entity);
4558 ++
4559 ++ if (node->rb_left != NULL)
4560 ++ node = node->rb_left;
4561 ++ else if (node->rb_right != NULL)
4562 ++ node = node->rb_right;
4563 ++
4564 ++ bfq_update_active_tree(node);
4565 ++
4566 ++ if (bfqq != NULL)
4567 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4568 ++}
4569 ++
4570 ++/**
4571 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4572 ++ * @ioprio: the ioprio value to convert.
4573 ++ */
4574 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
4575 ++{
4576 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4577 ++ return IOPRIO_BE_NR - ioprio;
4578 ++}
4579 ++
4580 ++/**
4581 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4582 ++ * @weight: the weight value to convert.
4583 ++ *
4584 ++ * To preserve the old ioprio-only user interface as much as possible,
4585 ++ * 0 is used as an escape ioprio value for weights (numerically) equal
4586 ++ * to or larger than IOPRIO_BE_NR.
4587 ++ */
4588 ++static unsigned short bfq_weight_to_ioprio(int weight)
4589 ++{
4590 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4591 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4592 ++}
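/*
 * Illustrative aside: with IOPRIO_BE_NR == 8, the two helpers above map
 * ioprio 0 (highest best-effort priority) to weight 8 and ioprio 7
 * (lowest) to weight 1, and back again:
 *
 *	bfq_ioprio_to_weight(0) == 8	bfq_weight_to_ioprio(8) == 0
 *	bfq_ioprio_to_weight(7) == 1	bfq_weight_to_ioprio(1) == 7
 *
 * Weights numerically equal to or larger than IOPRIO_BE_NR all map to
 * the escape ioprio value 0.
 */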
4593 ++
4594 ++static inline void bfq_get_entity(struct bfq_entity *entity)
4595 ++{
4596 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4597 ++ struct bfq_sched_data *sd;
4598 ++
4599 ++ if (bfqq != NULL) {
4600 ++ sd = entity->sched_data;
4601 ++ atomic_inc(&bfqq->ref);
4602 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4603 ++ bfqq, atomic_read(&bfqq->ref));
4604 ++ }
4605 ++}
4606 ++
4607 ++/**
4608 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
4609 ++ * @node: the node being removed.
4610 ++ *
4611 ++ * Do the first step of an extraction in an rb tree, looking for the
4612 ++ * node that will replace @node, and returning the deepest node that
4613 ++ * the following modifications to the tree can touch. If @node is the
4614 ++ * last node in the tree return %NULL.
4615 ++ */
4616 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
4617 ++{
4618 ++ struct rb_node *deepest;
4619 ++
4620 ++ if (node->rb_right == NULL && node->rb_left == NULL)
4621 ++ deepest = rb_parent(node);
4622 ++ else if (node->rb_right == NULL)
4623 ++ deepest = node->rb_left;
4624 ++ else if (node->rb_left == NULL)
4625 ++ deepest = node->rb_right;
4626 ++ else {
4627 ++ deepest = rb_next(node);
4628 ++ if (deepest->rb_right != NULL)
4629 ++ deepest = deepest->rb_right;
4630 ++ else if (rb_parent(deepest) != node)
4631 ++ deepest = rb_parent(deepest);
4632 ++ }
4633 ++
4634 ++ return deepest;
4635 ++}
4636 ++
4637 ++/**
4638 ++ * bfq_active_extract - remove an entity from the active tree.
4639 ++ * @st: the service_tree containing the tree.
4640 ++ * @entity: the entity being removed.
4641 ++ */
4642 ++static void bfq_active_extract(struct bfq_service_tree *st,
4643 ++ struct bfq_entity *entity)
4644 ++{
4645 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4646 ++ struct rb_node *node;
4647 ++
4648 ++ node = bfq_find_deepest(&entity->rb_node);
4649 ++ bfq_extract(&st->active, entity);
4650 ++
4651 ++ if (node != NULL)
4652 ++ bfq_update_active_tree(node);
4653 ++
4654 ++ if (bfqq != NULL)
4655 ++ list_del(&bfqq->bfqq_list);
4656 ++}
4657 ++
4658 ++/**
4659 ++ * bfq_idle_insert - insert an entity into the idle tree.
4660 ++ * @st: the service tree containing the tree.
4661 ++ * @entity: the entity to insert.
4662 ++ */
4663 ++static void bfq_idle_insert(struct bfq_service_tree *st,
4664 ++ struct bfq_entity *entity)
4665 ++{
4666 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4667 ++ struct bfq_entity *first_idle = st->first_idle;
4668 ++ struct bfq_entity *last_idle = st->last_idle;
4669 ++
4670 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
4671 ++ st->first_idle = entity;
4672 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
4673 ++ st->last_idle = entity;
4674 ++
4675 ++ bfq_insert(&st->idle, entity);
4676 ++
4677 ++ if (bfqq != NULL)
4678 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
4679 ++}
4680 ++
4681 ++/**
4682 ++ * bfq_forget_entity - remove an entity from the wfq trees.
4683 ++ * @st: the service tree.
4684 ++ * @entity: the entity being removed.
4685 ++ *
4686 ++ * Update the device status and forget everything about @entity, putting
4687 ++ * the device reference to it, if it is a queue. Entities belonging to
4688 ++ * groups are not refcounted.
4689 ++ */
4690 ++static void bfq_forget_entity(struct bfq_service_tree *st,
4691 ++ struct bfq_entity *entity)
4692 ++{
4693 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4694 ++ struct bfq_sched_data *sd;
4695 ++
4696 ++ BUG_ON(!entity->on_st);
4697 ++
4698 ++ entity->on_st = 0;
4699 ++ st->wsum -= entity->weight;
4700 ++ if (bfqq != NULL) {
4701 ++ sd = entity->sched_data;
4702 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
4703 ++ bfqq, atomic_read(&bfqq->ref));
4704 ++ bfq_put_queue(bfqq);
4705 ++ }
4706 ++}
4707 ++
4708 ++/**
4709 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
4710 ++ * @st: service tree for the entity.
4711 ++ * @entity: the entity being released.
4712 ++ */
4713 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
4714 ++ struct bfq_entity *entity)
4715 ++{
4716 ++ bfq_idle_extract(st, entity);
4717 ++ bfq_forget_entity(st, entity);
4718 ++}
4719 ++
4720 ++/**
4721 ++ * bfq_forget_idle - update the idle tree if necessary.
4722 ++ * @st: the service tree to act upon.
4723 ++ *
4724 ++ * To preserve the global O(log N) complexity we only remove one entry here;
4725 ++ * as the idle tree will not grow indefinitely this can be done safely.
4726 ++ */
4727 ++static void bfq_forget_idle(struct bfq_service_tree *st)
4728 ++{
4729 ++ struct bfq_entity *first_idle = st->first_idle;
4730 ++ struct bfq_entity *last_idle = st->last_idle;
4731 ++
4732 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
4733 ++ !bfq_gt(last_idle->finish, st->vtime)) {
4734 ++ /*
4735 ++ * Forget the whole idle tree, increasing the vtime past
4736 ++ * the last finish time of idle entities.
4737 ++ */
4738 ++ st->vtime = last_idle->finish;
4739 ++ }
4740 ++
4741 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
4742 ++ bfq_put_idle_entity(st, first_idle);
4743 ++}
4744 ++
4745 ++static struct bfq_service_tree *
4746 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
4747 ++ struct bfq_entity *entity)
4748 ++{
4749 ++ struct bfq_service_tree *new_st = old_st;
4750 ++
4751 ++ if (entity->ioprio_changed) {
4752 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4753 ++
4754 ++ BUG_ON(old_st->wsum < entity->weight);
4755 ++ old_st->wsum -= entity->weight;
4756 ++
4757 ++ if (entity->new_weight != entity->orig_weight) {
4758 ++ entity->orig_weight = entity->new_weight;
4759 ++ entity->ioprio =
4760 ++ bfq_weight_to_ioprio(entity->orig_weight);
4761 ++ } else if (entity->new_ioprio != entity->ioprio) {
4762 ++ entity->ioprio = entity->new_ioprio;
4763 ++ entity->orig_weight =
4764 ++ bfq_ioprio_to_weight(entity->ioprio);
4765 ++ } else
4766 ++ entity->new_weight = entity->orig_weight =
4767 ++ bfq_ioprio_to_weight(entity->ioprio);
4768 ++
4769 ++ entity->ioprio_class = entity->new_ioprio_class;
4770 ++ entity->ioprio_changed = 0;
4771 ++
4772 ++ /*
4773 ++		 * NOTE: here we may be changing the weight too early;
4774 ++ * this will cause unfairness. The correct approach
4775 ++ * would have required additional complexity to defer
4776 ++ * weight changes to the proper time instants (i.e.,
4777 ++ * when entity->finish <= old_st->vtime).
4778 ++ */
4779 ++ new_st = bfq_entity_service_tree(entity);
4780 ++ entity->weight = entity->orig_weight *
4781 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
4782 ++ new_st->wsum += entity->weight;
4783 ++
4784 ++ if (new_st != old_st)
4785 ++ entity->start = new_st->vtime;
4786 ++ }
4787 ++
4788 ++ return new_st;
4789 ++}
4790 ++
4791 ++/**
4792 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
4793 ++ * @bfqq: the queue being served.
4794 ++ * @served: bytes to transfer.
4795 ++ *
4796 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
4797 ++ * are synchronized every time a new bfqq is selected for service. For now,
4798 ++ * we keep it to better check consistency.
4799 ++ */
4800 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
4801 ++{
4802 ++ struct bfq_entity *entity = &bfqq->entity;
4803 ++ struct bfq_service_tree *st;
4804 ++
4805 ++ for_each_entity(entity) {
4806 ++ st = bfq_entity_service_tree(entity);
4807 ++
4808 ++ entity->service += served;
4809 ++ BUG_ON(entity->service > entity->budget);
4810 ++ BUG_ON(st->wsum == 0);
4811 ++
4812 ++ st->vtime += bfq_delta(served, st->wsum);
4813 ++ bfq_forget_idle(st);
4814 ++ }
4815 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
4816 ++}
4817 ++
4818 ++/**
4819 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
4820 ++ * @bfqq: the queue that needs a service update.
4821 ++ *
4822 ++ * When it's not possible to be fair in the service domain, because
4823 ++ * a queue is not consuming its budget fast enough (the meaning of
4824 ++ * fast depends on the timeout parameter), we charge it a full
4825 ++ * budget. In this way we should obtain a sort of time-domain
4826 ++ * fairness among all the seeky/slow queues.
4827 ++ */
4828 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
4829 ++{
4830 ++ struct bfq_entity *entity = &bfqq->entity;
4831 ++
4832 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
4833 ++
4834 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
4835 ++}
4836 ++
4837 ++/**
4838 ++ * __bfq_activate_entity - activate an entity.
4839 ++ * @entity: the entity being activated.
4840 ++ *
4841 ++ * Called whenever an entity is activated, i.e., it is not active and one
4842 ++ * of its children receives a new request, or has to be reactivated due to
4843 ++ * budget exhaustion. It uses the current budget of the entity (and the
4844 ++ * service received if @entity is active) of the queue to calculate its
4845 ++ * timestamps.
4846 ++ */
4847 ++static void __bfq_activate_entity(struct bfq_entity *entity)
4848 ++{
4849 ++ struct bfq_sched_data *sd = entity->sched_data;
4850 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
4851 ++
4852 ++ if (entity == sd->active_entity) {
4853 ++ BUG_ON(entity->tree != NULL);
4854 ++ /*
4855 ++		 * If we are requeueing the current entity, we have
4856 ++		 * to take care not to charge it for service it has
4857 ++ * not received.
4858 ++ */
4859 ++ bfq_calc_finish(entity, entity->service);
4860 ++ entity->start = entity->finish;
4861 ++ sd->active_entity = NULL;
4862 ++ } else if (entity->tree == &st->active) {
4863 ++ /*
4864 ++ * Requeueing an entity due to a change of some
4865 ++ * next_active entity below it. We reuse the old
4866 ++ * start time.
4867 ++ */
4868 ++ bfq_active_extract(st, entity);
4869 ++ } else if (entity->tree == &st->idle) {
4870 ++ /*
4871 ++ * Must be on the idle tree, bfq_idle_extract() will
4872 ++ * check for that.
4873 ++ */
4874 ++ bfq_idle_extract(st, entity);
4875 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
4876 ++ st->vtime : entity->finish;
4877 ++ } else {
4878 ++ /*
4879 ++ * The finish time of the entity may be invalid, and
4880 ++		 * it is certainly in the past; otherwise the queue
4881 ++ * would have been on the idle tree.
4882 ++ */
4883 ++ entity->start = st->vtime;
4884 ++ st->wsum += entity->weight;
4885 ++ bfq_get_entity(entity);
4886 ++
4887 ++ BUG_ON(entity->on_st);
4888 ++ entity->on_st = 1;
4889 ++ }
4890 ++
4891 ++ st = __bfq_entity_update_weight_prio(st, entity);
4892 ++ bfq_calc_finish(entity, entity->budget);
4893 ++ bfq_active_insert(st, entity);
4894 ++}
4895 ++
4896 ++/**
4897 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
4898 ++ * @entity: the entity to activate.
4899 ++ *
4900 ++ * Activate @entity and all the entities on the path from it to the root.
4901 ++ */
4902 ++static void bfq_activate_entity(struct bfq_entity *entity)
4903 ++{
4904 ++ struct bfq_sched_data *sd;
4905 ++
4906 ++ for_each_entity(entity) {
4907 ++ __bfq_activate_entity(entity);
4908 ++
4909 ++ sd = entity->sched_data;
4910 ++ if (!bfq_update_next_active(sd))
4911 ++ /*
4912 ++ * No need to propagate the activation to the
4913 ++ * upper entities, as they will be updated when
4914 ++ * the active entity is rescheduled.
4915 ++ */
4916 ++ break;
4917 ++ }
4918 ++}
4919 ++
4920 ++/**
4921 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
4922 ++ * @entity: the entity to deactivate.
4923 ++ * @requeue: if false, the entity will not be put into the idle tree.
4924 ++ *
4925 ++ * Deactivate an entity, independently from its previous state. If the
4926 ++ * entity was not on a service tree just return, otherwise if it is on
4927 ++ * any scheduler tree, extract it from that tree, and if necessary
4928 ++ * and if the caller did not specify @requeue, put it on the idle tree.
4929 ++ *
4930 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
4931 ++ * if the entity was under service or if it was the next_active for
4932 ++ * its sched_data; return %0 otherwise.
4933 ++ */
4934 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
4935 ++{
4936 ++ struct bfq_sched_data *sd = entity->sched_data;
4937 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
4938 ++ int was_active = entity == sd->active_entity;
4939 ++ int ret = 0;
4940 ++
4941 ++ if (!entity->on_st)
4942 ++ return 0;
4943 ++
4944 ++ BUG_ON(was_active && entity->tree != NULL);
4945 ++
4946 ++ if (was_active) {
4947 ++ bfq_calc_finish(entity, entity->service);
4948 ++ sd->active_entity = NULL;
4949 ++ } else if (entity->tree == &st->active)
4950 ++ bfq_active_extract(st, entity);
4951 ++ else if (entity->tree == &st->idle)
4952 ++ bfq_idle_extract(st, entity);
4953 ++ else if (entity->tree != NULL)
4954 ++ BUG();
4955 ++
4956 ++ if (was_active || sd->next_active == entity)
4957 ++ ret = bfq_update_next_active(sd);
4958 ++
4959 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
4960 ++ bfq_forget_entity(st, entity);
4961 ++ else
4962 ++ bfq_idle_insert(st, entity);
4963 ++
4964 ++ BUG_ON(sd->active_entity == entity);
4965 ++ BUG_ON(sd->next_active == entity);
4966 ++
4967 ++ return ret;
4968 ++}
4969 ++
4970 ++/**
4971 ++ * bfq_deactivate_entity - deactivate an entity.
4972 ++ * @entity: the entity to deactivate.
4973 ++ * @requeue: true if the entity can be put on the idle tree
4974 ++ */
4975 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
4976 ++{
4977 ++ struct bfq_sched_data *sd;
4978 ++ struct bfq_entity *parent;
4979 ++
4980 ++ for_each_entity_safe(entity, parent) {
4981 ++ sd = entity->sched_data;
4982 ++
4983 ++ if (!__bfq_deactivate_entity(entity, requeue))
4984 ++ /*
4985 ++ * The parent entity is still backlogged, and
4986 ++ * we don't need to update it as it is still
4987 ++ * under service.
4988 ++ */
4989 ++ break;
4990 ++
4991 ++ if (sd->next_active != NULL)
4992 ++ /*
4993 ++ * The parent entity is still backlogged and
4994 ++ * the budgets on the path towards the root
4995 ++ * need to be updated.
4996 ++ */
4997 ++ goto update;
4998 ++
4999 ++ /*
5000 ++	 * If we reach this point, the parent is no longer backlogged and
5001 ++ * we want to propagate the dequeue upwards.
5002 ++ */
5003 ++ requeue = 1;
5004 ++ }
5005 ++
5006 ++ return;
5007 ++
5008 ++update:
5009 ++ entity = parent;
5010 ++ for_each_entity(entity) {
5011 ++ __bfq_activate_entity(entity);
5012 ++
5013 ++ sd = entity->sched_data;
5014 ++ if (!bfq_update_next_active(sd))
5015 ++ break;
5016 ++ }
5017 ++}
5018 ++
5019 ++/**
5020 ++ * bfq_update_vtime - update vtime if necessary.
5021 ++ * @st: the service tree to act upon.
5022 ++ *
5023 ++ * If necessary update the service tree vtime to have at least one
5024 ++ * eligible entity, skipping to its start time. Assumes that the
5025 ++ * active tree of the device is not empty.
5026 ++ *
5027 ++ * NOTE: this hierarchical implementation updates vtimes quite often,
5028 ++ * so we may end up with reactivated tasks getting timestamps after a
5029 ++ * vtime skip done because we needed a ->first_active entity on some
5030 ++ * intermediate node.
5031 ++ */
5032 ++static void bfq_update_vtime(struct bfq_service_tree *st)
5033 ++{
5034 ++ struct bfq_entity *entry;
5035 ++ struct rb_node *node = st->active.rb_node;
5036 ++
5037 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
5038 ++ if (bfq_gt(entry->min_start, st->vtime)) {
5039 ++ st->vtime = entry->min_start;
5040 ++ bfq_forget_idle(st);
5041 ++ }
5042 ++}
5043 ++
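A concrete example of the vtime skip described above, with made-up numbers:

	/* Hypothetical values:
	 *   st->vtime        = 100
	 *   entry->min_start = 140  (smallest start among active entities)
	 * bfq_gt(140, 100) is true, so vtime jumps to 140 and the earliest
	 * entity becomes eligible; had min_start been 90, vtime would be
	 * left untouched because an eligible entity already exists.
	 */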
5044 ++/**
5045 ++ * bfq_first_active - find the eligible entity with the smallest finish time
5046 ++ * @st: the service tree to select from.
5047 ++ *
5048 ++ * This function searches for the first schedulable entity, starting from
5049 ++ * the root of the tree and going left whenever the left subtree contains
5050 ++ * at least one eligible (start <= vtime) entity. The path
5051 ++ * on the right is followed only if a) the left subtree contains no eligible
5052 ++ * entities and b) no eligible entity has been found yet.
5053 ++ */
5054 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5055 ++{
5056 ++ struct bfq_entity *entry, *first = NULL;
5057 ++ struct rb_node *node = st->active.rb_node;
5058 ++
5059 ++ while (node != NULL) {
5060 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
5061 ++left:
5062 ++ if (!bfq_gt(entry->start, st->vtime))
5063 ++ first = entry;
5064 ++
5065 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5066 ++
5067 ++ if (node->rb_left != NULL) {
5068 ++ entry = rb_entry(node->rb_left,
5069 ++ struct bfq_entity, rb_node);
5070 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
5071 ++ node = node->rb_left;
5072 ++ goto left;
5073 ++ }
5074 ++ }
5075 ++ if (first != NULL)
5076 ++ break;
5077 ++ node = node->rb_right;
5078 ++ }
5079 ++
5080 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5081 ++ return first;
5082 ++}
5083 ++
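The walk above leans on the augmented-rbtree invariant behind @min_start, documented for struct bfq_entity in bfq.h below; roughly:

	/* Invariant assumed by the search (sketch, not literal patch code):
	 *   node->min_start == min(node->start,
	 *                          node->rb_left->min_start,
	 *                          node->rb_right->min_start)
	 * Hence "left child's min_start <= vtime" already proves that the
	 * left subtree holds at least one eligible entity and is worth
	 * descending into.
	 */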
5084 ++/**
5085 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5086 ++ * @st: the service tree.
5087 ++ *
5088 ++ * Update the virtual time in @st and return the first eligible entity
5089 ++ * it contains.
5090 ++ */
5091 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5092 ++ bool force)
5093 ++{
5094 ++ struct bfq_entity *entity, *new_next_active = NULL;
5095 ++
5096 ++ if (RB_EMPTY_ROOT(&st->active))
5097 ++ return NULL;
5098 ++
5099 ++ bfq_update_vtime(st);
5100 ++ entity = bfq_first_active_entity(st);
5101 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
5102 ++
5103 ++ /*
5104 ++ * If the chosen entity does not match with the sched_data's
5105 ++	 * next_active and we are forcibly serving the IDLE priority
5106 ++ * class tree, bubble up budget update.
5107 ++ */
5108 ++ if (unlikely(force && entity != entity->sched_data->next_active)) {
5109 ++ new_next_active = entity;
5110 ++ for_each_entity(new_next_active)
5111 ++ bfq_update_budget(new_next_active);
5112 ++ }
5113 ++
5114 ++ return entity;
5115 ++}
5116 ++
5117 ++/**
5118 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5119 ++ * @sd: the sched_data.
5120 ++ * @extract: if true the returned entity will be also extracted from @sd.
5121 ++ *
5122 ++ * NOTE: since we cache the next_active entity at each level of the
5123 ++ * hierarchy, the complexity of the lookup can be decreased with
5124 ++ * absolutely no effort just returning the cached next_active value;
5125 ++ * we prefer to do full lookups to test the consistency of the data
5126 ++ * structures.
5127 ++ */
5128 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5129 ++ int extract,
5130 ++ struct bfq_data *bfqd)
5131 ++{
5132 ++ struct bfq_service_tree *st = sd->service_tree;
5133 ++ struct bfq_entity *entity;
5134 ++	int i = 0;
5135 ++
5136 ++ BUG_ON(sd->active_entity != NULL);
5137 ++
5138 ++ if (bfqd != NULL &&
5139 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5140 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true);
5141 ++ if (entity != NULL) {
5142 ++ i = BFQ_IOPRIO_CLASSES - 1;
5143 ++ bfqd->bfq_class_idle_last_service = jiffies;
5144 ++ sd->next_active = entity;
5145 ++ }
5146 ++ }
5147 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5148 ++ entity = __bfq_lookup_next_entity(st + i, false);
5149 ++ if (entity != NULL) {
5150 ++ if (extract) {
5151 ++ bfq_check_next_active(sd, entity);
5152 ++ bfq_active_extract(st + i, entity);
5153 ++ sd->active_entity = entity;
5154 ++ sd->next_active = NULL;
5155 ++ }
5156 ++ break;
5157 ++ }
5158 ++ }
5159 ++
5160 ++ return entity;
5161 ++}
5162 ++
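As a concrete reading of the first branch above: BFQ_CL_IDLE_TIMEOUT is defined as HZ/5 in bfq.h below, so the IDLE class is force-served at most roughly once every 200 ms, which keeps it from being starved forever by RT and BE queues.

	/* Sketch of the anti-starvation check (same logic as above):
	 *   if (jiffies - bfqd->bfq_class_idle_last_service > HZ / 5)
	 *           force-serve one entity from the IDLE service tree;
	 * i.e. about once per 200 ms, independently of the HZ value.
	 */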
5163 ++/*
5164 ++ * Get next queue for service.
5165 ++ */
5166 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5167 ++{
5168 ++ struct bfq_entity *entity = NULL;
5169 ++ struct bfq_sched_data *sd;
5170 ++ struct bfq_queue *bfqq;
5171 ++
5172 ++ BUG_ON(bfqd->active_queue != NULL);
5173 ++
5174 ++ if (bfqd->busy_queues == 0)
5175 ++ return NULL;
5176 ++
5177 ++ sd = &bfqd->root_group->sched_data;
5178 ++ for (; sd != NULL; sd = entity->my_sched_data) {
5179 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5180 ++ BUG_ON(entity == NULL);
5181 ++ entity->service = 0;
5182 ++ }
5183 ++
5184 ++ bfqq = bfq_entity_to_bfqq(entity);
5185 ++ BUG_ON(bfqq == NULL);
5186 ++
5187 ++ return bfqq;
5188 ++}
5189 ++
5190 ++/*
5191 ++ * Forced extraction of the given queue.
5192 ++ */
5193 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5194 ++ struct bfq_queue *bfqq)
5195 ++{
5196 ++ struct bfq_entity *entity;
5197 ++ struct bfq_sched_data *sd;
5198 ++
5199 ++ BUG_ON(bfqd->active_queue != NULL);
5200 ++
5201 ++ entity = &bfqq->entity;
5202 ++ /*
5203 ++ * Bubble up extraction/update from the leaf to the root.
5204 ++ */
5205 ++ for_each_entity(entity) {
5206 ++ sd = entity->sched_data;
5207 ++ bfq_update_budget(entity);
5208 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
5209 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5210 ++ sd->active_entity = entity;
5211 ++ sd->next_active = NULL;
5212 ++ entity->service = 0;
5213 ++ }
5214 ++
5215 ++ return;
5216 ++}
5217 ++
5218 ++static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
5219 ++{
5220 ++ if (bfqd->active_bic != NULL) {
5221 ++ put_io_context(bfqd->active_bic->icq.ioc);
5222 ++ bfqd->active_bic = NULL;
5223 ++ }
5224 ++
5225 ++ bfqd->active_queue = NULL;
5226 ++ del_timer(&bfqd->idle_slice_timer);
5227 ++}
5228 ++
5229 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5230 ++ int requeue)
5231 ++{
5232 ++ struct bfq_entity *entity = &bfqq->entity;
5233 ++
5234 ++ if (bfqq == bfqd->active_queue)
5235 ++ __bfq_bfqd_reset_active(bfqd);
5236 ++
5237 ++ bfq_deactivate_entity(entity, requeue);
5238 ++}
5239 ++
5240 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5241 ++{
5242 ++ struct bfq_entity *entity = &bfqq->entity;
5243 ++
5244 ++ bfq_activate_entity(entity);
5245 ++}
5246 ++
5247 ++/*
5248 ++ * Called when the bfqq no longer has requests pending, remove it from
5249 ++ * the service tree.
5250 ++ */
5251 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5252 ++ int requeue)
5253 ++{
5254 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
5255 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5256 ++
5257 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5258 ++
5259 ++ bfq_clear_bfqq_busy(bfqq);
5260 ++
5261 ++ BUG_ON(bfqd->busy_queues == 0);
5262 ++ bfqd->busy_queues--;
5263 ++
5264 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5265 ++}
5266 ++
5267 ++/*
5268 ++ * Called when an inactive queue receives a new request.
5269 ++ */
5270 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5271 ++{
5272 ++ BUG_ON(bfq_bfqq_busy(bfqq));
5273 ++ BUG_ON(bfqq == bfqd->active_queue);
5274 ++
5275 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5276 ++
5277 ++ bfq_activate_bfqq(bfqd, bfqq);
5278 ++
5279 ++ bfq_mark_bfqq_busy(bfqq);
5280 ++ bfqd->busy_queues++;
5281 ++}
5282 +diff --git a/block/bfq.h b/block/bfq.h
5283 +new file mode 100644
5284 +index 0000000..b4e9ab0
5285 +--- /dev/null
5286 ++++ b/block/bfq.h
5287 +@@ -0,0 +1,603 @@
5288 ++/*
5289 ++ * BFQ-v6r2 for 3.9.0: data structures and common functions prototypes.
5290 ++ *
5291 ++ * Based on ideas and code from CFQ:
5292 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5293 ++ *
5294 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5295 ++ * Paolo Valente <paolo.valente@×××××××.it>
5296 ++ *
5297 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5298 ++ */
5299 ++
5300 ++#ifndef _BFQ_H
5301 ++#define _BFQ_H
5302 ++
5303 ++#include <linux/blktrace_api.h>
5304 ++#include <linux/hrtimer.h>
5305 ++#include <linux/ioprio.h>
5306 ++#include <linux/rbtree.h>
5307 ++
5308 ++#define BFQ_IOPRIO_CLASSES 3
5309 ++#define BFQ_CL_IDLE_TIMEOUT HZ/5
5310 ++
5311 ++#define BFQ_MIN_WEIGHT 1
5312 ++#define BFQ_MAX_WEIGHT 1000
5313 ++
5314 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
5315 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
5316 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5317 ++
5318 ++struct bfq_entity;
5319 ++
5320 ++/**
5321 ++ * struct bfq_service_tree - per ioprio_class service tree.
5322 ++ * @active: tree for active entities (i.e., those backlogged).
5323 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5324 ++ * @first_idle: idle entity with minimum F_i.
5325 ++ * @last_idle: idle entity with maximum F_i.
5326 ++ * @vtime: scheduler virtual time.
5327 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5328 ++ *
5329 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5330 ++ * ioprio_class has its own independent scheduler, and so its own
5331 ++ * bfq_service_tree. All the fields are protected by the queue lock
5332 ++ * of the containing bfqd.
5333 ++ */
5334 ++struct bfq_service_tree {
5335 ++ struct rb_root active;
5336 ++ struct rb_root idle;
5337 ++
5338 ++ struct bfq_entity *first_idle;
5339 ++ struct bfq_entity *last_idle;
5340 ++
5341 ++ u64 vtime;
5342 ++ unsigned long wsum;
5343 ++};
5344 ++
5345 ++/**
5346 ++ * struct bfq_sched_data - multi-class scheduler.
5347 ++ * @active_entity: entity under service.
5348 ++ * @next_active: head-of-the-line entity in the scheduler.
5349 ++ * @service_tree: array of service trees, one per ioprio_class.
5350 ++ *
5351 ++ * bfq_sched_data is the basic scheduler queue. It supports three
5352 ++ * ioprio_classes, and can be used either as a toplevel queue or as
5353 ++ * an intermediate queue on a hierarchical setup.
5354 ++ * @next_active points to the active entity of the sched_data service
5355 ++ * trees that will be scheduled next.
5356 ++ *
5357 ++ * The supported ioprio_classes are the same as in CFQ, in descending
5358 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5359 ++ * Requests from higher priority queues are served before all the
5360 ++ * requests from lower priority queues; within the same queue,
5361 ++ * requests are served according to B-WF2Q+.
5362 ++ * All the fields are protected by the queue lock of the containing bfqd.
5363 ++ */
5364 ++struct bfq_sched_data {
5365 ++ struct bfq_entity *active_entity;
5366 ++ struct bfq_entity *next_active;
5367 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5368 ++};
5369 ++
5370 ++/**
5371 ++ * struct bfq_entity - schedulable entity.
5372 ++ * @rb_node: service_tree member.
5373 ++ * @on_st: flag, true if the entity is on a tree (either the active or
5374 ++ * the idle one of its service_tree).
5375 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5376 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
5377 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5378 ++ * @min_start: minimum start time of the (active) subtree rooted at
5379 ++ * this entity; used for O(log N) lookups into active trees.
5380 ++ * @service: service received during the last round of service.
5381 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5382 ++ * @weight: weight of the queue
5383 ++ * @parent: parent entity, for hierarchical scheduling.
5384 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5385 ++ * associated scheduler queue, %NULL on leaf nodes.
5386 ++ * @sched_data: the scheduler queue this entity belongs to.
5387 ++ * @ioprio: the ioprio in use.
5388 ++ * @new_weight: when a weight change is requested, the new weight value.
5389 ++ * @orig_weight: original weight, used to implement weight boosting
5390 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5391 ++ * @ioprio_class: the ioprio_class in use.
5392 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
5393 ++ * ioprio_class value.
5394 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5395 ++ * ioprio_class change.
5396 ++ *
5397 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5398 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5399 ++ * entity belongs to the sched_data of the parent group in the cgroup
5400 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
5401 ++ * in @my_sched_data.
5402 ++ *
5403 ++ * Each entity stores independently its priority values; this would
5404 ++ * allow different weights on different devices, but this
5405 ++ * functionality is not exported to userspace by now. Priorities and
5406 ++ * weights are updated lazily, first storing the new values into the
5407 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
5408 ++ * there is a transition in the entity state that allows the priority
5409 ++ * update to take place the effective and the requested priority
5410 ++ * values are synchronized.
5411 ++ *
5412 ++ * Unless cgroups are used, the weight value is calculated from the
5413 ++ * ioprio to export the same interface as CFQ. When dealing with
5414 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
5415 ++ * time consuming their budget and have true sequential behavior, and
5416 ++ * when there are no external factors breaking anticipation) the
5417 ++ * relative weights at each level of the cgroups hierarchy should be
5418 ++ * guaranteed. All the fields are protected by the queue lock of the
5419 ++ * containing bfqd.
5420 ++ */
5421 ++struct bfq_entity {
5422 ++ struct rb_node rb_node;
5423 ++
5424 ++ int on_st;
5425 ++
5426 ++ u64 finish;
5427 ++ u64 start;
5428 ++
5429 ++ struct rb_root *tree;
5430 ++
5431 ++ u64 min_start;
5432 ++
5433 ++ unsigned long service, budget;
5434 ++ unsigned short weight, new_weight;
5435 ++ unsigned short orig_weight;
5436 ++
5437 ++ struct bfq_entity *parent;
5438 ++
5439 ++ struct bfq_sched_data *my_sched_data;
5440 ++ struct bfq_sched_data *sched_data;
5441 ++
5442 ++ unsigned short ioprio, new_ioprio;
5443 ++ unsigned short ioprio_class, new_ioprio_class;
5444 ++
5445 ++ int ioprio_changed;
5446 ++};
5447 ++
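A minimal sketch of the lazy weight/ioprio update protocol described above; the two helpers are hypothetical stand-ins for what __bfq_entity_update_weight_prio() does at a safe state change (the real code, as an assumption, also folds in ioprio, ioprio_class and the tree's weight sum):

	/* Hypothetical helpers, for illustration only. */
	static inline void sketch_request_weight_change(struct bfq_entity *entity,
							unsigned short weight)
	{
		entity->new_weight = weight;	/* park the requested value ... */
		entity->ioprio_changed = 1;	/* ... and flag the change */
	}

	static inline void sketch_apply_pending_change(struct bfq_entity *entity)
	{
		if (entity->ioprio_changed) {	/* applied lazily, later */
			entity->weight = entity->new_weight;
			entity->ioprio_changed = 0;
		}
	}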
5448 ++struct bfq_group;
5449 ++
5450 ++/**
5451 ++ * struct bfq_queue - leaf schedulable entity.
5452 ++ * @ref: reference counter.
5453 ++ * @bfqd: parent bfq_data.
5454 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
5455 ++ * one or more other queues.
5456 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5457 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5458 ++ * @sort_list: sorted list of pending requests.
5459 ++ * @next_rq: if fifo isn't expired, next request to serve.
5460 ++ * @queued: nr of requests queued in @sort_list.
5461 ++ * @allocated: currently allocated requests.
5462 ++ * @meta_pending: pending metadata requests.
5463 ++ * @fifo: fifo list of requests in sort_list.
5464 ++ * @entity: entity representing this queue in the scheduler.
5465 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
5466 ++ * @budget_timeout: budget expiration (in jiffies).
5467 ++ * @dispatched: number of requests on the dispatch list or inside driver.
5468 ++ * @org_ioprio: saved ioprio during boosted periods.
5469 ++ * @flags: status flags.
5470 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5471 ++ * @seek_samples: number of seeks sampled
5472 ++ * @seek_total: sum of the distances of the seeks sampled
5473 ++ * @seek_mean: mean seek distance
5474 ++ * @last_request_pos: position of the last request enqueued
5475 ++ * @pid: pid of the process owning the queue, used for logging purposes.
5476 ++ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
5477 ++ * @raising_cur_max_time: current max raising time for this queue
5478 ++ *
5479 ++ * A bfq_queue is a leaf request queue; it can be associated with one
5480 ++ * io_context or more (if it is an async one). @cgroup holds a reference to the
5481 ++ * cgroup, to be sure that it does not disappear while a bfqq still
5482 ++ * references it (mostly to avoid races between request issuing and task
5483 ++ * migration followed by cgroup destruction).
5484 ++ * All the fields are protected by the queue lock of the containing bfqd.
5485 ++ */
5486 ++struct bfq_queue {
5487 ++ atomic_t ref;
5488 ++ struct bfq_data *bfqd;
5489 ++
5490 ++ /* fields for cooperating queues handling */
5491 ++ struct bfq_queue *new_bfqq;
5492 ++ struct rb_node pos_node;
5493 ++ struct rb_root *pos_root;
5494 ++
5495 ++ struct rb_root sort_list;
5496 ++ struct request *next_rq;
5497 ++ int queued[2];
5498 ++ int allocated[2];
5499 ++ int meta_pending;
5500 ++ struct list_head fifo;
5501 ++
5502 ++ struct bfq_entity entity;
5503 ++
5504 ++ unsigned long max_budget;
5505 ++ unsigned long budget_timeout;
5506 ++
5507 ++ int dispatched;
5508 ++
5509 ++ unsigned short org_ioprio;
5510 ++
5511 ++ unsigned int flags;
5512 ++
5513 ++ struct list_head bfqq_list;
5514 ++
5515 ++ unsigned int seek_samples;
5516 ++ u64 seek_total;
5517 ++ sector_t seek_mean;
5518 ++ sector_t last_request_pos;
5519 ++
5520 ++ pid_t pid;
5521 ++
5522 ++ /* weight-raising fields */
5523 ++ unsigned int raising_cur_max_time;
5524 ++ u64 last_rais_start_finish, soft_rt_next_start;
5525 ++ unsigned int raising_coeff;
5526 ++};
5527 ++
5528 ++/**
5529 ++ * struct bfq_ttime - per process thinktime stats.
5530 ++ * @ttime_total: total process thinktime
5531 ++ * @ttime_samples: number of thinktime samples
5532 ++ * @ttime_mean: average process thinktime
5533 ++ */
5534 ++struct bfq_ttime {
5535 ++ unsigned long last_end_request;
5536 ++
5537 ++ unsigned long ttime_total;
5538 ++ unsigned long ttime_samples;
5539 ++ unsigned long ttime_mean;
5540 ++};
5541 ++
5542 ++/**
5543 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
5544 ++ * @icq: associated io_cq structure
5545 ++ * @bfqq: array of two process queues, the sync and the async
5546 ++ * @ttime: associated @bfq_ttime struct
5547 ++ */
5548 ++struct bfq_io_cq {
5549 ++ struct io_cq icq; /* must be the first member */
5550 ++ struct bfq_queue *bfqq[2];
5551 ++ struct bfq_ttime ttime;
5552 ++ int ioprio;
5553 ++};
5554 ++
5555 ++/**
5556 ++ * struct bfq_data - per device data structure.
5557 ++ * @queue: request queue for the managed device.
5558 ++ * @root_group: root bfq_group for the device.
5559 ++ * @rq_pos_tree: rbtree sorted by next_request position,
5560 ++ * used when determining if two or more queues
5561 ++ * have interleaving requests (see bfq_close_cooperator).
5562 ++ * @busy_queues: number of bfq_queues containing requests (including the
5563 ++ * queue under service, even if it is idling).
5564 ++ * @queued: number of queued requests.
5565 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
5566 ++ * @sync_flight: number of sync requests in the driver.
5567 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5568 ++ *                   completed requests.
5569 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5570 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5571 ++ * @budgets_assigned: number of budgets assigned.
5572 ++ * @idle_slice_timer: timer set when idling for the next sequential request
5573 ++ * from the queue under service.
5574 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
5575 ++ * @active_queue: bfq_queue under service.
5576 ++ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
5577 ++ * @last_position: on-disk position of the last served request.
5578 ++ * @last_budget_start: beginning of the last budget.
5579 ++ * @last_idling_start: beginning of the last idle slice.
5580 ++ * @peak_rate: peak transfer rate observed for a budget.
5581 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5582 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5583 ++ * @group_list: list of all the bfq_groups active on the device.
5584 ++ * @active_list: list of all the bfq_queues active on the device.
5585 ++ * @idle_list: list of all the bfq_queues idle on the device.
5586 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
5587 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5588 ++ * requests are served in fifo order.
5589 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5590 ++ * @bfq_back_max: maximum allowed backward seek.
5591 ++ * @bfq_slice_idle: maximum idling time.
5592 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5593 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5594 ++ * async queues.
5595 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5596 ++ *                 prevent seeky queues from imposing long latencies on well-
5597 ++ * behaved ones (this also implies that seeky queues cannot
5598 ++ * receive guarantees in the service domain; after a timeout
5599 ++ * they are charged for the whole allocated budget, to try
5600 ++ * to preserve a behavior reasonably fair among them, but
5601 ++ * without service-domain guarantees).
5602 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5603 ++ * queue is multiplied
5604 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5605 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5606 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5607 ++ * may be reactivated for a queue (in jiffies)
5608 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5609 ++ * after which weight-raising may be
5610 ++ * reactivated for an already busy queue
5611 ++ * (in jiffies)
5612 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5613 ++ *                                sectors per second
5614 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
5615 ++ * duration of the weight raising automatically
5616 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5617 ++ *
5618 ++ * All the fields are protected by the @queue lock.
5619 ++ */
5620 ++struct bfq_data {
5621 ++ struct request_queue *queue;
5622 ++
5623 ++ struct bfq_group *root_group;
5624 ++
5625 ++ struct rb_root rq_pos_tree;
5626 ++
5627 ++ int busy_queues;
5628 ++ int queued;
5629 ++ int rq_in_driver;
5630 ++ int sync_flight;
5631 ++
5632 ++ int max_rq_in_driver;
5633 ++ int hw_tag_samples;
5634 ++ int hw_tag;
5635 ++
5636 ++ int budgets_assigned;
5637 ++
5638 ++ struct timer_list idle_slice_timer;
5639 ++ struct work_struct unplug_work;
5640 ++
5641 ++ struct bfq_queue *active_queue;
5642 ++ struct bfq_io_cq *active_bic;
5643 ++
5644 ++ sector_t last_position;
5645 ++
5646 ++ ktime_t last_budget_start;
5647 ++ ktime_t last_idling_start;
5648 ++ int peak_rate_samples;
5649 ++ u64 peak_rate;
5650 ++ unsigned long bfq_max_budget;
5651 ++
5652 ++ struct hlist_head group_list;
5653 ++ struct list_head active_list;
5654 ++ struct list_head idle_list;
5655 ++
5656 ++ unsigned int bfq_quantum;
5657 ++ unsigned int bfq_fifo_expire[2];
5658 ++ unsigned int bfq_back_penalty;
5659 ++ unsigned int bfq_back_max;
5660 ++ unsigned int bfq_slice_idle;
5661 ++ u64 bfq_class_idle_last_service;
5662 ++
5663 ++ unsigned int bfq_user_max_budget;
5664 ++ unsigned int bfq_max_budget_async_rq;
5665 ++ unsigned int bfq_timeout[2];
5666 ++
5667 ++ bool low_latency;
5668 ++
5669 ++ /* parameters of the low_latency heuristics */
5670 ++ unsigned int bfq_raising_coeff;
5671 ++ unsigned int bfq_raising_max_time;
5672 ++ unsigned int bfq_raising_rt_max_time;
5673 ++ unsigned int bfq_raising_min_idle_time;
5674 ++ unsigned int bfq_raising_min_inter_arr_async;
5675 ++ unsigned int bfq_raising_max_softrt_rate;
5676 ++ u64 RT_prod;
5677 ++
5678 ++ struct bfq_queue oom_bfqq;
5679 ++};
5680 ++
5681 ++enum bfqq_state_flags {
5682 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
5683 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
5684 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
5685 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
5686 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
5687 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
5688 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
5689 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
5690 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
5691 ++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
5692 ++ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
5693 ++};
5694 ++
5695 ++#define BFQ_BFQQ_FNS(name) \
5696 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
5697 ++{ \
5698 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
5699 ++} \
5700 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
5701 ++{ \
5702 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
5703 ++} \
5704 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
5705 ++{ \
5706 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
5707 ++}
5708 ++
5709 ++BFQ_BFQQ_FNS(busy);
5710 ++BFQ_BFQQ_FNS(wait_request);
5711 ++BFQ_BFQQ_FNS(must_alloc);
5712 ++BFQ_BFQQ_FNS(fifo_expire);
5713 ++BFQ_BFQQ_FNS(idle_window);
5714 ++BFQ_BFQQ_FNS(prio_changed);
5715 ++BFQ_BFQQ_FNS(sync);
5716 ++BFQ_BFQQ_FNS(budget_new);
5717 ++BFQ_BFQQ_FNS(coop);
5718 ++BFQ_BFQQ_FNS(split_coop);
5719 ++BFQ_BFQQ_FNS(some_coop_idle);
5720 ++#undef BFQ_BFQQ_FNS
5721 ++
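For reference, what one expansion of the macro above provides, taking the busy flag used by bfq_add_bfqq_busy()/bfq_del_bfqq_busy() earlier in this patch as the example:

	/* BFQ_BFQQ_FNS(busy) generates:
	 *   bfq_mark_bfqq_busy(bfqq);    sets   bit BFQ_BFQQ_FLAG_busy in flags
	 *   bfq_clear_bfqq_busy(bfqq);   clears the same bit
	 *   bfq_bfqq_busy(bfqq)          tests  it (returns 0 or 1)
	 */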
5722 ++/* Logging facilities. */
5723 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
5724 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
5725 ++
5726 ++#define bfq_log(bfqd, fmt, args...) \
5727 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
5728 ++
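Usage as it appears earlier in this patch; both macros expand to blk_add_trace_msg(), so the messages end up in the blktrace stream of the device's request queue:

	/* Examples from bfq-sched.c above:
	 *   bfq_log_bfqq(bfqd, bfqq, "add to busy");
	 *   bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
	 */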
5729 ++/* Expiration reasons. */
5730 ++enum bfqq_expiration {
5731 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
5732 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
5733 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
5734 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
5735 ++};
5736 ++
5737 ++#ifdef CONFIG_CGROUP_BFQIO
5738 ++/**
5739 ++ * struct bfq_group - per (device, cgroup) data structure.
5740 ++ * @entity: schedulable entity to insert into the parent group sched_data.
5741 ++ * @sched_data: own sched_data, to contain child entities (they may be
5742 ++ * both bfq_queues and bfq_groups).
5743 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
5744 ++ * list of the containing cgroup's bfqio_cgroup.
5745 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
5746 ++ * of the groups active on the same device; used for cleanup.
5747 ++ * @bfqd: the bfq_data for the device this group acts upon.
5748 ++ * @async_bfqq: array of async queues for all the tasks belonging to
5749 ++ * the group, one queue per ioprio value per ioprio_class,
5750 ++ * except for the idle class that has only one queue.
5751 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
5752 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
5753 ++ * to avoid too many special cases during group creation/migration.
5754 ++ *
5755 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
5756 ++ * there is a set of bfq_groups, each one collecting the lower-level
5757 ++ * entities belonging to the group that are acting on the same device.
5758 ++ *
5759 ++ * Locking works as follows:
5760 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
5761 ++ * via RCU from its readers.
5762 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
5763 ++ * from the readers.
5764 ++ * o All the other fields are protected by the @bfqd queue lock.
5765 ++ */
5766 ++struct bfq_group {
5767 ++ struct bfq_entity entity;
5768 ++ struct bfq_sched_data sched_data;
5769 ++
5770 ++ struct hlist_node group_node;
5771 ++ struct hlist_node bfqd_node;
5772 ++
5773 ++ void *bfqd;
5774 ++
5775 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5776 ++ struct bfq_queue *async_idle_bfqq;
5777 ++
5778 ++ struct bfq_entity *my_entity;
5779 ++};
5780 ++
5781 ++/**
5782 ++ * struct bfqio_cgroup - bfq cgroup data structure.
5783 ++ * @css: subsystem state for bfq in the containing cgroup.
5784 ++ * @weight: cgroup weight.
5785 ++ * @ioprio: cgroup ioprio.
5786 ++ * @ioprio_class: cgroup ioprio_class.
5787 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
5788 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
5789 ++ *
5790 ++ * @group_data is accessed using RCU, with @lock protecting the updates,
5791 ++ * @ioprio and @ioprio_class are protected by @lock.
5792 ++ */
5793 ++struct bfqio_cgroup {
5794 ++ struct cgroup_subsys_state css;
5795 ++
5796 ++ unsigned short weight, ioprio, ioprio_class;
5797 ++
5798 ++ spinlock_t lock;
5799 ++ struct hlist_head group_data;
5800 ++};
5801 ++#else
5802 ++struct bfq_group {
5803 ++ struct bfq_sched_data sched_data;
5804 ++
5805 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5806 ++ struct bfq_queue *async_idle_bfqq;
5807 ++};
5808 ++#endif
5809 ++
5810 ++static inline struct bfq_service_tree *
5811 ++bfq_entity_service_tree(struct bfq_entity *entity)
5812 ++{
5813 ++ struct bfq_sched_data *sched_data = entity->sched_data;
5814 ++ unsigned int idx = entity->ioprio_class - 1;
5815 ++
5816 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
5817 ++ BUG_ON(sched_data == NULL);
5818 ++
5819 ++ return sched_data->service_tree + idx;
5820 ++}
5821 ++
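The index arithmetic above assumes the standard ioprio class values from <linux/ioprio.h>:

	/* Assuming IOPRIO_CLASS_RT == 1, IOPRIO_CLASS_BE == 2,
	 * IOPRIO_CLASS_IDLE == 3:
	 *   RT   -> sched_data->service_tree[0]
	 *   BE   -> sched_data->service_tree[1]
	 *   IDLE -> sched_data->service_tree[2]
	 * matching BFQ_IOPRIO_CLASSES == 3 defined at the top of this header.
	 */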
5822 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
5823 ++ int is_sync)
5824 ++{
5825 ++ return bic->bfqq[!!is_sync];
5826 ++}
5827 ++
5828 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
5829 ++ struct bfq_queue *bfqq, int is_sync)
5830 ++{
5831 ++ bic->bfqq[!!is_sync] = bfqq;
5832 ++}
5833 ++
5834 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
5835 ++{
5836 ++ return bic->icq.q->elevator->elevator_data;
5837 ++}
5838 ++
5839 ++/**
5840 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
5841 ++ * @ptr: a pointer to a bfqd.
5842 ++ * @flags: storage for the flags to be saved.
5843 ++ *
5844 ++ * This function allows bfqg->bfqd to be protected by the
5845 ++ * queue lock of the bfqd they reference; the pointer is dereferenced
5846 ++ * under RCU, so the storage for bfqd is assured to be safe as long
5847 ++ * as the RCU read side critical section does not end. After the
5848 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
5849 ++ * sure that no other writer accessed it. If we raced with a writer,
5850 ++ * the function returns NULL, with the queue unlocked, otherwise it
5851 ++ * returns the dereferenced pointer, with the queue locked.
5852 ++ */
5853 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
5854 ++ unsigned long *flags)
5855 ++{
5856 ++ struct bfq_data *bfqd;
5857 ++
5858 ++ rcu_read_lock();
5859 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
5860 ++
5861 ++ if (bfqd != NULL) {
5862 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
5863 ++ if (*ptr == bfqd)
5864 ++ goto out;
5865 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
5866 ++ }
5867 ++
5868 ++ bfqd = NULL;
5869 ++out:
5870 ++ rcu_read_unlock();
5871 ++ return bfqd;
5872 ++}
5873 ++
5874 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
5875 ++ unsigned long *flags)
5876 ++{
5877 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
5878 ++}
5879 ++
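A hedged usage sketch for the pair of helpers above; the caller is hypothetical and assumes the CONFIG_CGROUP_BFQIO variant of struct bfq_group, whose bfqd field is the RCU-protected pointer being dereferenced. The point is that a successful bfq_get_bfqd_locked() must be paired with bfq_put_bfqd_unlock() using the same flags storage.

	/* Hypothetical caller, for illustration only. */
	static void sketch_with_bfqd_locked(struct bfq_group *bfqg)
	{
		unsigned long flags;
		struct bfq_data *bfqd;

		bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
		if (bfqd == NULL)
			return;	/* raced with a writer; queue left unlocked */

		/* ... work under bfqd->queue->queue_lock, IRQs disabled ... */

		bfq_put_bfqd_unlock(bfqd, &flags);
	}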
5880 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
5881 ++static void bfq_put_queue(struct bfq_queue *bfqq);
5882 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
5883 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
5884 ++ struct bfq_group *bfqg, int is_sync,
5885 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
5886 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
5887 ++ struct bfq_group *bfqg);
5888 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
5889 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
5890 ++#endif
5891 +--
5892 +1.8.1.4
5893 +
5894
5895 Added: genpatches-2.6/trunk/3.10/1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.9.0.patch1
5896 ===================================================================
5897 --- genpatches-2.6/trunk/3.10/1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.9.0.patch1 (rev 0)
5898 +++ genpatches-2.6/trunk/3.10/1803_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v6r2-for-3.9.0.patch1 2013-07-01 07:02:35 UTC (rev 2423)
5899 @@ -0,0 +1,1049 @@
5900 +From 84032f90ea34f4d17b361eac4707793797db1461 Mon Sep 17 00:00:00 2001
5901 +From: Arianna Avanzini <avanzini.arianna@×××××.com>
5902 +Date: Fri, 14 Jun 2013 13:46:47 +0200
5903 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for
5904 + 3.9.0
5905 +
5906 +A set of processes may happen to perform interleaved reads, i.e., requests
5907 +whose union would give rise to a sequential read pattern. There are two
5908 +typical cases: in the first case, processes read fixed-size chunks of
5909 +data at a fixed distance from each other, while in the second case processes
5910 +may read variable-size chunks at variable distances. The latter case occurs
5911 +for example with KVM, which splits the I/O generated by the guest into
5912 +multiple chunks, and lets these chunks be served by a pool of cooperating
5913 +processes, iteratively assigning the next chunk of I/O to the first
5914 +available process. CFQ uses actual queue merging for the first type of
5915 +processes, whereas it uses preemption to get a sequential read pattern out
5916 +of the read requests performed by the second type of processes. In the end
5917 +it uses two different mechanisms to achieve the same goal: boosting the
5918 +throughput with interleaved I/O.
5919 +
5920 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
5921 +sequential read pattern with both types of processes. The main idea is
5922 +checking newly arrived requests against the next request of the active queue
5923 +both in case of actual request insert and in case of request merge. By doing
5924 +so, both types of processes can be handled by just merging their queues.
5925 +EQM is then simpler and more compact than the pair of mechanisms used in
5926 +CFQ.
5927 +
5928 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
5929 +properly restoring the weight-raising state of a queue when it gets back to
5930 +a non-merged state.
5931 +
5932 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
5933 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
5934 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
5935 +---
5936 + block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++----------------
5937 + block/bfq-sched.c | 28 ---
5938 + block/bfq.h | 16 ++
5939 + 3 files changed, 466 insertions(+), 231 deletions(-)
5940 +
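Roughly, the merge decision that the diff below implements in bfq_setup_cooperator() can be read as the following simplified pseudo-logic (see the real function for the full set of checks):

	/* Simplified view of the EQM decision on request arrival/merge:
	 *   if the new I/O lands close to bfqd->last_position and both the
	 *   arriving queue and the active queue are sync and share a parent
	 *           -> schedule a merge with the active queue;
	 *   else look up a close cooperator in the rq_pos_tree
	 *           -> if found, schedule a merge with it;
	 *   else    -> no merge, the queues stay separate.
	 */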
5941 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
5942 +index b230927..bc57923 100644
5943 +--- a/block/bfq-iosched.c
5944 ++++ b/block/bfq-iosched.c
5945 +@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
5946 + return dur;
5947 + }
5948 +
5949 ++static inline void
5950 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
5951 ++{
5952 ++ if (bic->saved_idle_window)
5953 ++ bfq_mark_bfqq_idle_window(bfqq);
5954 ++ else
5955 ++ bfq_clear_bfqq_idle_window(bfqq);
5956 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
5957 ++ /*
5958 ++ * Start a weight raising period with the duration given by
5959 ++ * the raising_time_left snapshot.
5960 ++ */
5961 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
5962 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
5963 ++ bfqq->last_rais_start_finish = jiffies;
5964 ++ }
5965 ++ /*
5966 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
5967 ++ * getting confused about the queue's need of a weight-raising
5968 ++ * period.
5969 ++ */
5970 ++ bic->raising_time_left = 0;
5971 ++}
5972 ++
5973 ++/*
5974 ++ * Must be called with the queue_lock held.
5975 ++ */
5976 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
5977 ++{
5978 ++ int process_refs, io_refs;
5979 ++
5980 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
5981 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
5982 ++ BUG_ON(process_refs < 0);
5983 ++ return process_refs;
5984 ++}
5985 ++
5986 + static void bfq_add_rq_rb(struct request *rq)
5987 + {
5988 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
5989 +@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq)
5990 + if (! bfqd->low_latency)
5991 + goto add_bfqq_busy;
5992 +
5993 ++ if (bfq_bfqq_just_split(bfqq))
5994 ++ goto set_ioprio_changed;
5995 ++
5996 + /*
5997 +- * If the queue is not being boosted and has been idle
5998 +- * for enough time, start a weight-raising period
5999 ++ * If the queue:
6000 ++ * - is not being boosted,
6001 ++ * - has been idle for enough time,
6002 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
6003 ++ *   shared "by nature" or it is not shared and its
6004 ++ * requests have not been redirected to a shared queue)
6005 ++ * start a weight-raising period.
6006 + */
6007 +- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
6008 ++ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
6009 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
6010 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
6011 + if (idle_for_long_time)
6012 + bfqq->raising_cur_max_time =
6013 +@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq)
6014 + raising_cur_max_time));
6015 + }
6016 + }
6017 ++set_ioprio_changed:
6018 + if (old_raising_coeff != bfqq->raising_coeff)
6019 + entity->ioprio_changed = 1;
6020 + add_bfqq_busy:
6021 +@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
6022 + spin_unlock_irq(bfqd->queue->queue_lock);
6023 + }
6024 +
6025 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
6026 +- struct bio *bio)
6027 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
6028 + {
6029 +- struct bfq_data *bfqd = q->elevator->elevator_data;
6030 +- struct bfq_io_cq *bic;
6031 +- struct bfq_queue *bfqq;
6032 +-
6033 +- /*
6034 +- * Disallow merge of a sync bio into an async request.
6035 +- */
6036 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
6037 +- return 0;
6038 +-
6039 +- /*
6040 +- * Lookup the bfqq that this bio will be queued with. Allow
6041 +- * merge only if rq is queued there.
6042 +- * Queue lock is held here.
6043 +- */
6044 +- bic = bfq_bic_lookup(bfqd, current->io_context);
6045 +- if (bic == NULL)
6046 +- return 0;
6047 +-
6048 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
6049 +- return bfqq == RQ_BFQQ(rq);
6050 +-}
6051 +-
6052 +-static void __bfq_set_active_queue(struct bfq_data *bfqd,
6053 +- struct bfq_queue *bfqq)
6054 +-{
6055 +- if (bfqq != NULL) {
6056 +- bfq_mark_bfqq_must_alloc(bfqq);
6057 +- bfq_mark_bfqq_budget_new(bfqq);
6058 +- bfq_clear_bfqq_fifo_expire(bfqq);
6059 +-
6060 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
6061 +-
6062 +- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
6063 +- bfqq->entity.budget);
6064 +- }
6065 +-
6066 +- bfqd->active_queue = bfqq;
6067 +-}
6068 +-
6069 +-/*
6070 +- * Get and set a new active queue for service.
6071 +- */
6072 +-static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
6073 +- struct bfq_queue *bfqq)
6074 +-{
6075 +- if (!bfqq)
6076 +- bfqq = bfq_get_next_queue(bfqd);
6077 ++ if (request)
6078 ++ return blk_rq_pos(io_struct);
6079 + else
6080 +- bfq_get_next_queue_forced(bfqd, bfqq);
6081 +-
6082 +- __bfq_set_active_queue(bfqd, bfqq);
6083 +- return bfqq;
6084 ++ return ((struct bio *)io_struct)->bi_sector;
6085 + }
6086 +
6087 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
6088 +- struct request *rq)
6089 ++static inline sector_t bfq_dist_from(sector_t pos1,
6090 ++ sector_t pos2)
6091 + {
6092 +- if (blk_rq_pos(rq) >= bfqd->last_position)
6093 +- return blk_rq_pos(rq) - bfqd->last_position;
6094 ++ if (pos1 >= pos2)
6095 ++ return pos1 - pos2;
6096 + else
6097 +- return bfqd->last_position - blk_rq_pos(rq);
6098 ++ return pos2 - pos1;
6099 + }
6100 +
6101 +-/*
6102 +- * Return true if bfqq has no request pending and rq is close enough to
6103 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
6104 +- * bfqq->next_rq
6105 +- */
6106 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
6107 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
6108 ++ sector_t sector)
6109 + {
6110 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
6111 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
6112 ++ BFQQ_SEEK_THR;
6113 + }
6114 +
6115 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6116 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
6117 + {
6118 + struct rb_root *root = &bfqd->rq_pos_tree;
6119 + struct rb_node *parent, *node;
6120 + struct bfq_queue *__bfqq;
6121 +- sector_t sector = bfqd->last_position;
6122 +
6123 + if (RB_EMPTY_ROOT(root))
6124 + return NULL;
6125 +@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6126 + * position).
6127 + */
6128 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
6129 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
6130 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
6131 + return __bfqq;
6132 +
6133 + if (blk_rq_pos(__bfqq->next_rq) < sector)
6134 +@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6135 + return NULL;
6136 +
6137 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
6138 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
6139 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
6140 + return __bfqq;
6141 +
6142 + return NULL;
6143 +@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6144 + /*
6145 + * bfqd - obvious
6146 + * cur_bfqq - passed in so that we don't decide that the current queue
6147 +- * is closely cooperating with itself.
6148 +- *
6149 +- * We are assuming that cur_bfqq has dispatched at least one request,
6150 +- * and that bfqd->last_position reflects a position on the disk associated
6151 +- * with the I/O issued by cur_bfqq.
6152 ++ * is closely cooperating with itself
6153 ++ * sector - used as a reference point to search for a close queue
6154 + */
6155 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6156 +- struct bfq_queue *cur_bfqq)
6157 ++ struct bfq_queue *cur_bfqq,
6158 ++ sector_t sector)
6159 + {
6160 + struct bfq_queue *bfqq;
6161 +
6162 +@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6163 + * working closely on the same area of the disk. In that case,
6164 + * we can group them together and don't waste time idling.
6165 + */
6166 +- bfqq = bfqq_close(bfqd);
6167 ++ bfqq = bfqq_close(bfqd, sector);
6168 + if (bfqq == NULL || bfqq == cur_bfqq)
6169 + return NULL;
6170 +
6171 +@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6172 + return bfqq;
6173 + }
6174 +
6175 ++static struct bfq_queue *
6176 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6177 ++{
6178 ++ int process_refs, new_process_refs;
6179 ++ struct bfq_queue *__bfqq;
6180 ++
6181 ++ /*
6182 ++ * If there are no process references on the new_bfqq, then it is
6183 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
6184 ++ * may have dropped their last reference (not just their last process
6185 ++ * reference).
6186 ++ */
6187 ++ if (!bfqq_process_refs(new_bfqq))
6188 ++ return NULL;
6189 ++
6190 ++ /* Avoid a circular list and skip interim queue merges. */
6191 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
6192 ++ if (__bfqq == bfqq)
6193 ++ return NULL;
6194 ++ new_bfqq = __bfqq;
6195 ++ }
6196 ++
6197 ++ process_refs = bfqq_process_refs(bfqq);
6198 ++ new_process_refs = bfqq_process_refs(new_bfqq);
6199 ++ /*
6200 ++ * If the process for the bfqq has gone away, there is no
6201 ++ * sense in merging the queues.
6202 ++ */
6203 ++ if (process_refs == 0 || new_process_refs == 0)
6204 ++ return NULL;
6205 ++
6206 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
6207 ++ new_bfqq->pid);
6208 ++
6209 ++ /*
6210 ++ * Merging is just a redirection: the requests of the process owning
6211 ++ * one of the two queues are redirected to the other queue. The latter
6212 ++ * queue, in its turn, is set as shared if this is the first time that
6213 ++ * the requests of some process are redirected to it.
6214 ++ *
6215 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
6216 ++ * are in the context of the process owning bfqq, hence we have the
6217 ++ * io_cq of this process. So we can immediately configure this io_cq
6218 ++ * to redirect the requests of the process to new_bfqq.
6219 ++ *
6220 ++ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of
6221 ++ * new_bfqq is not available, because, if the active queue is shared,
6222 ++ * bfqd->active_bic may not point to the io_cq of the active queue.
6223 ++ * Redirecting the requests of the process owning bfqq to the currently
6224 ++ * active queue is in any case the best option, as we feed the active queue
6225 ++ * with new requests close to the last request served and, by doing so,
6226 ++ * hopefully increase the throughput.
6227 ++ */
6228 ++ bfqq->new_bfqq = new_bfqq;
6229 ++ atomic_add(process_refs, &new_bfqq->ref);
6230 ++ return new_bfqq;
6231 ++}
6232 ++
6233 ++/*
6234 ++ * Attempt to schedule a merge of bfqq with the currently active queue or
6235 ++ * with a close queue among the scheduled queues.
6236 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
6237 ++ * structure otherwise.
6238 ++ */
6239 ++static struct bfq_queue *
6240 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6241 ++ void *io_struct, bool request)
6242 ++{
6243 ++ struct bfq_queue *active_bfqq, *new_bfqq;
6244 ++
6245 ++ if (bfqq->new_bfqq)
6246 ++ return bfqq->new_bfqq;
6247 ++
6248 ++ if (!io_struct)
6249 ++ return NULL;
6250 ++
6251 ++ active_bfqq = bfqd->active_queue;
6252 ++
6253 ++ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic)
6254 ++ goto check_scheduled;
6255 ++
6256 ++ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq))
6257 ++ goto check_scheduled;
6258 ++
6259 ++ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq))
6260 ++ goto check_scheduled;
6261 ++
6262 ++ if (active_bfqq->entity.parent != bfqq->entity.parent)
6263 ++ goto check_scheduled;
6264 ++
6265 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
6266 ++ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq))
6267 ++ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq)))
6268 ++ return new_bfqq; /* Merge with the active queue */
6269 ++
6270 ++ /*
6271 ++ * Check whether there is a cooperator among currently scheduled
6272 ++ * queues. The only thing we need is that the bio/request is not
6273 ++ * NULL, as we need it to establish whether a cooperator exists.
6274 ++ */
6275 ++check_scheduled:
6276 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
6277 ++ bfq_io_struct_pos(io_struct, request));
6278 ++ if (new_bfqq)
6279 ++ return bfq_setup_merge(bfqq, new_bfqq);
6280 ++
6281 ++ return NULL;
6282 ++}
6283 ++
6284 ++static inline void
6285 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
6286 ++{
6287 ++ /*
6288 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
6289 ++ * have already been redirected to a shared queue; both idle window
6290 ++ * and weight raising state have already been saved. Do nothing.
6291 ++ */
6292 ++ if (bfqq->bic == NULL)
6293 ++ return;
6294 ++ if (bfqq->bic->raising_time_left)
6295 ++ /*
6296 ++ * This is the queue of a just-started process, and would
6297 ++ * deserve weight raising: we set raising_time_left to the full
6298 ++ * weight-raising duration to trigger weight-raising when and
6299 ++ * if the queue is split and the first request of the queue
6300 ++ * is enqueued.
6301 ++ */
6302 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
6303 ++ else if (bfqq->raising_coeff > 1) {
6304 ++ unsigned long wrais_duration =
6305 ++ jiffies - bfqq->last_rais_start_finish;
6306 ++ /*
6307 ++ * It may happen that a queue's weight raising period lasts
6308 ++ * longer than its raising_cur_max_time, as weight raising is
6309 ++ * handled only when a request is enqueued or dispatched (it
6310 ++ * does not use any timer). If the weight raising period is
6311 ++ * about to end, don't save it.
6312 ++ */
6313 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
6314 ++ bfqq->bic->raising_time_left = 0;
6315 ++ else
6316 ++ bfqq->bic->raising_time_left =
6317 ++ bfqq->raising_cur_max_time - wrais_duration;
6318 ++ /*
6319 ++ * The bfq_queue is becoming shared or the requests of the
6320 ++ * process owning the queue are being redirected to a shared
6321 ++ * queue. Stop the weight raising period of the queue, as in
6322 ++ * both cases it should not be owned by an interactive or soft
6323 ++ * real-time application.
6324 ++ */
6325 ++ bfq_bfqq_end_raising(bfqq);
6326 ++ } else
6327 ++ bfqq->bic->raising_time_left = 0;
6328 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
6329 ++}
6330 ++
6331 ++static inline void
6332 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
6333 ++{
6334 ++ /*
6335 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
6336 ++ * is about to begin using a shared bfq_queue.
6337 ++ */
6338 ++ if (bfqq->bic)
6339 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
6340 ++}
6341 ++
6342 ++static void
6343 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
6344 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6345 ++{
6346 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
6347 ++ (long unsigned)new_bfqq->pid);
6348 ++ /* Save weight raising and idle window of the merged queues */
6349 ++ bfq_bfqq_save_state(bfqq);
6350 ++ bfq_bfqq_save_state(new_bfqq);
6351 ++ /*
6352 ++ * Grab a reference to the bic, to prevent it from being destroyed
6353 ++ * before being possibly touched by a bfq_split_bfqq().
6354 ++ */
6355 ++ bfq_get_bic_reference(bfqq);
6356 ++ bfq_get_bic_reference(new_bfqq);
6357 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
6358 ++ bic_set_bfqq(bic, new_bfqq, 1);
6359 ++ bfq_mark_bfqq_coop(new_bfqq);
6360 ++ /*
6361 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
6362 ++ * new_bfqq->bic to NULL. bfqq either:
6363 ++ * - does not belong to any bic any more, and hence bfqq->bic must
6364 ++ * be set to NULL, or
6365 ++ * - is a queue whose owning bics have already been redirected to a
6366 ++ * different queue, hence the queue is destined to not belong to any
6367 ++ * bic soon and bfqq->bic is already NULL (therefore the next
6368 ++ * assignment causes no harm).
6369 ++ */
6370 ++ new_bfqq->bic = NULL;
6371 ++ bfqq->bic = NULL;
6372 ++ bfq_put_queue(bfqq);
6373 ++}
6374 ++
6375 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
6376 ++ struct bio *bio)
6377 ++{
6378 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
6379 ++ struct bfq_io_cq *bic;
6380 ++ struct bfq_queue *bfqq, *new_bfqq;
6381 ++
6382 ++ /*
6383 ++ * Disallow merge of a sync bio into an async request.
6384 ++ */
6385 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
6386 ++ return 0;
6387 ++
6388 ++ /*
6389 ++ * Lookup the bfqq that this bio will be queued with. Allow
6390 ++ * merge only if rq is queued there.
6391 ++ * Queue lock is held here.
6392 ++ */
6393 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
6394 ++ if (bic == NULL)
6395 ++ return 0;
6396 ++
6397 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
6398 ++ /*
6399 ++ * We take advantage of this function to perform an early merge
6400 ++ * of the queues of possible cooperating processes.
6401 ++ */
6402 ++ if (bfqq != NULL &&
6403 ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) {
6404 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
6405 ++ /*
6406 ++ * If we get here, the bio will be queued in the shared queue,
6407 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
6408 ++ * rq can be merged.
6409 ++ */
6410 ++ bfqq = new_bfqq;
6411 ++ }
6412 ++
6413 ++ return bfqq == RQ_BFQQ(rq);
6414 ++}
6415 ++
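A minimal stand-alone sketch of the merge decision implemented by bfq_allow_merge() above (toy_* names are illustrative, and bare pointers stand in for the real bfq_queue lookup): a sync bio never merges into an async request, and otherwise merging is allowed only when bio and request map to the same queue, possibly the shared queue chosen by the early merge.

#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-alone sketch (not part of the patch) of the bio/request merge
 * decision above: a sync bio is never merged into an async request, and
 * otherwise the merge is allowed only if the bio would be queued in the
 * same queue as the request.
 */
static bool toy_allow_merge(bool bio_sync, bool rq_sync,
                            const void *bio_queue, const void *rq_queue)
{
        if (bio_sync && !rq_sync)
                return false;
        return bio_queue == rq_queue;
}

int main(void)
{
        int shared_q, other_q;

        printf("%d\n", toy_allow_merge(true, false, &shared_q, &shared_q)); /* 0 */
        printf("%d\n", toy_allow_merge(true, true, &shared_q, &shared_q));  /* 1 */
        printf("%d\n", toy_allow_merge(false, false, &other_q, &shared_q)); /* 0 */
        return 0;
}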
6416 ++static void __bfq_set_active_queue(struct bfq_data *bfqd,
6417 ++ struct bfq_queue *bfqq)
6418 ++{
6419 ++ if (bfqq != NULL) {
6420 ++ bfq_mark_bfqq_must_alloc(bfqq);
6421 ++ bfq_mark_bfqq_budget_new(bfqq);
6422 ++ bfq_clear_bfqq_fifo_expire(bfqq);
6423 ++
6424 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
6425 ++
6426 ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
6427 ++ bfqq->entity.budget);
6428 ++ }
6429 ++
6430 ++ bfqd->active_queue = bfqq;
6431 ++}
6432 ++
6433 ++/*
6434 ++ * Get and set a new active queue for service.
6435 ++ */
6436 ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
6437 ++{
6438 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
6439 ++
6440 ++ __bfq_set_active_queue(bfqd, bfqq);
6441 ++ return bfqq;
6442 ++}
6443 ++
6444 + /*
6445 + * If enough samples have been computed, return the current max budget
6446 + * stored in bfqd, which is dynamically updated according to the
6447 +@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
6448 + return rq;
6449 + }
6450 +
6451 +-/*
6452 +- * Must be called with the queue_lock held.
6453 +- */
6454 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
6455 +-{
6456 +- int process_refs, io_refs;
6457 +-
6458 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
6459 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
6460 +- BUG_ON(process_refs < 0);
6461 +- return process_refs;
6462 +-}
6463 +-
6464 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6465 +-{
6466 +- int process_refs, new_process_refs;
6467 +- struct bfq_queue *__bfqq;
6468 +-
6469 +- /*
6470 +- * If there are no process references on the new_bfqq, then it is
6471 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
6472 +- * may have dropped their last reference (not just their last process
6473 +- * reference).
6474 +- */
6475 +- if (!bfqq_process_refs(new_bfqq))
6476 +- return;
6477 +-
6478 +- /* Avoid a circular list and skip interim queue merges. */
6479 +- while ((__bfqq = new_bfqq->new_bfqq)) {
6480 +- if (__bfqq == bfqq)
6481 +- return;
6482 +- new_bfqq = __bfqq;
6483 +- }
6484 +-
6485 +- process_refs = bfqq_process_refs(bfqq);
6486 +- new_process_refs = bfqq_process_refs(new_bfqq);
6487 +- /*
6488 +- * If the process for the bfqq has gone away, there is no
6489 +- * sense in merging the queues.
6490 +- */
6491 +- if (process_refs == 0 || new_process_refs == 0)
6492 +- return;
6493 +-
6494 +- /*
6495 +- * Merge in the direction of the lesser amount of work.
6496 +- */
6497 +- if (new_process_refs >= process_refs) {
6498 +- bfqq->new_bfqq = new_bfqq;
6499 +- atomic_add(process_refs, &new_bfqq->ref);
6500 +- } else {
6501 +- new_bfqq->new_bfqq = bfqq;
6502 +- atomic_add(new_process_refs, &bfqq->ref);
6503 +- }
6504 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
6505 +- new_bfqq->pid);
6506 +-}
6507 +-
6508 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
6509 + {
6510 + struct bfq_entity *entity = &bfqq->entity;
6511 +@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
6512 + * is likely to boost the disk throughput);
6513 + * - the queue is weight-raised (waiting for the request is necessary for
6514 + * providing the queue with fairness and latency guarantees).
6515 ++ *
6516 ++ * In any case, idling can be disabled for cooperation issues, if
6517 ++ * 1) there is a close cooperator for the queue, or
6518 ++ * 2) the queue is shared and some cooperator is likely to be idle (in this
6519 ++ * case, by not arming the idle timer, we try to slow down the queue, to
6520 ++ * prevent the zones of the disk accessed by the active cooperators from
6521 ++ * becoming too distant from the zone that will be accessed by the currently
6522 ++ * idle cooperators).
6523 + */
6524 + static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6525 + int budg_timeout)
6526 +@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6527 + (bfqd->rq_in_driver == 0 ||
6528 + budg_timeout ||
6529 + bfqq->raising_coeff > 1) &&
6530 +- !bfq_close_cooperator(bfqd, bfqq) &&
6531 ++ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) &&
6532 + (!bfq_bfqq_coop(bfqq) ||
6533 + !bfq_bfqq_some_coop_idle(bfqq)) &&
6534 + !bfq_queue_nonrot_noidle(bfqd, bfqq));
6535 +@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6536 + */
6537 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6538 + {
6539 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
6540 ++ struct bfq_queue *bfqq;
6541 + struct request *next_rq;
6542 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
6543 + int budg_timeout;
6544 +@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6545 +
6546 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
6547 +
6548 +- /*
6549 +- * If another queue has a request waiting within our mean seek
6550 +- * distance, let it run. The expire code will check for close
6551 +- * cooperators and put the close queue at the front of the
6552 +- * service tree. If possible, merge the expiring queue with the
6553 +- * new bfqq.
6554 +- */
6555 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
6556 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
6557 +- bfq_setup_merge(bfqq, new_bfqq);
6558 +-
6559 + budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
6560 + if (budg_timeout &&
6561 + !bfq_bfqq_must_idle(bfqq, budg_timeout))
6562 +@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6563 + bfq_clear_bfqq_wait_request(bfqq);
6564 + del_timer(&bfqd->idle_slice_timer);
6565 + }
6566 +- if (new_bfqq == NULL)
6567 +- goto keep_queue;
6568 +- else
6569 +- goto expire;
6570 ++ goto keep_queue;
6571 + }
6572 + }
6573 +
6574 +@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6575 + * queue still has requests in flight or is idling for a new request,
6576 + * then keep it.
6577 + */
6578 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
6579 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
6580 + (bfqq->dispatched != 0 &&
6581 + (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
6582 +- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
6583 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
6584 + bfqq = NULL;
6585 + goto keep_queue;
6586 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
6587 +- /*
6588 +- * Expiring the queue because there is a close cooperator,
6589 +- * cancel timer.
6590 +- */
6591 +- bfq_clear_bfqq_wait_request(bfqq);
6592 +- del_timer(&bfqd->idle_slice_timer);
6593 + }
6594 +
6595 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
6596 + expire:
6597 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
6598 + new_queue:
6599 +- bfqq = bfq_set_active_queue(bfqd, new_bfqq);
6600 ++ bfqq = bfq_set_active_queue(bfqd);
6601 + bfq_log(bfqd, "select_queue: new queue %d returned",
6602 + bfqq != NULL ? bfqq->pid : 0);
6603 + keep_queue:
6604 +@@ -1617,9 +1807,8 @@ keep_queue:
6605 +
6606 + static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6607 + {
6608 ++ struct bfq_entity *entity = &bfqq->entity;
6609 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
6610 +- struct bfq_entity *entity = &bfqq->entity;
6611 +-
6612 + bfq_log_bfqq(bfqd, bfqq,
6613 + "raising period dur %u/%u msec, "
6614 + "old raising coeff %u, w %d(%d)",
6615 +@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6616 + jiffies_to_msecs(bfqq->
6617 + raising_cur_max_time));
6618 + bfq_bfqq_end_raising(bfqq);
6619 +- __bfq_entity_update_weight_prio(
6620 +- bfq_entity_service_tree(entity),
6621 +- entity);
6622 + }
6623 + }
6624 + }
6625 ++ /* Update weight both if it must be raised and if it must be lowered */
6626 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
6627 ++ __bfq_entity_update_weight_prio(
6628 ++ bfq_entity_service_tree(entity),
6629 ++ entity);
6630 + }
6631 +
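The condition guarding __bfq_entity_update_weight_prio() above can be read as "the boosted/non-boosted state of the weight disagrees with what raising_coeff says it should be", so the weight is recomputed both when raising begins and when it ends. A small stand-alone sketch (illustrative names, not BFQ code) makes the four cases explicit:

#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-alone illustration (not part of the patch) of the condition used in
 * update_raising_data(): recompute the weight only when the "weight is
 * currently boosted" state disagrees with the "queue should be boosted"
 * state, covering both the start and the end of weight raising.
 */
static bool needs_weight_update(int weight, int orig_weight, int raising_coeff)
{
        return (weight > orig_weight) != (raising_coeff > 1);
}

int main(void)
{
        printf("%d\n", needs_weight_update(100, 100, 10)); /* 1: raising began, weight not boosted yet */
        printf("%d\n", needs_weight_update(1000, 100, 1)); /* 1: raising ended, weight still boosted */
        printf("%d\n", needs_weight_update(1000, 100, 10)); /* 0: already boosted, nothing to do */
        printf("%d\n", needs_weight_update(100, 100, 1));   /* 0: not boosted, as expected */
        return 0;
}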
6632 + /*
6633 +@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq)
6634 + struct bfq_io_cq *bic = icq_to_bic(icq);
6635 +
6636 + bic->ttime.last_end_request = jiffies;
6637 ++ /*
6638 ++ * A newly created bic indicates that the process has just
6639 ++ * started doing I/O, and is probably mapping into memory its
6640 ++ * executable and libraries: it definitely needs weight raising.
6641 ++ * There is however the possibility that the process performs,
6642 ++ * for a while, I/O close to some other process. EQM intercepts
6643 ++ * this behavior and may merge the queue corresponding to the
6644 ++ * process with some other queue, BEFORE the weight of the queue
6645 ++ * is raised. Merged queues are not weight-raised (they are assumed
6646 ++ * to belong to processes that benefit only from high throughput).
6647 ++ * If the merge is basically the consequence of an accident, then
6648 ++ * the queue will be split soon and will get back its old weight.
6649 ++ * It is then important to record somewhere that this queue does
6650 ++ * need weight raising, even if it did not get its weight raised
6651 ++ * before being merged. For this purpose, we overload the field
6652 ++ * raising_time_left and assign 1 to it, to mark the queue as
6653 ++ * needing weight raising.
6654 ++ */
6655 ++ bic->raising_time_left = 1;
6656 + }
6657 +
6658 + static void bfq_exit_icq(struct io_cq *icq)
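A stand-alone toy model of the raising_time_left overload described in the comment above (the struct and helpers are illustrative, not BFQ's actual types): 0 means there is nothing to restore, 1 is the "deserves weight raising" marker set at bic creation, and larger values hold the time actually saved at merge time.

#include <stdio.h>

/*
 * Simplified model (not part of the patch) of the raising_time_left
 * overload: at bic creation the field is set to 1 as a "this process still
 * deserves weight raising" marker; it is cleared once a request is enqueued
 * for a non-merged queue, and at merge time it holds the real remaining
 * raising time saved by bfq_bfqq_save_state().
 */
struct toy_bic {
        unsigned int raising_time_left;
};

static void toy_init_bic(struct toy_bic *bic)
{
        bic->raising_time_left = 1;     /* mark as needing weight raising */
}

static const char *toy_interpret(const struct toy_bic *bic)
{
        if (bic->raising_time_left == 0)
                return "no weight raising to restore";
        if (bic->raising_time_left == 1)
                return "marked as deserving weight raising (never started)";
        return "remaining raising time saved at merge";
}

int main(void)
{
        struct toy_bic bic;

        toy_init_bic(&bic);
        printf("%s\n", toy_interpret(&bic));
        bic.raising_time_left = 0;
        printf("%s\n", toy_interpret(&bic));
        bic.raising_time_left = 2500;
        printf("%s\n", toy_interpret(&bic));
        return 0;
}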
6659 +@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq)
6660 + }
6661 +
6662 + if (bic->bfqq[BLK_RW_SYNC]) {
6663 ++ /*
6664 ++ * If the bic is using a shared queue, put the reference
6665 ++ * taken on the io_context when the bic started using a
6666 ++ * shared bfq_queue.
6667 ++ */
6668 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
6669 ++ put_io_context(icq->ioc);
6670 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
6671 + bic->bfqq[BLK_RW_SYNC] = NULL;
6672 + }
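A toy sketch of the reference pairing involved here (illustrative only): bfq_get_bic_reference() pins the io_context when a queue becomes shared, and the matching put happens either at split time or, as above, when the bic of a coop queue exits.

#include <stdio.h>

/*
 * Toy model (not part of the patch) of the io_context reference pairing:
 * one extra reference is taken when the queue becomes shared and dropped
 * again on split or on bic exit.
 */
struct toy_ioc {
        long refcount;
};

static void toy_get_ioc(struct toy_ioc *ioc) { ioc->refcount++; }
static void toy_put_ioc(struct toy_ioc *ioc) { ioc->refcount--; }

int main(void)
{
        struct toy_ioc ioc = { .refcount = 1 };

        toy_get_ioc(&ioc);      /* queue becomes shared: pin the io_context */
        printf("after merge: %ld\n", ioc.refcount);       /* 2 */
        toy_put_ioc(&ioc);      /* split, or exit_icq on a coop queue */
        printf("after split/exit: %ld\n", ioc.refcount);  /* 1 */
        return 0;
}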
6673 +@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
6674 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
6675 + return;
6676 +
6677 ++ /* Idle window just restored, statistics are meaningless. */
6678 ++ if (bfq_bfqq_just_split(bfqq))
6679 ++ return;
6680 ++
6681 + enable_idle = bfq_bfqq_idle_window(bfqq);
6682 +
6683 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
6684 +@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6685 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
6686 + !BFQQ_SEEKY(bfqq))
6687 + bfq_update_idle_window(bfqd, bfqq, bic);
6688 ++ bfq_clear_bfqq_just_split(bfqq);
6689 +
6690 + bfq_log_bfqq(bfqd, bfqq,
6691 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
6692 +@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6693 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
6694 + {
6695 + struct bfq_data *bfqd = q->elevator->elevator_data;
6696 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
6697 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
6698 +
6699 + assert_spin_locked(bfqd->queue->queue_lock);
6700 ++
6701 ++ /*
6702 ++ * An unplug may trigger a requeue of a request from the device
6703 ++ * driver: make sure we are in process context while trying to
6704 ++ * merge two bfq_queues.
6705 ++ */
6706 ++ if (!in_interrupt() &&
6707 ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) {
6708 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
6709 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
6710 ++ /*
6711 ++ * Release the request's reference to the old bfqq
6712 ++ * and take a new one on the shared queue instead.
6713 ++ */
6714 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
6715 ++ bfqq->allocated[rq_data_dir(rq)]--;
6716 ++ atomic_inc(&new_bfqq->ref);
6717 ++ bfq_put_queue(bfqq);
6718 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
6719 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq);
6720 ++ rq->elv.priv[1] = new_bfqq;
6721 ++ bfqq = new_bfqq;
6722 ++ }
6723 ++
6724 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
6725 +
6726 + bfq_add_rq_rb(rq);
6727 +
6728 ++ /*
6729 ++ * Here a newly-created bfq_queue has already started a weight-raising
6730 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
6731 ++ * from assigning it a full weight-raising period. See the detailed
6732 ++ * comments about this field in bfq_init_icq().
6733 ++ */
6734 ++ if (bfqq->bic != NULL)
6735 ++ bfqq->bic->raising_time_left = 0;
6736 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
6737 + list_add_tail(&rq->queuelist, &bfqq->fifo);
6738 +
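The bookkeeping performed above when an inserted request is redirected to the shared queue amounts to moving one per-direction allocation count and one queue reference from the old queue to the new one; a stand-alone sketch (toy types, not the kernel structures):

#include <stdio.h>

/*
 * Toy model (not part of the patch) of redirecting a request's accounting
 * from its original queue to a shared one: the allocation count follows the
 * request, and a queue reference is moved from the old queue to the new one.
 */
enum { TOY_READ, TOY_WRITE };

struct toy_queue {
        int allocated[2];
        int ref;
};

static void toy_redirect_request(struct toy_queue *from, struct toy_queue *to,
                                 int dir)
{
        to->allocated[dir]++;   /* the request is now accounted to "to" */
        from->allocated[dir]--;
        to->ref++;              /* rq->elv.priv[1] now pins the new queue... */
        from->ref--;            /* ...and drops its reference on the old one */
}

int main(void)
{
        struct toy_queue orig = { .allocated = { 1, 0 }, .ref = 3 };
        struct toy_queue shared = { .allocated = { 0, 0 }, .ref = 2 };

        toy_redirect_request(&orig, &shared, TOY_READ);
        printf("orig:   allocated[R]=%d ref=%d\n", orig.allocated[TOY_READ], orig.ref);
        printf("shared: allocated[R]=%d ref=%d\n", shared.allocated[TOY_READ], shared.ref);
        return 0;
}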
6739 +@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
6740 + if (bfq_bfqq_budget_new(bfqq))
6741 + bfq_set_budget_timeout(bfqd);
6742 +
6743 +- /* Idling is disabled also for cooperation issues:
6744 +- * 1) there is a close cooperator for the queue, or
6745 +- * 2) the queue is shared and some cooperator is likely
6746 +- * to be idle (in this case, by not arming the idle timer,
6747 +- * we try to slow down the queue, to prevent the zones
6748 +- * of the disk accessed by the active cooperators to become
6749 +- * too distant from the zone that will be accessed by the
6750 +- * currently idle cooperators)
6751 +- */
6752 + if (bfq_bfqq_must_idle(bfqq, budg_timeout))
6753 + bfq_arm_slice_timer(bfqd);
6754 + else if (budg_timeout)
6755 +@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq)
6756 + }
6757 + }
6758 +
6759 +-static struct bfq_queue *
6760 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
6761 +- struct bfq_queue *bfqq)
6762 +-{
6763 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
6764 +- (long unsigned)bfqq->new_bfqq->pid);
6765 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
6766 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
6767 +- bfq_put_queue(bfqq);
6768 +- return bic_to_bfqq(bic, 1);
6769 +-}
6770 +-
6771 + /*
6772 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
6773 + * was the last process referring to said bfqq.
6774 +@@ -2469,6 +2702,9 @@ static struct bfq_queue *
6775 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
6776 + {
6777 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
6778 ++
6779 ++ put_io_context(bic->icq.ioc);
6780 ++
6781 + if (bfqq_process_refs(bfqq) == 1) {
6782 + bfqq->pid = current->pid;
6783 + bfq_clear_bfqq_some_coop_idle(bfqq);
6784 +@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
6785 + struct bfq_queue *bfqq;
6786 + struct bfq_group *bfqg;
6787 + unsigned long flags;
6788 ++ bool split = false;
6789 +
6790 + might_sleep_if(gfp_mask & __GFP_WAIT);
6791 +
6792 +@@ -2516,24 +2753,14 @@ new_queue:
6793 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
6794 + bic_set_bfqq(bic, bfqq, is_sync);
6795 + } else {
6796 +- /*
6797 +- * If the queue was seeky for too long, break it apart.
6798 +- */
6799 ++ /* If the queue was seeky for too long, break it apart. */
6800 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
6801 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
6802 + bfqq = bfq_split_bfqq(bic, bfqq);
6803 ++ split = true;
6804 + if (!bfqq)
6805 + goto new_queue;
6806 + }
6807 +-
6808 +- /*
6809 +- * Check to see if this queue is scheduled to merge with
6810 +- * another closely cooperating queue. The merging of queues
6811 +- * happens here as it must be done in process context.
6812 +- * The reference on new_bfqq was taken in merge_bfqqs.
6813 +- */
6814 +- if (bfqq->new_bfqq != NULL)
6815 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
6816 + }
6817 +
6818 + bfqq->allocated[rw]++;
6819 +@@ -2544,6 +2771,26 @@ new_queue:
6820 + rq->elv.priv[0] = bic;
6821 + rq->elv.priv[1] = bfqq;
6822 +
6823 ++ /*
6824 ++ * If a bfq_queue has only one process reference, it is owned
6825 ++ * by only one bfq_io_cq: we can set the bic field of the
6826 ++ * bfq_queue to the address of that structure. Also, if the
6827 ++ * queue has just been split, mark a flag so that the
6828 ++ * information is available to the other scheduler hooks.
6829 ++ */
6830 ++ if (bfqq_process_refs(bfqq) == 1) {
6831 ++ bfqq->bic = bic;
6832 ++ if (split) {
6833 ++ bfq_mark_bfqq_just_split(bfqq);
6834 ++ /*
6835 ++ * If the queue has just been split from a shared queue,
6836 ++ * restore the idle window and the possible weight
6837 ++ * raising period.
6838 ++ */
6839 ++ bfq_bfqq_resume_state(bfqq, bic);
6840 ++ }
6841 ++ }
6842 ++
6843 + spin_unlock_irqrestore(q->queue_lock, flags);
6844 +
6845 + return 0;
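A rough stand-alone model of the post-split handling added to bfq_set_request() above (the restore step is simplified and the toy_* names are hypothetical, since bfq_bfqq_resume_state() itself is defined earlier in the patch): once the queue is back to a single process reference it is re-bound to its bic, and a just-split queue gets its saved idle window and raising state back.

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch (not part of the patch) of the post-split handling: a queue with a
 * single process reference can be bound to one bic again, and if it has just
 * been split the saved idle-window and weight-raising state are restored.
 */
struct toy_bic {
        unsigned int raising_time_left;
        bool saved_idle_window;
};

struct toy_bfqq {
        int process_refs;
        bool just_split;
        bool idle_window;
        unsigned int raising_time_left;
        struct toy_bic *bic;
};

static void toy_resume_state(struct toy_bfqq *q, struct toy_bic *bic)
{
        /* simplified stand-in for bfq_bfqq_resume_state() */
        q->idle_window = bic->saved_idle_window;
        q->raising_time_left = bic->raising_time_left;
}

static void toy_after_split(struct toy_bfqq *q, struct toy_bic *bic, bool split)
{
        if (q->process_refs != 1)
                return;         /* still shared: leave q->bic == NULL */
        q->bic = bic;
        if (split) {
                q->just_split = true;
                toy_resume_state(q, bic);
        }
}

int main(void)
{
        struct toy_bic bic = { .raising_time_left = 4000, .saved_idle_window = true };
        struct toy_bfqq q = { .process_refs = 1 };

        toy_after_split(&q, &bic, true);
        printf("idle_window=%d raising_time_left=%u just_split=%d\n",
               q.idle_window, q.raising_time_left, q.just_split);
        return 0;
}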
6846 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
6847 +index 03f8061..a0edaa2 100644
6848 +--- a/block/bfq-sched.c
6849 ++++ b/block/bfq-sched.c
6850 +@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
6851 + return bfqq;
6852 + }
6853 +
6854 +-/*
6855 +- * Forced extraction of the given queue.
6856 +- */
6857 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
6858 +- struct bfq_queue *bfqq)
6859 +-{
6860 +- struct bfq_entity *entity;
6861 +- struct bfq_sched_data *sd;
6862 +-
6863 +- BUG_ON(bfqd->active_queue != NULL);
6864 +-
6865 +- entity = &bfqq->entity;
6866 +- /*
6867 +- * Bubble up extraction/update from the leaf to the root.
6868 +- */
6869 +- for_each_entity(entity) {
6870 +- sd = entity->sched_data;
6871 +- bfq_update_budget(entity);
6872 +- bfq_update_vtime(bfq_entity_service_tree(entity));
6873 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
6874 +- sd->active_entity = entity;
6875 +- sd->next_active = NULL;
6876 +- entity->service = 0;
6877 +- }
6878 +-
6879 +- return;
6880 +-}
6881 +-
6882 + static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
6883 + {
6884 + if (bfqd->active_bic != NULL) {
6885 +diff --git a/block/bfq.h b/block/bfq.h
6886 +index b4e9ab0..ca5b444 100644
6887 +--- a/block/bfq.h
6888 ++++ b/block/bfq.h
6889 +@@ -188,6 +188,8 @@ struct bfq_group;
6890 + * @pid: pid of the process owning the queue, used for logging purposes.
6891 + * @last_rais_start_time: last (idle -> weight-raised) transition attempt
6892 + * @raising_cur_max_time: current max raising time for this queue
6893 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
6894 ++ * queue is shared
6895 + *
6896 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
6897 + * or more (if it is an async one). @cgroup holds a reference to the
6898 +@@ -231,6 +233,7 @@ struct bfq_queue {
6899 + sector_t last_request_pos;
6900 +
6901 + pid_t pid;
6902 ++ struct bfq_io_cq *bic;
6903 +
6904 + /* weight-raising fields */
6905 + unsigned int raising_cur_max_time;
6906 +@@ -257,12 +260,23 @@ struct bfq_ttime {
6907 + * @icq: associated io_cq structure
6908 + * @bfqq: array of two process queues, the sync and the async
6909 + * @ttime: associated @bfq_ttime struct
6910 ++ * @raising_time_left: snapshot of the time left before weight raising ends
6911 ++ * for the sync queue associated to this process; this
6912 ++ * snapshot is taken to remember this value while the weight
6913 ++ * raising is suspended because the queue is merged with a
6914 ++ * shared queue, and is used to set @raising_cur_max_time
6915 ++ * when the queue is split from the shared queue and its
6916 ++ * weight is raised again
6917 ++ * @saved_idle_window: same purpose as the previous field for the idle window
6918 + */
6919 + struct bfq_io_cq {
6920 + struct io_cq icq; /* must be the first member */
6921 + struct bfq_queue *bfqq[2];
6922 + struct bfq_ttime ttime;
6923 + int ioprio;
6924 ++
6925 ++ unsigned int raising_time_left;
6926 ++ unsigned int saved_idle_window;
6927 + };
6928 +
6929 + /**
6930 +@@ -403,6 +417,7 @@ enum bfqq_state_flags {
6931 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6932 + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
6933 + BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
6934 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
6935 + };
6936 +
6937 + #define BFQ_BFQQ_FNS(name) \
6938 +@@ -430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new);
6939 + BFQ_BFQQ_FNS(coop);
6940 + BFQ_BFQQ_FNS(split_coop);
6941 + BFQ_BFQQ_FNS(some_coop_idle);
6942 ++BFQ_BFQQ_FNS(just_split);
6943 + #undef BFQ_BFQQ_FNS
6944 +
6945 + /* Logging facilities. */
6946 +--
6947 +1.8.1.4
6948 +
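The new just_split flag is wired in through BFQ_BFQQ_FNS(just_split); the macro body lies outside this hunk, but the call sites above (bfq_mark_bfqq_just_split, bfq_clear_bfqq_just_split, bfq_bfqq_just_split) suggest set/clear/test helpers over a per-queue flags word, along these lines (stand-alone sketch, names prefixed toy_ to mark them as illustrative):

#include <stdio.h>

/*
 * Hedged sketch of the flag helpers implied by BFQ_BFQQ_FNS(just_split):
 * one mark, one clear and one test helper over a per-queue flags word.
 */
enum { TOY_FLAG_just_split = 1 };

struct toy_bfqq {
        unsigned int flags;
};

#define TOY_BFQQ_FNS(name)                                              \
static void toy_mark_bfqq_##name(struct toy_bfqq *bfqq)                 \
{                                                                       \
        bfqq->flags |= (1 << TOY_FLAG_##name);                          \
}                                                                       \
static void toy_clear_bfqq_##name(struct toy_bfqq *bfqq)                \
{                                                                       \
        bfqq->flags &= ~(1 << TOY_FLAG_##name);                         \
}                                                                       \
static int toy_bfqq_##name(const struct toy_bfqq *bfqq)                 \
{                                                                       \
        return (bfqq->flags & (1 << TOY_FLAG_##name)) != 0;             \
}

TOY_BFQQ_FNS(just_split)
#undef TOY_BFQQ_FNS

int main(void)
{
        struct toy_bfqq q = { 0 };

        toy_mark_bfqq_just_split(&q);
        printf("%d\n", toy_bfqq_just_split(&q));        /* 1 */
        toy_clear_bfqq_just_split(&q);
        printf("%d\n", toy_bfqq_just_split(&q));        /* 0 */
        return 0;
}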