Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.3 commit in: /
Date: Fri, 06 Nov 2015 00:24:32
Message-Id: 1446769452.2f6cc0b28617fe4c94729933e6aef823e8cd9773.mpagano@gentoo
1 commit: 2f6cc0b28617fe4c94729933e6aef823e8cd9773
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Fri Nov 6 00:24:12 2015 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Fri Nov 6 00:24:12 2015 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=2f6cc0b2
7
8 BFQ Patches v7r8.
9
10 0000_README | 13 +
11 ...roups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch | 104 +
12 ...introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1 | 6952 ++++++++++++++++++++
13 ...Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch | 1220 ++++
14 4 files changed, 8289 insertions(+)
15
16 diff --git a/0000_README b/0000_README
17 index 8e70e78..4c2a487 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -71,6 +71,19 @@ Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
21 From: https://github.com/graysky2/kernel_gcc_patch/
22 Desc: Kernel patch enables gcc < v4.9 optimizations for additional CPUs.
23
24 +Patch: 5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch
25 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
26 +Desc: BFQ v7r8 patch 1 for 4.3: Build, cgroups and kconfig bits
27 +
28 +Patch: 5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1
29 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
30 +Desc: BFQ v7r8 patch 2 for 4.3: BFQ Scheduler
31 +
32 +Patch: 5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch
33 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
34 +Desc: BFQ v7r8 patch 3 for 4.3: Early Queue Merge (EQM)
35 +
36 Patch: 5010_enable-additional-cpu-optimizations-for-gcc-4.9.patch
37 From: https://github.com/graysky2/kernel_gcc_patch/
38 Desc: Kernel patch enables gcc >= v4.9 optimizations for additional CPUs.
39 +
40
41 diff --git a/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch
42 new file mode 100644
43 index 0000000..76440b8
44 --- /dev/null
45 +++ b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.3.patch
46 @@ -0,0 +1,104 @@
47 +From 6a88d12f19b7c5578cf5d17a5e61fb0af75fa0d7 Mon Sep 17 00:00:00 2001
48 +From: Paolo Valente <paolo.valente@×××××××.it>
49 +Date: Tue, 7 Apr 2015 13:39:12 +0200
50 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r8-4.3
51 +
52 +Update Kconfig.iosched and do the related Makefile changes to include
53 +kernel configuration options for BFQ. Also add the bfqio controller
54 +to the cgroups subsystem.
55 +
56 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
57 +Signed-off-by: Arianna Avanzini <avanzini@××××××.com>
58 +---
59 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
60 + block/Makefile | 1 +
61 + include/linux/cgroup_subsys.h | 4 ++++
62 + 3 files changed, 37 insertions(+)
63 +
64 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
65 +index 421bef9..0ee5f0f 100644
66 +--- a/block/Kconfig.iosched
67 ++++ b/block/Kconfig.iosched
68 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
69 + ---help---
70 + Enable group IO scheduling in CFQ.
71 +
72 ++config IOSCHED_BFQ
73 ++ tristate "BFQ I/O scheduler"
74 ++ default n
75 ++ ---help---
76 ++ The BFQ I/O scheduler tries to distribute bandwidth among
77 ++ all processes according to their weights.
78 ++ It aims at distributing the bandwidth as desired, independently of
79 ++ the disk parameters and with any workload. It also tries to
80 ++ guarantee low latency to interactive and soft real-time
81 ++ applications. If compiled built-in (saying Y here), BFQ can
82 ++ be configured to support hierarchical scheduling.
83 ++
84 ++config CGROUP_BFQIO
85 ++ bool "BFQ hierarchical scheduling support"
86 ++ depends on CGROUPS && IOSCHED_BFQ=y
87 ++ default n
88 ++ ---help---
89 ++ Enable hierarchical scheduling in BFQ, using the cgroups
90 ++ filesystem interface. The name of the subsystem will be
91 ++ bfqio.
92 ++
93 + choice
94 + prompt "Default I/O scheduler"
95 + default DEFAULT_CFQ
96 +@@ -52,6 +73,16 @@ choice
97 + config DEFAULT_CFQ
98 + bool "CFQ" if IOSCHED_CFQ=y
99 +
100 ++ config DEFAULT_BFQ
101 ++ bool "BFQ" if IOSCHED_BFQ=y
102 ++ help
103 ++ Selects BFQ as the default I/O scheduler which will be
104 ++ used by default for all block devices.
105 ++ The BFQ I/O scheduler aims at distributing the bandwidth
106 ++ as desired, independently of the disk parameters and with
107 ++ any workload. It also tries to guarantee low latency to
108 ++ interactive and soft real-time applications.
109 ++
110 + config DEFAULT_NOOP
111 + bool "No-op"
112 +
113 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
114 + string
115 + default "deadline" if DEFAULT_DEADLINE
116 + default "cfq" if DEFAULT_CFQ
117 ++ default "bfq" if DEFAULT_BFQ
118 + default "noop" if DEFAULT_NOOP
119 +
120 + endmenu
121 +diff --git a/block/Makefile b/block/Makefile
122 +index 00ecc97..1ed86d5 100644
123 +--- a/block/Makefile
124 ++++ b/block/Makefile
125 +@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
126 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
127 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
128 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
129 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
130 +
131 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
132 + obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
133 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
134 +index 1a96fda..81ad8a0 100644
135 +--- a/include/linux/cgroup_subsys.h
136 ++++ b/include/linux/cgroup_subsys.h
137 +@@ -46,6 +46,10 @@ SUBSYS(freezer)
138 + SUBSYS(net_cls)
139 + #endif
140 +
141 ++#if IS_ENABLED(CONFIG_CGROUP_BFQIO)
142 ++SUBSYS(bfqio)
143 ++#endif
144 ++
145 + #if IS_ENABLED(CONFIG_CGROUP_PERF)
146 + SUBSYS(perf_event)
147 + #endif
148 +--
149 +1.9.1
150 +
151
152 diff --git a/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1 b/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1
153 new file mode 100644
154 index 0000000..43196b2
155 --- /dev/null
156 +++ b/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.3.patch1
157 @@ -0,0 +1,6952 @@
158 +From ec474da4d0e3f9eb7860496802b6693333687bb5 Mon Sep 17 00:00:00 2001
159 +From: Paolo Valente <paolo.valente@×××××××.it>
160 +Date: Thu, 9 May 2013 19:10:02 +0200
161 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r8 I/O sched for 4.3
162 +
163 +Add the BFQ-v7r8 I/O scheduler to 4.3.
164 +The general structure is borrowed from CFQ, as is much of the code for
165 +handling I/O contexts. Over time, several useful features have been
166 +ported from CFQ as well (details in the changelog in README.BFQ). A
167 +(bfq_)queue is associated with each task doing I/O on a device, and each
168 +time a scheduling decision has to be made, a queue is selected and served
169 +until it expires.
170 +
171 + - Slices are given in the service domain: tasks are assigned
172 + budgets, measured in number of sectors. Once granted the disk, a task
173 + must, however, consume its assigned budget within a configurable
174 + maximum time (by default, the maximum possible value of the
175 + budgets is automatically computed to comply with this timeout).
176 + This allows the desired latency vs "throughput boosting" tradeoff
177 + to be set.
178 +
179 + - Budgets are scheduled according to a variant of WF2Q+, implemented
180 + using an augmented rb-tree to take eligibility into account while
181 + preserving an O(log N) overall complexity.
182 +
183 + - A low-latency tunable is provided; if enabled, both interactive
184 + and soft real-time applications are guaranteed a very low latency.
185 +
186 + - Latency guarantees are also preserved in the presence of NCQ.
187 +
188 + - With flash-based devices, too, a high throughput is achieved
189 + while still preserving latency guarantees.
190 +
191 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
192 + cooperating-queue-merging and the preemption mechanisms present
193 + in CFQ. EQM is in fact a unified mechanism that tries to get a
194 + sequential read pattern, and hence a high throughput, with any
195 + set of processes performing interleaved I/O over a contiguous
196 + sequence of sectors.
197 +
198 + - BFQ supports full hierarchical scheduling, exporting a cgroups
199 + interface. Since each node has a full scheduler, each group can
200 + be assigned its own weight.
201 +
202 + - If the cgroups interface is not used, only I/O priorities can be
203 + assigned to processes, with ioprio values mapped to weights
204 + via the relation weight = IOPRIO_BE_NR - ioprio (illustrated below).
205 +
206 + - ioprio classes are served in strict priority order, i.e.,
207 + lower-priority queues are not served as long as there are
208 + higher-priority queues. Among queues in the same class, bandwidth
209 + is distributed in proportion to the weight of each queue. A small
210 + amount of extra bandwidth is, however, guaranteed to the Idle
211 + class, to prevent it from starving.
212 +
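
A minimal userspace sketch of the ioprio-to-weight mapping described in the list above (not part of the patch; IOPRIO_BE_NR == 8 is assumed here, the usual number of best-effort priority levels):

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* assumed value; matches mainline ioprio.h */

/* The relation stated above: weight = IOPRIO_BE_NR - ioprio. */
static int ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

int main(void)
{
	int ioprio;

	/* ioprio 0 (highest priority) maps to weight 8, ioprio 7 to weight 1 */
	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d\n", ioprio, ioprio_to_weight(ioprio));
	return 0;
}
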
213 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
214 +Signed-off-by: Arianna Avanzini <avanzini@××××××.com>
215 +---
216 + block/bfq-cgroup.c | 936 +++++++++++++
217 + block/bfq-ioc.c | 36 +
218 + block/bfq-iosched.c | 3898 +++++++++++++++++++++++++++++++++++++++++++++++++++
219 + block/bfq-sched.c | 1208 ++++++++++++++++
220 + block/bfq.h | 771 ++++++++++
221 + 5 files changed, 6849 insertions(+)
222 + create mode 100644 block/bfq-cgroup.c
223 + create mode 100644 block/bfq-ioc.c
224 + create mode 100644 block/bfq-iosched.c
225 + create mode 100644 block/bfq-sched.c
226 + create mode 100644 block/bfq.h
227 +
228 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
229 +new file mode 100644
230 +index 0000000..11e2f1d
231 +--- /dev/null
232 ++++ b/block/bfq-cgroup.c
233 +@@ -0,0 +1,936 @@
234 ++/*
235 ++ * BFQ: CGROUPS support.
236 ++ *
237 ++ * Based on ideas and code from CFQ:
238 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
239 ++ *
240 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
241 ++ * Paolo Valente <paolo.valente@×××××××.it>
242 ++ *
243 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
244 ++ *
245 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
246 ++ * file.
247 ++ */
248 ++
249 ++#ifdef CONFIG_CGROUP_BFQIO
250 ++
251 ++static DEFINE_MUTEX(bfqio_mutex);
252 ++
253 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
254 ++{
255 ++ return bgrp ? !bgrp->online : false;
256 ++}
257 ++
258 ++static struct bfqio_cgroup bfqio_root_cgroup = {
259 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
260 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
261 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
262 ++};
263 ++
264 ++static inline void bfq_init_entity(struct bfq_entity *entity,
265 ++ struct bfq_group *bfqg)
266 ++{
267 ++ entity->weight = entity->new_weight;
268 ++ entity->orig_weight = entity->new_weight;
269 ++ entity->ioprio = entity->new_ioprio;
270 ++ entity->ioprio_class = entity->new_ioprio_class;
271 ++ entity->parent = bfqg->my_entity;
272 ++ entity->sched_data = &bfqg->sched_data;
273 ++}
274 ++
275 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
276 ++{
277 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
278 ++}
279 ++
280 ++/*
281 ++ * Search the hash table (for now just a list) of bgrp for the bfq_group
282 ++ * associated with bfqd. Must be called under rcu_read_lock().
283 ++ */
284 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
285 ++ struct bfq_data *bfqd)
286 ++{
287 ++ struct bfq_group *bfqg;
288 ++ void *key;
289 ++
290 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
291 ++ key = rcu_dereference(bfqg->bfqd);
292 ++ if (key == bfqd)
293 ++ return bfqg;
294 ++ }
295 ++
296 ++ return NULL;
297 ++}
298 ++
299 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
300 ++ struct bfq_group *bfqg)
301 ++{
302 ++ struct bfq_entity *entity = &bfqg->entity;
303 ++
304 ++ /*
305 ++ * If the weight of the entity has never been set via the sysfs
306 ++ * interface, then bgrp->weight == 0. In this case we initialize
307 ++ * the weight from the current ioprio value. Otherwise, the group
308 ++ * weight, if set, has priority over the ioprio value.
309 ++ */
310 ++ if (bgrp->weight == 0) {
311 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
312 ++ entity->new_ioprio = bgrp->ioprio;
313 ++ } else {
314 ++ if (bgrp->weight < BFQ_MIN_WEIGHT ||
315 ++ bgrp->weight > BFQ_MAX_WEIGHT) {
316 ++ printk(KERN_CRIT "bfq_group_init_entity: "
317 ++ "bgrp->weight %d\n", bgrp->weight);
318 ++ BUG();
319 ++ }
320 ++ entity->new_weight = bgrp->weight;
321 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
322 ++ }
323 ++ entity->orig_weight = entity->weight = entity->new_weight;
324 ++ entity->ioprio = entity->new_ioprio;
325 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
326 ++ entity->my_sched_data = &bfqg->sched_data;
327 ++ bfqg->active_entities = 0;
328 ++}
329 ++
330 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
331 ++ struct bfq_group *parent)
332 ++{
333 ++ struct bfq_entity *entity;
334 ++
335 ++ BUG_ON(parent == NULL);
336 ++ BUG_ON(bfqg == NULL);
337 ++
338 ++ entity = &bfqg->entity;
339 ++ entity->parent = parent->my_entity;
340 ++ entity->sched_data = &parent->sched_data;
341 ++}
342 ++
343 ++/**
344 ++ * bfq_group_chain_alloc - allocate a chain of groups.
345 ++ * @bfqd: queue descriptor.
346 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
347 ++ *
348 ++ * Allocate a chain of groups starting from the one belonging to
349 ++ * @css up to the root cgroup. Stop if a cgroup on the chain
350 ++ * to the root already has an allocated group on @bfqd.
351 ++ */
352 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
353 ++ struct cgroup_subsys_state *css)
354 ++{
355 ++ struct bfqio_cgroup *bgrp;
356 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
357 ++
358 ++ for (; css != NULL; css = css->parent) {
359 ++ bgrp = css_to_bfqio(css);
360 ++
361 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
362 ++ if (bfqg != NULL) {
363 ++ /*
364 ++ * All the cgroups in the path from there to the
365 ++ * root must have a bfq_group for bfqd, so we don't
366 ++ * need any more allocations.
367 ++ */
368 ++ break;
369 ++ }
370 ++
371 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
372 ++ if (bfqg == NULL)
373 ++ goto cleanup;
374 ++
375 ++ bfq_group_init_entity(bgrp, bfqg);
376 ++ bfqg->my_entity = &bfqg->entity;
377 ++
378 ++ if (leaf == NULL) {
379 ++ leaf = bfqg;
380 ++ prev = leaf;
381 ++ } else {
382 ++ bfq_group_set_parent(prev, bfqg);
383 ++ /*
384 ++ * Build a list of allocated nodes using the bfqd
385 ++ * field, which is still unused and will be
386 ++ * initialized only after the node is
387 ++ * connected.
388 ++ */
389 ++ prev->bfqd = bfqg;
390 ++ prev = bfqg;
391 ++ }
392 ++ }
393 ++
394 ++ return leaf;
395 ++
396 ++cleanup:
397 ++ while (leaf != NULL) {
398 ++ prev = leaf;
399 ++ leaf = leaf->bfqd;
400 ++ kfree(prev);
401 ++ }
402 ++
403 ++ return NULL;
404 ++}
405 ++
406 ++/**
407 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup
408 ++ * hierarchy.
409 ++ * @bfqd: the queue descriptor.
410 ++ * @css: the leaf cgroup_subsys_state to start from.
411 ++ * @leaf: the leaf group (to be associated with @css).
412 ++ *
413 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
414 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
415 ++ * hierarchy that already has a group associated with @bfqd, all the nodes
416 ++ * in the path to the root cgroup have one too.
417 ++ *
418 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
419 ++ * per device) while the bfqio_cgroup lock protects the list of groups
420 ++ * belonging to the same cgroup.
421 ++ */
422 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
423 ++ struct cgroup_subsys_state *css,
424 ++ struct bfq_group *leaf)
425 ++{
426 ++ struct bfqio_cgroup *bgrp;
427 ++ struct bfq_group *bfqg, *next, *prev = NULL;
428 ++ unsigned long flags;
429 ++
430 ++ assert_spin_locked(bfqd->queue->queue_lock);
431 ++
432 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
433 ++ bgrp = css_to_bfqio(css);
434 ++ next = leaf->bfqd;
435 ++
436 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
437 ++ BUG_ON(bfqg != NULL);
438 ++
439 ++ spin_lock_irqsave(&bgrp->lock, flags);
440 ++
441 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
442 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
443 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
444 ++
445 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
446 ++
447 ++ prev = leaf;
448 ++ leaf = next;
449 ++ }
450 ++
451 ++ BUG_ON(css == NULL && leaf != NULL);
452 ++ if (css != NULL && prev != NULL) {
453 ++ bgrp = css_to_bfqio(css);
454 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
455 ++ bfq_group_set_parent(prev, bfqg);
456 ++ }
457 ++}
458 ++
459 ++/**
460 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
461 ++ * @bfqd: queue descriptor.
462 ++ * @css: cgroup_subsys_state of the cgroup being searched for.
463 ++ *
464 ++ * Return a group associated with @bfqd in @css's cgroup, allocating one
465 ++ * if necessary. When a group is returned, all the cgroups in the path
466 ++ * to the root have a group associated with @bfqd.
467 ++ *
468 ++ * If the allocation fails, return the root group: this breaks guarantees
469 ++ * but is a safe fallback. If this loss becomes a problem it can be
470 ++ * mitigated using the equivalent weight (given by the product of the
471 ++ * weights of the groups in the path from @group to the root) in the
472 ++ * root scheduler.
473 ++ *
474 ++ * We allocate all the missing nodes in the path from the leaf cgroup
475 ++ * to the root and we connect the nodes only after all the allocations
476 ++ * have been successful.
477 ++ */
478 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
479 ++ struct cgroup_subsys_state *css)
480 ++{
481 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
482 ++ struct bfq_group *bfqg;
483 ++
484 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
485 ++ if (bfqg != NULL)
486 ++ return bfqg;
487 ++
488 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
489 ++ if (bfqg != NULL)
490 ++ bfq_group_chain_link(bfqd, css, bfqg);
491 ++ else
492 ++ bfqg = bfqd->root_group;
493 ++
494 ++ return bfqg;
495 ++}
496 ++
497 ++/**
498 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
499 ++ * @bfqd: queue descriptor.
500 ++ * @bfqq: the queue to move.
501 ++ * @entity: @bfqq's entity.
502 ++ * @bfqg: the group to move to.
503 ++ *
504 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
505 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
506 ++ *
507 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
508 ++ * not disappear (for now this just means that we are called under
509 ++ * rcu_read_lock()).
510 ++ */
511 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
512 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
513 ++{
514 ++ int busy, resume;
515 ++
516 ++ busy = bfq_bfqq_busy(bfqq);
517 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
518 ++
519 ++ BUG_ON(resume && !entity->on_st);
520 ++ BUG_ON(busy && !resume && entity->on_st &&
521 ++ bfqq != bfqd->in_service_queue);
522 ++
523 ++ if (busy) {
524 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
525 ++
526 ++ if (!resume)
527 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
528 ++ else
529 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
530 ++ } else if (entity->on_st)
531 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
532 ++
533 ++ /*
534 ++ * Here we use a reference to bfqg. We don't need a refcounter
535 ++ * as the cgroup reference will not be dropped, so that its
536 ++ * destroy() callback will not be invoked.
537 ++ */
538 ++ entity->parent = bfqg->my_entity;
539 ++ entity->sched_data = &bfqg->sched_data;
540 ++
541 ++ if (busy && resume)
542 ++ bfq_activate_bfqq(bfqd, bfqq);
543 ++
544 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
545 ++ bfq_schedule_dispatch(bfqd);
546 ++}
547 ++
548 ++/**
549 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
550 ++ * @bfqd: the queue descriptor.
551 ++ * @bic: the bic to move.
552 ++ * @cgroup: the cgroup to move to.
553 ++ *
554 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
555 ++ * has to make sure that the reference to cgroup is valid across the call.
556 ++ *
557 ++ * NOTE: an alternative approach might have been to store the current
558 ++ * cgroup in bfqq and get a reference to it, reducing the lookup
559 ++ * time here, at the price of slightly more complex code.
560 ++ */
561 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
562 ++ struct bfq_io_cq *bic,
563 ++ struct cgroup_subsys_state *css)
564 ++{
565 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
566 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
567 ++ struct bfq_entity *entity;
568 ++ struct bfq_group *bfqg;
569 ++ struct bfqio_cgroup *bgrp;
570 ++
571 ++ bgrp = css_to_bfqio(css);
572 ++
573 ++ bfqg = bfq_find_alloc_group(bfqd, css);
574 ++ if (async_bfqq != NULL) {
575 ++ entity = &async_bfqq->entity;
576 ++
577 ++ if (entity->sched_data != &bfqg->sched_data) {
578 ++ bic_set_bfqq(bic, NULL, 0);
579 ++ bfq_log_bfqq(bfqd, async_bfqq,
580 ++ "bic_change_group: %p %d",
581 ++ async_bfqq, atomic_read(&async_bfqq->ref));
582 ++ bfq_put_queue(async_bfqq);
583 ++ }
584 ++ }
585 ++
586 ++ if (sync_bfqq != NULL) {
587 ++ entity = &sync_bfqq->entity;
588 ++ if (entity->sched_data != &bfqg->sched_data)
589 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
590 ++ }
591 ++
592 ++ return bfqg;
593 ++}
594 ++
595 ++/**
596 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
597 ++ * @bic: the bic being migrated.
598 ++ * @cgroup: the destination cgroup.
599 ++ *
600 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
601 ++ * moved into its new parent group.
602 ++ */
603 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
604 ++ struct cgroup_subsys_state *css)
605 ++{
606 ++ struct bfq_data *bfqd;
607 ++ unsigned long uninitialized_var(flags);
608 ++
609 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
610 ++ &flags);
611 ++ if (bfqd != NULL) {
612 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
613 ++ bfq_put_bfqd_unlock(bfqd, &flags);
614 ++ }
615 ++}
616 ++
617 ++/**
618 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
619 ++ * @bic: the @bic to update.
620 ++ *
621 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
622 ++ * We need this in addition to moving bics during the cgroup attach
623 ++ * phase because the task owning @bic could be at its first disk
624 ++ * access or we may end up in the root cgroup as the result of a
625 ++ * memory allocation failure and here we try to move to the right
626 ++ * group.
627 ++ *
628 ++ * Must be called under the queue lock. It is safe to use the returned
629 ++ * value even after the rcu_read_unlock() as the migration/destruction
630 ++ * paths act under the queue lock too. IOW it is impossible to race with
631 ++ * group migration/destruction and end up with an invalid group as:
632 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
633 ++ * has started execution, as current holds a reference to it,
634 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
635 ++ * migrated to a different cgroup] its attach() callback will have
636 ++ * taken care of removing all the references to the old cgroup data.
637 ++ */
638 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
639 ++{
640 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
641 ++ struct bfq_group *bfqg;
642 ++ struct cgroup_subsys_state *css;
643 ++
644 ++ BUG_ON(bfqd == NULL);
645 ++
646 ++ rcu_read_lock();
647 ++ css = task_css(current, bfqio_cgrp_id);
648 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
649 ++ rcu_read_unlock();
650 ++
651 ++ return bfqg;
652 ++}
653 ++
654 ++/**
655 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
656 ++ * @st: the service tree being flushed.
657 ++ */
658 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
659 ++{
660 ++ struct bfq_entity *entity = st->first_idle;
661 ++
662 ++ for (; entity != NULL; entity = st->first_idle)
663 ++ __bfq_deactivate_entity(entity, 0);
664 ++}
665 ++
666 ++/**
667 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
668 ++ * @bfqd: the device data structure with the root group.
669 ++ * @entity: the entity to move.
670 ++ */
671 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
672 ++ struct bfq_entity *entity)
673 ++{
674 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
675 ++
676 ++ BUG_ON(bfqq == NULL);
677 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
678 ++ return;
679 ++}
680 ++
681 ++/**
682 ++ * bfq_reparent_active_entities - move to the root group all active
683 ++ * entities.
684 ++ * @bfqd: the device data structure with the root group.
685 ++ * @bfqg: the group to move from.
686 ++ * @st: the service tree with the entities.
687 ++ *
688 ++ * Needs queue_lock to be taken and reference to be valid over the call.
689 ++ */
690 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
691 ++ struct bfq_group *bfqg,
692 ++ struct bfq_service_tree *st)
693 ++{
694 ++ struct rb_root *active = &st->active;
695 ++ struct bfq_entity *entity = NULL;
696 ++
697 ++ if (!RB_EMPTY_ROOT(&st->active))
698 ++ entity = bfq_entity_of(rb_first(active));
699 ++
700 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
701 ++ bfq_reparent_leaf_entity(bfqd, entity);
702 ++
703 ++ if (bfqg->sched_data.in_service_entity != NULL)
704 ++ bfq_reparent_leaf_entity(bfqd,
705 ++ bfqg->sched_data.in_service_entity);
706 ++
707 ++ return;
708 ++}
709 ++
710 ++/**
711 ++ * bfq_destroy_group - destroy @bfqg.
712 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
713 ++ * @bfqg: the group being destroyed.
714 ++ *
715 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
716 ++ */
717 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
718 ++{
719 ++ struct bfq_data *bfqd;
720 ++ struct bfq_service_tree *st;
721 ++ struct bfq_entity *entity = bfqg->my_entity;
722 ++ unsigned long uninitialized_var(flags);
723 ++ int i;
724 ++
725 ++ hlist_del(&bfqg->group_node);
726 ++
727 ++ /*
728 ++ * Empty all service_trees belonging to this group before
729 ++ * deactivating the group itself.
730 ++ */
731 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
732 ++ st = bfqg->sched_data.service_tree + i;
733 ++
734 ++ /*
735 ++ * The idle tree may still contain bfq_queues belonging
736 ++ * to exited tasks because they never migrated to a different
737 ++ * cgroup from the one being destroyed now. No one else
738 ++ * can access them so it's safe to act without any lock.
739 ++ */
740 ++ bfq_flush_idle_tree(st);
741 ++
742 ++ /*
743 ++ * It may happen that some queues are still active
744 ++ * (busy) upon group destruction (if the corresponding
745 ++ * processes have been forced to terminate). We move
746 ++ * all the leaf entities corresponding to these queues
747 ++ * to the root_group.
748 ++ * Also, it may happen that the group has an entity
749 ++ * in service, which is disconnected from the active
750 ++ * tree: it must be moved, too.
751 ++ * There is no need to put the sync queues, as the
752 ++ * scheduler has taken no reference.
753 ++ */
754 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
755 ++ if (bfqd != NULL) {
756 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
757 ++ bfq_put_bfqd_unlock(bfqd, &flags);
758 ++ }
759 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
760 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
761 ++ }
762 ++ BUG_ON(bfqg->sched_data.next_in_service != NULL);
763 ++ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
764 ++
765 ++ /*
766 ++ * We may race with device destruction, take extra care when
767 ++ * dereferencing bfqg->bfqd.
768 ++ */
769 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
770 ++ if (bfqd != NULL) {
771 ++ hlist_del(&bfqg->bfqd_node);
772 ++ __bfq_deactivate_entity(entity, 0);
773 ++ bfq_put_async_queues(bfqd, bfqg);
774 ++ bfq_put_bfqd_unlock(bfqd, &flags);
775 ++ }
776 ++ BUG_ON(entity->tree != NULL);
777 ++
778 ++ /*
779 ++ * No need to defer the kfree() to the end of the RCU grace
780 ++ * period: we are called from the destroy() callback of our
781 ++ * cgroup, so we can be sure that no one is a) still using
782 ++ * this cgroup or b) doing lookups in it.
783 ++ */
784 ++ kfree(bfqg);
785 ++}
786 ++
787 ++static void bfq_end_wr_async(struct bfq_data *bfqd)
788 ++{
789 ++ struct hlist_node *tmp;
790 ++ struct bfq_group *bfqg;
791 ++
792 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
793 ++ bfq_end_wr_async_queues(bfqd, bfqg);
794 ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
795 ++}
796 ++
797 ++/**
798 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
799 ++ * @bfqd: the device descriptor being exited.
800 ++ *
801 ++ * When the device exits we just make sure that no lookup can return
802 ++ * the now unused group structures. They will be deallocated on cgroup
803 ++ * destruction.
804 ++ */
805 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
806 ++{
807 ++ struct hlist_node *tmp;
808 ++ struct bfq_group *bfqg;
809 ++
810 ++ bfq_log(bfqd, "disconnect_groups beginning");
811 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
812 ++ hlist_del(&bfqg->bfqd_node);
813 ++
814 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
815 ++
816 ++ /*
817 ++ * Don't remove from the group hash, just set an
818 ++ * invalid key. No lookups can race with the
819 ++ * assignment as bfqd is being destroyed; this
820 ++ * also implies that new elements cannot be added
821 ++ * to the list.
822 ++ */
823 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
824 ++
825 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
826 ++ bfqg);
827 ++ bfq_put_async_queues(bfqd, bfqg);
828 ++ }
829 ++}
830 ++
831 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
832 ++{
833 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
834 ++ struct bfq_group *bfqg = bfqd->root_group;
835 ++
836 ++ bfq_put_async_queues(bfqd, bfqg);
837 ++
838 ++ spin_lock_irq(&bgrp->lock);
839 ++ hlist_del_rcu(&bfqg->group_node);
840 ++ spin_unlock_irq(&bgrp->lock);
841 ++
842 ++ /*
843 ++ * No need to synchronize_rcu() here: since the device is gone
844 ++ * there cannot be any read-side access to its root_group.
845 ++ */
846 ++ kfree(bfqg);
847 ++}
848 ++
849 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
850 ++{
851 ++ struct bfq_group *bfqg;
852 ++ struct bfqio_cgroup *bgrp;
853 ++ int i;
854 ++
855 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
856 ++ if (bfqg == NULL)
857 ++ return NULL;
858 ++
859 ++ bfqg->entity.parent = NULL;
860 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
861 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
862 ++
863 ++ bgrp = &bfqio_root_cgroup;
864 ++ spin_lock_irq(&bgrp->lock);
865 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
866 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
867 ++ spin_unlock_irq(&bgrp->lock);
868 ++
869 ++ return bfqg;
870 ++}
871 ++
872 ++#define SHOW_FUNCTION(__VAR) \
873 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
874 ++ struct cftype *cftype) \
875 ++{ \
876 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
877 ++ u64 ret = -ENODEV; \
878 ++ \
879 ++ mutex_lock(&bfqio_mutex); \
880 ++ if (bfqio_is_removed(bgrp)) \
881 ++ goto out_unlock; \
882 ++ \
883 ++ spin_lock_irq(&bgrp->lock); \
884 ++ ret = bgrp->__VAR; \
885 ++ spin_unlock_irq(&bgrp->lock); \
886 ++ \
887 ++out_unlock: \
888 ++ mutex_unlock(&bfqio_mutex); \
889 ++ return ret; \
890 ++}
891 ++
892 ++SHOW_FUNCTION(weight);
893 ++SHOW_FUNCTION(ioprio);
894 ++SHOW_FUNCTION(ioprio_class);
895 ++#undef SHOW_FUNCTION
896 ++
897 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
898 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
899 ++ struct cftype *cftype, \
900 ++ u64 val) \
901 ++{ \
902 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
903 ++ struct bfq_group *bfqg; \
904 ++ int ret = -EINVAL; \
905 ++ \
906 ++ if (val < (__MIN) || val > (__MAX)) \
907 ++ return ret; \
908 ++ \
909 ++ ret = -ENODEV; \
910 ++ mutex_lock(&bfqio_mutex); \
911 ++ if (bfqio_is_removed(bgrp)) \
912 ++ goto out_unlock; \
913 ++ ret = 0; \
914 ++ \
915 ++ spin_lock_irq(&bgrp->lock); \
916 ++ bgrp->__VAR = (unsigned short)val; \
917 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
918 ++ /* \
919 ++ * Setting the ioprio_changed flag of the entity \
920 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
921 ++ * the value of the weight to its ioprio mapping. \
922 ++ * Set the flag only if necessary. \
923 ++ */ \
924 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
925 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
926 ++ /* \
927 ++ * Make sure that the above new value has been \
928 ++ * stored in bfqg->entity.new_##__VAR before \
929 ++ * setting the ioprio_changed flag. In fact, \
930 ++ * this flag may be read asynchronously (in \
931 ++ * critical sections protected by a different \
932 ++ * lock than that held here), and finding this \
933 ++ * flag set may cause the execution of the code \
934 ++ * for updating parameters whose value may \
935 ++ * depend also on bfqg->entity.new_##__VAR (in \
936 ++ * __bfq_entity_update_weight_prio). \
937 ++ * This barrier makes sure that the new value \
938 ++ * of bfqg->entity.new_##__VAR is correctly \
939 ++ * seen in that code. \
940 ++ */ \
941 ++ smp_wmb(); \
942 ++ bfqg->entity.ioprio_changed = 1; \
943 ++ } \
944 ++ } \
945 ++ spin_unlock_irq(&bgrp->lock); \
946 ++ \
947 ++out_unlock: \
948 ++ mutex_unlock(&bfqio_mutex); \
949 ++ return ret; \
950 ++}
951 ++
952 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
953 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
954 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
955 ++#undef STORE_FUNCTION
956 ++
957 ++static struct cftype bfqio_files[] = {
958 ++ {
959 ++ .name = "weight",
960 ++ .read_u64 = bfqio_cgroup_weight_read,
961 ++ .write_u64 = bfqio_cgroup_weight_write,
962 ++ },
963 ++ {
964 ++ .name = "ioprio",
965 ++ .read_u64 = bfqio_cgroup_ioprio_read,
966 ++ .write_u64 = bfqio_cgroup_ioprio_write,
967 ++ },
968 ++ {
969 ++ .name = "ioprio_class",
970 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
971 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
972 ++ },
973 ++ { }, /* terminate */
974 ++};
975 ++
976 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
977 ++ *parent_css)
978 ++{
979 ++ struct bfqio_cgroup *bgrp;
980 ++
981 ++ if (parent_css != NULL) {
982 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
983 ++ if (bgrp == NULL)
984 ++ return ERR_PTR(-ENOMEM);
985 ++ } else
986 ++ bgrp = &bfqio_root_cgroup;
987 ++
988 ++ spin_lock_init(&bgrp->lock);
989 ++ INIT_HLIST_HEAD(&bgrp->group_data);
990 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
991 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
992 ++
993 ++ return &bgrp->css;
994 ++}
995 ++
996 ++/*
997 ++ * We cannot support shared io contexts, as we have no means to support
998 ++ * two tasks with the same ioc in two different groups without major rework
999 ++ * of the main bic/bfqq data structures. For now we allow a task to change
1000 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
1001 ++ * behavior is that a group containing a task that forked using CLONE_IO
1002 ++ * will not be destroyed until the tasks sharing the ioc die.
1003 ++ */
1004 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
1005 ++ struct cgroup_taskset *tset)
1006 ++{
1007 ++ struct task_struct *task;
1008 ++ struct io_context *ioc;
1009 ++ int ret = 0;
1010 ++
1011 ++ cgroup_taskset_for_each(task, tset) {
1012 ++ /*
1013 ++ * task_lock() is needed to avoid races with
1014 ++ * exit_io_context()
1015 ++ */
1016 ++ task_lock(task);
1017 ++ ioc = task->io_context;
1018 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1019 ++ /*
1020 ++ * ioc == NULL means that the task is either too
1021 ++ * young or exiting: if it still has no ioc, the
1022 ++ * ioc can't be shared; if the task is exiting, the
1023 ++ * attach will fail anyway, no matter what we
1024 ++ * return here.
1025 ++ */
1026 ++ ret = -EINVAL;
1027 ++ task_unlock(task);
1028 ++ if (ret)
1029 ++ break;
1030 ++ }
1031 ++
1032 ++ return ret;
1033 ++}
1034 ++
1035 ++static void bfqio_attach(struct cgroup_subsys_state *css,
1036 ++ struct cgroup_taskset *tset)
1037 ++{
1038 ++ struct task_struct *task;
1039 ++ struct io_context *ioc;
1040 ++ struct io_cq *icq;
1041 ++
1042 ++ /*
1043 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
1044 ++ * new group has not yet been tested.
1045 ++ */
1046 ++ cgroup_taskset_for_each(task, tset) {
1047 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1048 ++ if (ioc) {
1049 ++ /*
1050 ++ * Handle cgroup change here.
1051 ++ */
1052 ++ rcu_read_lock();
1053 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1054 ++ if (!strncmp(
1055 ++ icq->q->elevator->type->elevator_name,
1056 ++ "bfq", ELV_NAME_MAX))
1057 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
1058 ++ css);
1059 ++ rcu_read_unlock();
1060 ++ put_io_context(ioc);
1061 ++ }
1062 ++ }
1063 ++}
1064 ++
1065 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
1066 ++{
1067 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1068 ++ struct hlist_node *tmp;
1069 ++ struct bfq_group *bfqg;
1070 ++
1071 ++ /*
1072 ++ * Since we are destroying the cgroup, there are no more tasks
1073 ++ * referencing it, and all the RCU grace periods that may have
1074 ++ * referenced it have ended (as the destruction of the parent
1075 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1076 ++ * anything else and we don't need any synchronization.
1077 ++ */
1078 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1079 ++ bfq_destroy_group(bgrp, bfqg);
1080 ++
1081 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
1082 ++
1083 ++ kfree(bgrp);
1084 ++}
1085 ++
1086 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
1087 ++{
1088 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1089 ++
1090 ++ mutex_lock(&bfqio_mutex);
1091 ++ bgrp->online = true;
1092 ++ mutex_unlock(&bfqio_mutex);
1093 ++
1094 ++ return 0;
1095 ++}
1096 ++
1097 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
1098 ++{
1099 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1100 ++
1101 ++ mutex_lock(&bfqio_mutex);
1102 ++ bgrp->online = false;
1103 ++ mutex_unlock(&bfqio_mutex);
1104 ++}
1105 ++
1106 ++struct cgroup_subsys bfqio_cgrp_subsys = {
1107 ++ .css_alloc = bfqio_create,
1108 ++ .css_online = bfqio_css_online,
1109 ++ .css_offline = bfqio_css_offline,
1110 ++ .can_attach = bfqio_can_attach,
1111 ++ .attach = bfqio_attach,
1112 ++ .css_free = bfqio_destroy,
1113 ++ .legacy_cftypes = bfqio_files,
1114 ++};
1115 ++#else
1116 ++static inline void bfq_init_entity(struct bfq_entity *entity,
1117 ++ struct bfq_group *bfqg)
1118 ++{
1119 ++ entity->weight = entity->new_weight;
1120 ++ entity->orig_weight = entity->new_weight;
1121 ++ entity->ioprio = entity->new_ioprio;
1122 ++ entity->ioprio_class = entity->new_ioprio_class;
1123 ++ entity->sched_data = &bfqg->sched_data;
1124 ++}
1125 ++
1126 ++static inline struct bfq_group *
1127 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1128 ++{
1129 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
1130 ++ return bfqd->root_group;
1131 ++}
1132 ++
1133 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1134 ++ struct bfq_queue *bfqq,
1135 ++ struct bfq_entity *entity,
1136 ++ struct bfq_group *bfqg)
1137 ++{
1138 ++}
1139 ++
1140 ++static void bfq_end_wr_async(struct bfq_data *bfqd)
1141 ++{
1142 ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
1143 ++}
1144 ++
1145 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1146 ++{
1147 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
1148 ++}
1149 ++
1150 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
1151 ++{
1152 ++ kfree(bfqd->root_group);
1153 ++}
1154 ++
1155 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1156 ++{
1157 ++ struct bfq_group *bfqg;
1158 ++ int i;
1159 ++
1160 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1161 ++ if (bfqg == NULL)
1162 ++ return NULL;
1163 ++
1164 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1165 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1166 ++
1167 ++ return bfqg;
1168 ++}
1169 ++#endif
1170 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1171 +new file mode 100644
1172 +index 0000000..7f6b000
1173 +--- /dev/null
1174 ++++ b/block/bfq-ioc.c
1175 +@@ -0,0 +1,36 @@
1176 ++/*
1177 ++ * BFQ: I/O context handling.
1178 ++ *
1179 ++ * Based on ideas and code from CFQ:
1180 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1181 ++ *
1182 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1183 ++ * Paolo Valente <paolo.valente@×××××××.it>
1184 ++ *
1185 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1186 ++ */
1187 ++
1188 ++/**
1189 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1190 ++ * @icq: the iocontext queue.
1191 ++ */
1192 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1193 ++{
1194 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
1195 ++ return container_of(icq, struct bfq_io_cq, icq);
1196 ++}
1197 ++
1198 ++/**
1199 ++ * bfq_bic_lookup - search @ioc for a bic associated with @bfqd.
1200 ++ * @bfqd: the lookup key.
1201 ++ * @ioc: the io_context of the process doing I/O.
1202 ++ *
1203 ++ * Queue lock must be held.
1204 ++ */
1205 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1206 ++ struct io_context *ioc)
1207 ++{
1208 ++ if (ioc)
1209 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1210 ++ return NULL;
1211 ++}
1212 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1213 +new file mode 100644
1214 +index 0000000..773b2ee
1215 +--- /dev/null
1216 ++++ b/block/bfq-iosched.c
1217 +@@ -0,0 +1,3898 @@
1218 ++/*
1219 ++ * Budget Fair Queueing (BFQ) disk scheduler.
1220 ++ *
1221 ++ * Based on ideas and code from CFQ:
1222 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1223 ++ *
1224 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1225 ++ * Paolo Valente <paolo.valente@×××××××.it>
1226 ++ *
1227 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1228 ++ *
1229 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
1230 ++ * file.
1231 ++ *
1232 ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on
1233 ++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
1234 ++ * measured in number of sectors, to processes instead of time slices. The
1235 ++ * device is not granted to the in-service process for a given time slice,
1236 ++ * but until it has exhausted its assigned budget. This change from the time
1237 ++ * to the service domain allows BFQ to distribute the device throughput
1238 ++ * among processes as desired, without any distortion due to ZBR, workload
1239 ++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
1240 ++ * called B-WF2Q+, to schedule processes according to their budgets. More
1241 ++ * precisely, BFQ schedules queues associated with processes. Thanks to the
1242 ++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
1243 ++ * I/O-bound processes issuing sequential requests (to boost the
1244 ++ * throughput), and yet guarantee a low latency to interactive and soft
1245 ++ * real-time applications.
1246 ++ *
1247 ++ * BFQ is described in [1], which also contains a reference to the
1248 ++ * initial, more theoretical paper on BFQ. In the latter paper the
1249 ++ * interested reader can find full details on the main algorithm, as well as
1250 ++ * formulas of the guarantees and formal proofs of all the properties.
1251 ++ * With respect to the version of BFQ presented in these papers, this
1252 ++ * implementation adds a few more heuristics, such as the one that
1253 ++ * guarantees a low latency to soft real-time applications, and a
1254 ++ * hierarchical extension based on H-WF2Q+.
1255 ++ *
1256 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1257 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1258 ++ * complexity derives from the one introduced with EEVDF in [3].
1259 ++ *
1260 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1261 ++ * with the BFQ Disk I/O Scheduler'',
1262 ++ * Proceedings of the 5th Annual International Systems and Storage
1263 ++ * Conference (SYSTOR '12), June 2012.
1264 ++ *
1265 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1266 ++ *
1267 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1268 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1269 ++ * Oct 1997.
1270 ++ *
1271 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1272 ++ *
1273 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1274 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
1275 ++ * Resource Allocation,'' technical report.
1276 ++ *
1277 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1278 ++ */
1279 ++#include <linux/module.h>
1280 ++#include <linux/slab.h>
1281 ++#include <linux/blkdev.h>
1282 ++#include <linux/cgroup.h>
1283 ++#include <linux/elevator.h>
1284 ++#include <linux/jiffies.h>
1285 ++#include <linux/rbtree.h>
1286 ++#include <linux/ioprio.h>
1287 ++#include "bfq.h"
1288 ++#include "blk.h"
1289 ++
1290 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1291 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1292 ++
1293 ++/* Maximum backwards seek, in KiB. */
1294 ++static const int bfq_back_max = 16 * 1024;
1295 ++
1296 ++/* Penalty of a backwards seek, in number of sectors. */
1297 ++static const int bfq_back_penalty = 2;
1298 ++
1299 ++/* Idling period duration, in jiffies. */
1300 ++static int bfq_slice_idle = HZ / 125;
1301 ++
1302 ++/* Default maximum budget values, in sectors and number of requests. */
1303 ++static const int bfq_default_max_budget = 16 * 1024;
1304 ++static const int bfq_max_budget_async_rq = 4;
1305 ++
1306 ++/*
1307 ++ * Async to sync throughput distribution is controlled as follows:
1308 ++ * when an async request is served, the entity is charged the number
1309 ++ * of sectors of the request, multiplied by the factor below
1310 ++ */
1311 ++static const int bfq_async_charge_factor = 10;
1312 ++
1313 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
1314 ++static const int bfq_timeout_sync = HZ / 8;
1315 ++static int bfq_timeout_async = HZ / 25;
1316 ++
1317 ++struct kmem_cache *bfq_pool;
1318 ++
1319 ++/* Below this threshold (in ms), we consider thinktime immediate. */
1320 ++#define BFQ_MIN_TT 2
1321 ++
1322 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
1323 ++#define BFQ_HW_QUEUE_THRESHOLD 4
1324 ++#define BFQ_HW_QUEUE_SAMPLES 32
1325 ++
1326 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1327 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1328 ++
1329 ++/* Min samples used for peak rate estimation (for autotuning). */
1330 ++#define BFQ_PEAK_RATE_SAMPLES 32
1331 ++
1332 ++/* Shift used for peak rate fixed precision calculations. */
1333 ++#define BFQ_RATE_SHIFT 16
1334 ++
1335 ++/*
1336 ++ * By default, BFQ computes the duration of the weight raising for
1337 ++ * interactive applications automatically, using the following formula:
1338 ++ * duration = (R / r) * T, where r is the peak rate of the device, and
1339 ++ * R and T are two reference parameters.
1340 ++ * In particular, R is the peak rate of the reference device (see below),
1341 ++ * and T is a reference time: given the systems that are likely to be
1342 ++ * installed on the reference device according to its speed class, T is
1343 ++ * about the maximum time needed, under BFQ and while reading two files in
1344 ++ * parallel, to load typical large applications on these systems.
1345 ++ * In practice, the slower/faster the device at hand is, the more/less it
1346 ++ * takes to load applications with respect to the reference device.
1347 ++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
1348 ++ * applications.
1349 ++ *
1350 ++ * BFQ uses four different reference pairs (R, T), depending on:
1351 ++ * . whether the device is rotational or non-rotational;
1352 ++ * . whether the device is slow, such as old or portable HDDs, as well as
1353 ++ * SD cards, or fast, such as newer HDDs and SSDs.
1354 ++ *
1355 ++ * The device's speed class is dynamically (re)detected in
1356 ++ * bfq_update_peak_rate() every time the estimated peak rate is updated.
1357 ++ *
1358 ++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
1359 ++ * are the reference values for a slow/fast rotational device, whereas
1360 ++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
1361 ++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the
1362 ++ * thresholds used to switch between speed classes.
1363 ++ * Both the reference peak rates and the thresholds are measured in
1364 ++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
1365 ++ */
1366 ++static int R_slow[2] = {1536, 10752};
1367 ++static int R_fast[2] = {17415, 34791};
1368 ++/*
1369 ++ * To improve readability, a conversion function is used to initialize the
1370 ++ * following arrays, which entails that they can be initialized only in a
1371 ++ * function.
1372 ++ */
1373 ++static int T_slow[2];
1374 ++static int T_fast[2];
1375 ++static int device_speed_thresh[2];
1376 ++
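
A minimal userspace sketch of the duration = (R / r) * T rule described above (not part of the patch). R_fast[0] = 17415 is taken from the declarations above; the reference time T and the helper name wr_duration used here are hypothetical, since the real T_slow/T_fast entries are computed at runtime:

#include <stdio.h>
#include <stdint.h>

/* duration = (R / r) * T: the slower the device (smaller r), the longer
 * the interactive weight raising lasts. */
static unsigned long wr_duration(uint64_t r, uint64_t R, unsigned long T)
{
	return (unsigned long)((R * T) / r);
}

int main(void)
{
	uint64_t R_ref = 17415;		/* reference peak rate, fast rotational */
	unsigned long T_ref = 1000;	/* hypothetical reference time, jiffies */

	/* A device matching the reference rate is raised for exactly T. */
	printf("r == R:   %lu jiffies\n", wr_duration(R_ref, R_ref, T_ref));
	/* Twice as fast as the reference -> half the duration. */
	printf("r == 2R:  %lu jiffies\n", wr_duration(2 * R_ref, R_ref, T_ref));
	/* Half as fast as the reference -> twice the duration. */
	printf("r == R/2: %lu jiffies\n", wr_duration(R_ref / 2, R_ref, T_ref));
	return 0;
}
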
1377 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1378 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1379 ++
1380 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1381 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1382 ++
1383 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1384 ++
1385 ++#include "bfq-ioc.c"
1386 ++#include "bfq-sched.c"
1387 ++#include "bfq-cgroup.c"
1388 ++
1389 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1390 ++ IOPRIO_CLASS_IDLE)
1391 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1392 ++ IOPRIO_CLASS_RT)
1393 ++
1394 ++#define bfq_sample_valid(samples) ((samples) > 80)
1395 ++
1396 ++/*
1397 ++ * The following macro groups conditions that need to be evaluated when
1398 ++ * checking if existing queues and groups form a symmetric scenario
1399 ++ * and therefore idling can be reduced or disabled for some of the
1400 ++ * queues. See the comment to the function bfq_bfqq_must_not_expire()
1401 ++ * for further details.
1402 ++ */
1403 ++#ifdef CONFIG_CGROUP_BFQIO
1404 ++#define symmetric_scenario (!bfqd->active_numerous_groups && \
1405 ++ !bfq_differentiated_weights(bfqd))
1406 ++#else
1407 ++#define symmetric_scenario (!bfq_differentiated_weights(bfqd))
1408 ++#endif
1409 ++
1410 ++/*
1411 ++ * We regard a request as SYNC if it is either a read or has the SYNC bit
1412 ++ * set (in which case it could also be a direct WRITE).
1413 ++ */
1414 ++static inline int bfq_bio_sync(struct bio *bio)
1415 ++{
1416 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1417 ++ return 1;
1418 ++
1419 ++ return 0;
1420 ++}
1421 ++
1422 ++/*
1423 ++ * Schedule a run of the queue if there are requests pending and none in the
1424 ++ * driver that will restart queueing.
1425 ++ */
1426 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1427 ++{
1428 ++ if (bfqd->queued != 0) {
1429 ++ bfq_log(bfqd, "schedule dispatch");
1430 ++ kblockd_schedule_work(&bfqd->unplug_work);
1431 ++ }
1432 ++}
1433 ++
1434 ++/*
1435 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1436 ++ * We choose the request that is closest to the head right now. Distance
1437 ++ * behind the head is penalized and only allowed to a certain extent.
1438 ++ */
1439 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
1440 ++ struct request *rq1,
1441 ++ struct request *rq2,
1442 ++ sector_t last)
1443 ++{
1444 ++ sector_t s1, s2, d1 = 0, d2 = 0;
1445 ++ unsigned long back_max;
1446 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1447 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1448 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1449 ++
1450 ++ if (rq1 == NULL || rq1 == rq2)
1451 ++ return rq2;
1452 ++ if (rq2 == NULL)
1453 ++ return rq1;
1454 ++
1455 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1456 ++ return rq1;
1457 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1458 ++ return rq2;
1459 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1460 ++ return rq1;
1461 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1462 ++ return rq2;
1463 ++
1464 ++ s1 = blk_rq_pos(rq1);
1465 ++ s2 = blk_rq_pos(rq2);
1466 ++
1467 ++ /*
1468 ++ * By definition, 1KiB is 2 sectors.
1469 ++ */
1470 ++ back_max = bfqd->bfq_back_max * 2;
1471 ++
1472 ++ /*
1473 ++ * Strict one way elevator _except_ in the case where we allow
1474 ++ * short backward seeks which are biased as twice the cost of a
1475 ++ * similar forward seek.
1476 ++ */
1477 ++ if (s1 >= last)
1478 ++ d1 = s1 - last;
1479 ++ else if (s1 + back_max >= last)
1480 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
1481 ++ else
1482 ++ wrap |= BFQ_RQ1_WRAP;
1483 ++
1484 ++ if (s2 >= last)
1485 ++ d2 = s2 - last;
1486 ++ else if (s2 + back_max >= last)
1487 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
1488 ++ else
1489 ++ wrap |= BFQ_RQ2_WRAP;
1490 ++
1491 ++ /* Found required data */
1492 ++
1493 ++ /*
1494 ++ * By doing switch() on the bit mask "wrap" we avoid having to
1495 ++ * check two variables for all permutations: --> faster!
1496 ++ */
1497 ++ switch (wrap) {
1498 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1499 ++ if (d1 < d2)
1500 ++ return rq1;
1501 ++ else if (d2 < d1)
1502 ++ return rq2;
1503 ++ else {
1504 ++ if (s1 >= s2)
1505 ++ return rq1;
1506 ++ else
1507 ++ return rq2;
1508 ++ }
1509 ++
1510 ++ case BFQ_RQ2_WRAP:
1511 ++ return rq1;
1512 ++ case BFQ_RQ1_WRAP:
1513 ++ return rq2;
1514 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1515 ++ default:
1516 ++ /*
1517 ++ * Since both rqs are wrapped,
1518 ++ * start with the one that's further behind head
1519 ++ * (--> only *one* back seek required),
1520 ++ * since back seek takes more time than forward.
1521 ++ */
1522 ++ if (s1 <= s2)
1523 ++ return rq1;
1524 ++ else
1525 ++ return rq2;
1526 ++ }
1527 ++}
1528 ++
1529 ++static struct bfq_queue *
1530 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1531 ++ sector_t sector, struct rb_node **ret_parent,
1532 ++ struct rb_node ***rb_link)
1533 ++{
1534 ++ struct rb_node **p, *parent;
1535 ++ struct bfq_queue *bfqq = NULL;
1536 ++
1537 ++ parent = NULL;
1538 ++ p = &root->rb_node;
1539 ++ while (*p) {
1540 ++ struct rb_node **n;
1541 ++
1542 ++ parent = *p;
1543 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1544 ++
1545 ++ /*
1546 ++ * Sort strictly based on sector. Smallest to the left,
1547 ++ * largest to the right.
1548 ++ */
1549 ++ if (sector > blk_rq_pos(bfqq->next_rq))
1550 ++ n = &(*p)->rb_right;
1551 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
1552 ++ n = &(*p)->rb_left;
1553 ++ else
1554 ++ break;
1555 ++ p = n;
1556 ++ bfqq = NULL;
1557 ++ }
1558 ++
1559 ++ *ret_parent = parent;
1560 ++ if (rb_link)
1561 ++ *rb_link = p;
1562 ++
1563 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1564 ++ (long long unsigned)sector,
1565 ++ bfqq != NULL ? bfqq->pid : 0);
1566 ++
1567 ++ return bfqq;
1568 ++}
1569 ++
1570 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1571 ++{
1572 ++ struct rb_node **p, *parent;
1573 ++ struct bfq_queue *__bfqq;
1574 ++
1575 ++ if (bfqq->pos_root != NULL) {
1576 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1577 ++ bfqq->pos_root = NULL;
1578 ++ }
1579 ++
1580 ++ if (bfq_class_idle(bfqq))
1581 ++ return;
1582 ++ if (!bfqq->next_rq)
1583 ++ return;
1584 ++
1585 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
1586 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1587 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
1588 ++ if (__bfqq == NULL) {
1589 ++ rb_link_node(&bfqq->pos_node, parent, p);
1590 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1591 ++ } else
1592 ++ bfqq->pos_root = NULL;
1593 ++}
1594 ++
1595 ++/*
1596 ++ * Tell whether there are active queues or groups with differentiated weights.
1597 ++ */
1598 ++static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
1599 ++{
1600 ++ /*
1601 ++ * For weights to differ, at least one of the trees must contain
1602 ++ * at least two nodes.
1603 ++ */
1604 ++ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
1605 ++ (bfqd->queue_weights_tree.rb_node->rb_left ||
1606 ++ bfqd->queue_weights_tree.rb_node->rb_right)
1607 ++#ifdef CONFIG_CGROUP_BFQIO
1608 ++ ) ||
1609 ++ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
1610 ++ (bfqd->group_weights_tree.rb_node->rb_left ||
1611 ++ bfqd->group_weights_tree.rb_node->rb_right)
1612 ++#endif
1613 ++ );
1614 ++}
1615 ++
1616 ++/*
1617 ++ * If the weight-counter tree passed as input contains no counter for
1618 ++ * the weight of the input entity, then add that counter; otherwise just
1619 ++ * increment the existing counter.
1620 ++ *
1621 ++ * Note that weight-counter trees contain few nodes in mostly symmetric
1622 ++ * scenarios. For example, if all queues have the same weight, then the
1623 ++ * weight-counter tree for the queues may contain at most one node.
1624 ++ * This holds even if low_latency is on, because weight-raised queues
1625 ++ * are not inserted in the tree.
1626 ++ * In most scenarios, the rate at which nodes are created/destroyed
1627 ++ * should be low too.
1628 ++ */
1629 ++static void bfq_weights_tree_add(struct bfq_data *bfqd,
1630 ++ struct bfq_entity *entity,
1631 ++ struct rb_root *root)
1632 ++{
1633 ++ struct rb_node **new = &(root->rb_node), *parent = NULL;
1634 ++
1635 ++ /*
1636 ++ * Do not insert if the entity is already associated with a
1637 ++ * counter, which happens if:
1638 ++ * 1) the entity is associated with a queue,
1639 ++ * 2) a request arrival has caused the queue to become both
1640 ++ * non-weight-raised, and hence change its weight, and
1641 ++ * backlogged; in this respect, each of the two events
1642 ++ * causes an invocation of this function,
1643 ++ * 3) this is the invocation of this function caused by the
1644 ++ * second event. This second invocation is actually useless,
1645 ++ * and we handle this fact by exiting immediately. More
1646 ++ * efficient or clearer solutions might possibly be adopted.
1647 ++ */
1648 ++ if (entity->weight_counter)
1649 ++ return;
1650 ++
1651 ++ while (*new) {
1652 ++ struct bfq_weight_counter *__counter = container_of(*new,
1653 ++ struct bfq_weight_counter,
1654 ++ weights_node);
1655 ++ parent = *new;
1656 ++
1657 ++ if (entity->weight == __counter->weight) {
1658 ++ entity->weight_counter = __counter;
1659 ++ goto inc_counter;
1660 ++ }
1661 ++ if (entity->weight < __counter->weight)
1662 ++ new = &((*new)->rb_left);
1663 ++ else
1664 ++ new = &((*new)->rb_right);
1665 ++ }
1666 ++
1667 ++ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
1668 ++ GFP_ATOMIC);
1669 ++ entity->weight_counter->weight = entity->weight;
1670 ++ rb_link_node(&entity->weight_counter->weights_node, parent, new);
1671 ++ rb_insert_color(&entity->weight_counter->weights_node, root);
1672 ++
1673 ++inc_counter:
1674 ++ entity->weight_counter->num_active++;
1675 ++}
1676 ++
1677 ++/*
1678 ++ * Decrement the weight counter associated with the entity, and, if the
1679 ++ * counter reaches 0, remove the counter from the tree.
1680 ++ * See the comments to the function bfq_weights_tree_add() for considerations
1681 ++ * about overhead.
1682 ++ */
1683 ++static void bfq_weights_tree_remove(struct bfq_data *bfqd,
1684 ++ struct bfq_entity *entity,
1685 ++ struct rb_root *root)
1686 ++{
1687 ++ if (!entity->weight_counter)
1688 ++ return;
1689 ++
1690 ++ BUG_ON(RB_EMPTY_ROOT(root));
1691 ++ BUG_ON(entity->weight_counter->weight != entity->weight);
1692 ++
1693 ++ BUG_ON(!entity->weight_counter->num_active);
1694 ++ entity->weight_counter->num_active--;
1695 ++ if (entity->weight_counter->num_active > 0)
1696 ++ goto reset_entity_pointer;
1697 ++
1698 ++ rb_erase(&entity->weight_counter->weights_node, root);
1699 ++ kfree(entity->weight_counter);
1700 ++
1701 ++reset_entity_pointer:
1702 ++ entity->weight_counter = NULL;
1703 ++}
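/*
 * Illustrative sketch, not part of the BFQ patch: the same per-weight
 * reference counting performed by bfq_weights_tree_add() above (add path
 * only), over a tiny fixed array instead of an rb_root, to make the
 * "differentiated weights" test concrete. All names and values are
 * hypothetical.
 */
#include <stdio.h>

struct weight_counter { unsigned int weight, num_active; };

static struct weight_counter counters[8];
static unsigned int nr_counters;

static void weight_add(unsigned int weight)
{
	for (unsigned int i = 0; i < nr_counters; i++)
		if (counters[i].weight == weight) {
			counters[i].num_active++;	/* existing weight */
			return;
		}
	counters[nr_counters].weight = weight;		/* new distinct weight */
	counters[nr_counters++].num_active = 1;
}

static int differentiated_weights(void)
{
	/* equivalent of "the weights tree has at least two nodes" */
	return nr_counters > 1;
}

int main(void)
{
	weight_add(100); weight_add(100); weight_add(100);
	printf("%d\n", differentiated_weights());	/* 0: one weight only */
	weight_add(300);
	printf("%d\n", differentiated_weights());	/* 1: weights differ  */
	return 0;
}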
1704 ++
1705 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1706 ++ struct bfq_queue *bfqq,
1707 ++ struct request *last)
1708 ++{
1709 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
1710 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
1711 ++ struct request *next = NULL, *prev = NULL;
1712 ++
1713 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1714 ++
1715 ++ if (rbprev != NULL)
1716 ++ prev = rb_entry_rq(rbprev);
1717 ++
1718 ++ if (rbnext != NULL)
1719 ++ next = rb_entry_rq(rbnext);
1720 ++ else {
1721 ++ rbnext = rb_first(&bfqq->sort_list);
1722 ++ if (rbnext && rbnext != &last->rb_node)
1723 ++ next = rb_entry_rq(rbnext);
1724 ++ }
1725 ++
1726 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1727 ++}
1728 ++
1729 ++/* see the definition of bfq_async_charge_factor for details */
1730 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
1731 ++ struct bfq_queue *bfqq)
1732 ++{
1733 ++ return blk_rq_sectors(rq) *
1734 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
1735 ++ bfq_async_charge_factor));
1736 ++}
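/*
 * Illustrative sketch, not part of the BFQ patch: the charging rule of
 * bfq_serv_to_charge() above on plain numbers. A sync queue, or any
 * weight-raised queue (wr_coeff > 1), is charged the raw request size in
 * sectors; an async, non-weight-raised queue is charged (1 + factor)
 * times that size. The factor value used in the example is hypothetical.
 */
static unsigned long serv_to_charge(unsigned long sectors, int sync,
				    unsigned long wr_coeff,
				    unsigned long async_charge_factor)
{
	return sectors * (1 + (!sync) * (wr_coeff == 1) * async_charge_factor);
}
/*
 * With factor = 10: a 64-sector sync read is charged 64, while a
 * 64-sector async write with wr_coeff == 1 is charged 64 * 11 = 704,
 * which gives async queues later timestamps and hence less frequent
 * service.
 */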
1737 ++
1738 ++/**
1739 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
1740 ++ * @bfqd: the device data the queue belongs to.
1741 ++ * @bfqq: the queue to update.
1742 ++ *
1743 ++ * If the first request of a queue changes we make sure that the queue
1744 ++ * has enough budget to serve at least its first request (if the
1745 ++ * request has grown). We do this because if the queue has not enough
1746 ++ * budget for its first request, it has to go through two dispatch
1747 ++ * rounds to actually get it dispatched.
1748 ++ */
1749 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
1750 ++ struct bfq_queue *bfqq)
1751 ++{
1752 ++ struct bfq_entity *entity = &bfqq->entity;
1753 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1754 ++ struct request *next_rq = bfqq->next_rq;
1755 ++ unsigned long new_budget;
1756 ++
1757 ++ if (next_rq == NULL)
1758 ++ return;
1759 ++
1760 ++ if (bfqq == bfqd->in_service_queue)
1761 ++ /*
1762 ++ * In order not to break guarantees, budgets cannot be
1763 ++ * changed after an entity has been selected.
1764 ++ */
1765 ++ return;
1766 ++
1767 ++ BUG_ON(entity->tree != &st->active);
1768 ++ BUG_ON(entity == entity->sched_data->in_service_entity);
1769 ++
1770 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
1771 ++ bfq_serv_to_charge(next_rq, bfqq));
1772 ++ if (entity->budget != new_budget) {
1773 ++ entity->budget = new_budget;
1774 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
1775 ++ new_budget);
1776 ++ bfq_activate_bfqq(bfqd, bfqq);
1777 ++ }
1778 ++}
1779 ++
1780 ++static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
1781 ++{
1782 ++ u64 dur;
1783 ++
1784 ++ if (bfqd->bfq_wr_max_time > 0)
1785 ++ return bfqd->bfq_wr_max_time;
1786 ++
1787 ++ dur = bfqd->RT_prod;
1788 ++ do_div(dur, bfqd->peak_rate);
1789 ++
1790 ++ return dur;
1791 ++}
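/*
 * Illustrative note, not part of the BFQ patch: unless a fixed
 * bfq_wr_max_time is configured, bfq_wr_duration() above returns
 * RT_prod / peak_rate, i.e. a reference rate-times-duration product
 * divided by the estimated peak rate. With hypothetical numbers: if
 * RT_prod was built as R * T = 1000 * 6000 = 6,000,000 and the measured
 * peak_rate is 2000 (same rate unit as R), the weight-raising duration
 * becomes 6,000,000 / 2000 = 3000 time units -- a faster device gets a
 * proportionally shorter weight-raising period.
 */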
1792 ++
1793 ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
1794 ++static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
1795 ++ struct bfq_queue *bfqq)
1796 ++{
1797 ++ struct bfq_queue *item;
1798 ++ struct hlist_node *n;
1799 ++
1800 ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
1801 ++ hlist_del_init(&item->burst_list_node);
1802 ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1803 ++ bfqd->burst_size = 1;
1804 ++}
1805 ++
1806 ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
1807 ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1808 ++{
1809 ++ /* Increment burst size to take into account also bfqq */
1810 ++ bfqd->burst_size++;
1811 ++
1812 ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
1813 ++ struct bfq_queue *pos, *bfqq_item;
1814 ++ struct hlist_node *n;
1815 ++
1816 ++ /*
1817 ++ * Enough queues have been activated shortly after each
1818 ++ * other to consider this burst as large.
1819 ++ */
1820 ++ bfqd->large_burst = true;
1821 ++
1822 ++ /*
1823 ++ * We can now mark all queues in the burst list as
1824 ++ * belonging to a large burst.
1825 ++ */
1826 ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
1827 ++ burst_list_node)
1828 ++ bfq_mark_bfqq_in_large_burst(bfqq_item);
1829 ++ bfq_mark_bfqq_in_large_burst(bfqq);
1830 ++
1831 ++ /*
1832 ++ * From now on, and until the current burst finishes, any
1833 ++ * new queue being activated shortly after the last queue
1834 ++ * was inserted in the burst can be immediately marked as
1835 ++ * belonging to a large burst. So the burst list is not
1836 ++ * needed any more. Remove it.
1837 ++ */
1838 ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
1839 ++ burst_list_node)
1840 ++ hlist_del_init(&pos->burst_list_node);
1841 ++ } else /* burst not yet large: add bfqq to the burst list */
1842 ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1843 ++}
1844 ++
1845 ++/*
1846 ++ * If many queues happen to become active shortly after each other, then,
1847 ++ * to help the processes associated to these queues get their job done as
1848 ++ * soon as possible, it is usually better to not grant either weight-raising
1849 ++ * or device idling to these queues. In this comment we describe, firstly,
1850 ++ * the reasons why this fact holds, and, secondly, the next function, which
1851 ++ * implements the main steps needed to properly mark these queues so that
1852 ++ * they can then be treated in a different way.
1853 ++ *
1854 ++ * As for the terminology, we say that a queue becomes active, i.e.,
1855 ++ * switches from idle to backlogged, either when it is created (as a
1856 ++ * consequence of the arrival of an I/O request), or, if already existing,
1857 ++ * when a new request for the queue arrives while the queue is idle.
1858 ++ * Bursts of activations, i.e., activations of different queues occurring
1859 ++ * shortly after each other, are typically caused by services or applications
1860 ++ * that spawn or reactivate many parallel threads/processes. Examples are
1861 ++ * systemd during boot or git grep.
1862 ++ *
1863 ++ * These services or applications benefit mostly from a high throughput:
1864 ++ * the quicker the requests of the activated queues are cumulatively served,
1865 ++ * the sooner the target job of these queues gets completed. As a consequence,
1866 ++ * weight-raising any of these queues, which also implies idling the device
1867 ++ * for it, is almost always counterproductive: in most cases it just lowers
1868 ++ * throughput.
1869 ++ *
1870 ++ * On the other hand, a burst of activations may also be caused by the start
1871 ++ * of an application that does not consist of a lot of parallel I/O-bound
1872 ++ * threads. In fact, with a complex application, the burst may be just a
1873 ++ * consequence of the fact that several processes need to be executed to
1874 ++ * start up the application. To start an application as quickly as possible,
1875 ++ * the best thing to do is to privilege the I/O related to the application
1876 ++ * with respect to all other I/O. Therefore, the best strategy to start as
1877 ++ * quickly as possible an application that causes a burst of activations is
1878 ++ * to weight-raise all the queues activated during the burst. This is the
1879 ++ * exact opposite of the best strategy for the other type of bursts.
1880 ++ *
1881 ++ * In the end, to take the best action for each of the two cases, the two
1882 ++ * types of bursts need to be distinguished. Fortunately, this seems
1883 ++ * relatively easy to do, by looking at the sizes of the bursts. In
1884 ++ * particular, we found a threshold such that bursts with a larger size
1885 ++ * than that threshold are apparently caused only by services or commands
1886 ++ * such as systemd or git grep. For brevity, hereafter we call just 'large'
1887 ++ * these bursts. BFQ *does not* weight-raise queues whose activations occur
1888 ++ * in a large burst. In addition, for each of these queues BFQ performs or
1889 ++ * does not perform idling depending on which choice boosts the throughput
1890 ++ * most. The exact choice depends on the device and request pattern at
1891 ++ * hand.
1892 ++ *
1893 ++ * Turning back to the next function, it implements all the steps needed
1894 ++ * to detect the occurrence of a large burst and to properly mark all the
1895 ++ * queues belonging to it (so that they can then be treated in a different
1896 ++ * way). This goal is achieved by maintaining a special "burst list" that
1897 ++ * holds, temporarily, the queues that belong to the burst in progress. The
1898 ++ * list is then used to mark these queues as belonging to a large burst if
1899 ++ * the burst does become large. The main steps are the following.
1900 ++ *
1901 ++ * . when the very first queue is activated, the queue is inserted into the
1902 ++ * list (as it could be the first queue in a possible burst)
1903 ++ *
1904 ++ * . if the current burst has not yet become large, and a queue Q that does
1905 ++ * not yet belong to the burst is activated shortly after the last time
1906 ++ * at which a new queue entered the burst list, then the function appends
1907 ++ * Q to the burst list
1908 ++ *
1909 ++ * . if, as a consequence of the previous step, the burst size reaches
1910 ++ * the large-burst threshold, then
1911 ++ *
1912 ++ * . all the queues in the burst list are marked as belonging to a
1913 ++ * large burst
1914 ++ *
1915 ++ * . the burst list is deleted; in fact, the burst list already served
1916 ++ * its purpose (keeping temporarily track of the queues in a burst,
1917 ++ * so as to be able to mark them as belonging to a large burst in the
1918 ++ * previous sub-step), and now is not needed any more
1919 ++ *
1920 ++ * . the device enters a large-burst mode
1921 ++ *
1922 ++ * . if a queue Q that does not belong to the burst is activated while
1923 ++ * the device is in large-burst mode and shortly after the last time
1924 ++ * at which a queue either entered the burst list or was marked as
1925 ++ * belonging to the current large burst, then Q is immediately marked
1926 ++ * as belonging to a large burst.
1927 ++ *
1928 ++ * . if a queue Q that does not belong to the burst is activated a while
1929 ++ *   later, i.e., not shortly after the last time at which a queue
1930 ++ *   either entered the burst list or was marked as belonging to the
1931 ++ *   current large burst, then the current burst is deemed finished and:
1932 ++ *
1933 ++ * . the large-burst mode is reset if set
1934 ++ *
1935 ++ * . the burst list is emptied
1936 ++ *
1937 ++ * . Q is inserted in the burst list, as Q may be the first queue
1938 ++ * in a possible new burst (then the burst list contains just Q
1939 ++ * after this step).
1940 ++ */
1941 ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1942 ++ bool idle_for_long_time)
1943 ++{
1944 ++ /*
1945 ++ * If bfqq happened to be activated in a burst, but has been idle
1946 ++ * for at least as long as an interactive queue, then we assume
1947 ++ * that, in the overall I/O initiated in the burst, the I/O
1948 ++ * associated to bfqq is finished. So bfqq does not need to be
1949 ++ * treated as a queue belonging to a burst anymore. Accordingly,
1950 ++ * we reset bfqq's in_large_burst flag if set, and remove bfqq
1951 ++	 * from the burst list if it's there. We do not, however, decrement
1952 ++ * burst_size, because the fact that bfqq does not need to belong
1953 ++ * to the burst list any more does not invalidate the fact that
1954 ++ * bfqq may have been activated during the current burst.
1955 ++ */
1956 ++ if (idle_for_long_time) {
1957 ++ hlist_del_init(&bfqq->burst_list_node);
1958 ++ bfq_clear_bfqq_in_large_burst(bfqq);
1959 ++ }
1960 ++
1961 ++ /*
1962 ++ * If bfqq is already in the burst list or is part of a large
1963 ++ * burst, then there is nothing else to do.
1964 ++ */
1965 ++ if (!hlist_unhashed(&bfqq->burst_list_node) ||
1966 ++ bfq_bfqq_in_large_burst(bfqq))
1967 ++ return;
1968 ++
1969 ++ /*
1970 ++ * If bfqq's activation happens late enough, then the current
1971 ++ * burst is finished, and related data structures must be reset.
1972 ++ *
1973 ++ * In this respect, consider the special case where bfqq is the very
1974 ++ * first queue being activated. In this case, last_ins_in_burst is
1975 ++ * not yet significant when we get here. But it is easy to verify
1976 ++ * that, whether or not the following condition is true, bfqq will
1977 ++ * end up being inserted into the burst list. In particular the
1978 ++ * list will happen to contain only bfqq. And this is exactly what
1979 ++ * has to happen, as bfqq may be the first queue in a possible
1980 ++ * burst.
1981 ++ */
1982 ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst +
1983 ++ bfqd->bfq_burst_interval)) {
1984 ++ bfqd->large_burst = false;
1985 ++ bfq_reset_burst_list(bfqd, bfqq);
1986 ++ return;
1987 ++ }
1988 ++
1989 ++ /*
1990 ++ * If we get here, then bfqq is being activated shortly after the
1991 ++ * last queue. So, if the current burst is also large, we can mark
1992 ++ * bfqq as belonging to this large burst immediately.
1993 ++ */
1994 ++ if (bfqd->large_burst) {
1995 ++ bfq_mark_bfqq_in_large_burst(bfqq);
1996 ++ return;
1997 ++ }
1998 ++
1999 ++ /*
2000 ++ * If we get here, then a large-burst state has not yet been
2001 ++ * reached, but bfqq is being activated shortly after the last
2002 ++ * queue. Then we add bfqq to the burst.
2003 ++ */
2004 ++ bfq_add_to_burst(bfqd, bfqq);
2005 ++}
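/*
 * Illustrative sketch, not part of the BFQ patch: a much simplified,
 * stand-alone simulation of the burst classification performed by
 * bfq_handle_burst() above (the burst list itself is not modeled, only
 * burst_size and the large-burst flag). The interval and threshold
 * values, and the activation times, are hypothetical.
 */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	const unsigned long interval = 20, thresh = 3;
	unsigned long act[] = { 0, 5, 12, 15, 200 };	/* activation times */
	unsigned long last_ins = 0, burst_size = 0;
	bool large = false;

	for (int i = 0; i < 5; i++) {
		if (burst_size == 0 || act[i] > last_ins + interval) {
			/* first queue, or activated late: burst finished */
			large = false;
			burst_size = 1;
		} else if (large) {
			/* burst already large: queue marked immediately */
		} else if (++burst_size >= thresh) {
			large = true;	/* burst has just become large */
		}
		last_ins = act[i];
		printf("Q%d at t=%lu: burst_size=%lu large=%d\n",
		       i + 1, act[i], burst_size, large ? 1 : 0);
	}
	return 0;
}
/*
 * Expected outcome: Q1-Q3 form a burst that becomes large at Q3, Q4 is
 * marked as belonging to a large burst immediately, and Q5, arriving
 * much later, starts a new (small) burst.
 */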
2006 ++
2007 ++static void bfq_add_request(struct request *rq)
2008 ++{
2009 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2010 ++ struct bfq_entity *entity = &bfqq->entity;
2011 ++ struct bfq_data *bfqd = bfqq->bfqd;
2012 ++ struct request *next_rq, *prev;
2013 ++ unsigned long old_wr_coeff = bfqq->wr_coeff;
2014 ++ bool interactive = false;
2015 ++
2016 ++ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
2017 ++ bfqq->queued[rq_is_sync(rq)]++;
2018 ++ bfqd->queued++;
2019 ++
2020 ++ elv_rb_add(&bfqq->sort_list, rq);
2021 ++
2022 ++ /*
2023 ++ * Check if this request is a better next-serve candidate.
2024 ++ */
2025 ++ prev = bfqq->next_rq;
2026 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
2027 ++ BUG_ON(next_rq == NULL);
2028 ++ bfqq->next_rq = next_rq;
2029 ++
2030 ++ /*
2031 ++ * Adjust priority tree position, if next_rq changes.
2032 ++ */
2033 ++ if (prev != bfqq->next_rq)
2034 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2035 ++
2036 ++ if (!bfq_bfqq_busy(bfqq)) {
2037 ++ bool soft_rt,
2038 ++ idle_for_long_time = time_is_before_jiffies(
2039 ++ bfqq->budget_timeout +
2040 ++ bfqd->bfq_wr_min_idle_time);
2041 ++
2042 ++ if (bfq_bfqq_sync(bfqq)) {
2043 ++ bool already_in_burst =
2044 ++ !hlist_unhashed(&bfqq->burst_list_node) ||
2045 ++ bfq_bfqq_in_large_burst(bfqq);
2046 ++ bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
2047 ++ /*
2048 ++ * If bfqq was not already in the current burst,
2049 ++ * then, at this point, bfqq either has been
2050 ++ * added to the current burst or has caused the
2051 ++ * current burst to terminate. In particular, in
2052 ++ * the second case, bfqq has become the first
2053 ++ * queue in a possible new burst.
2054 ++ * In both cases last_ins_in_burst needs to be
2055 ++ * moved forward.
2056 ++ */
2057 ++ if (!already_in_burst)
2058 ++ bfqd->last_ins_in_burst = jiffies;
2059 ++ }
2060 ++
2061 ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
2062 ++ !bfq_bfqq_in_large_burst(bfqq) &&
2063 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
2064 ++ interactive = !bfq_bfqq_in_large_burst(bfqq) &&
2065 ++ idle_for_long_time;
2066 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
2067 ++ bfq_serv_to_charge(next_rq, bfqq));
2068 ++
2069 ++ if (!bfq_bfqq_IO_bound(bfqq)) {
2070 ++ if (time_before(jiffies,
2071 ++ RQ_BIC(rq)->ttime.last_end_request +
2072 ++ bfqd->bfq_slice_idle)) {
2073 ++ bfqq->requests_within_timer++;
2074 ++ if (bfqq->requests_within_timer >=
2075 ++ bfqd->bfq_requests_within_timer)
2076 ++ bfq_mark_bfqq_IO_bound(bfqq);
2077 ++ } else
2078 ++ bfqq->requests_within_timer = 0;
2079 ++ }
2080 ++
2081 ++ if (!bfqd->low_latency)
2082 ++ goto add_bfqq_busy;
2083 ++
2084 ++ /*
2085 ++ * If the queue is not being boosted and has been idle
2086 ++ * for enough time, start a weight-raising period
2087 ++ */
2088 ++ if (old_wr_coeff == 1 && (interactive || soft_rt)) {
2089 ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
2090 ++ if (interactive)
2091 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2092 ++ else
2093 ++ bfqq->wr_cur_max_time =
2094 ++ bfqd->bfq_wr_rt_max_time;
2095 ++ bfq_log_bfqq(bfqd, bfqq,
2096 ++ "wrais starting at %lu, rais_max_time %u",
2097 ++ jiffies,
2098 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
2099 ++ } else if (old_wr_coeff > 1) {
2100 ++ if (interactive)
2101 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2102 ++ else if (bfq_bfqq_in_large_burst(bfqq) ||
2103 ++ (bfqq->wr_cur_max_time ==
2104 ++ bfqd->bfq_wr_rt_max_time &&
2105 ++ !soft_rt)) {
2106 ++ bfqq->wr_coeff = 1;
2107 ++ bfq_log_bfqq(bfqd, bfqq,
2108 ++ "wrais ending at %lu, rais_max_time %u",
2109 ++ jiffies,
2110 ++ jiffies_to_msecs(bfqq->
2111 ++ wr_cur_max_time));
2112 ++ } else if (time_before(
2113 ++ bfqq->last_wr_start_finish +
2114 ++ bfqq->wr_cur_max_time,
2115 ++ jiffies +
2116 ++ bfqd->bfq_wr_rt_max_time) &&
2117 ++ soft_rt) {
2118 ++ /*
2119 ++ *
2120 ++ * The remaining weight-raising time is lower
2121 ++ * than bfqd->bfq_wr_rt_max_time, which
2122 ++ * means that the application is enjoying
2123 ++ * weight raising either because deemed soft-
2124 ++ * rt in the near past, or because deemed
2125 ++				 * interactive long ago. In both cases,
2126 ++ * resetting now the current remaining weight-
2127 ++ * raising time for the application to the
2128 ++ * weight-raising duration for soft rt
2129 ++ * applications would not cause any latency
2130 ++ * increase for the application (as the new
2131 ++ * duration would be higher than the remaining
2132 ++ * time).
2133 ++ *
2134 ++ * In addition, the application is now meeting
2135 ++ * the requirements for being deemed soft rt.
2136 ++ * In the end we can correctly and safely
2137 ++ * (re)charge the weight-raising duration for
2138 ++ * the application with the weight-raising
2139 ++ * duration for soft rt applications.
2140 ++ *
2141 ++ * In particular, doing this recharge now, i.e.,
2142 ++ * before the weight-raising period for the
2143 ++ * application finishes, reduces the probability
2144 ++ * of the following negative scenario:
2145 ++ * 1) the weight of a soft rt application is
2146 ++ * raised at startup (as for any newly
2147 ++ * created application),
2148 ++ * 2) since the application is not interactive,
2149 ++ * at a certain time weight-raising is
2150 ++ * stopped for the application,
2151 ++ * 3) at that time the application happens to
2152 ++ * still have pending requests, and hence
2153 ++ * is destined to not have a chance to be
2154 ++ * deemed soft rt before these requests are
2155 ++ * completed (see the comments to the
2156 ++ * function bfq_bfqq_softrt_next_start()
2157 ++ * for details on soft rt detection),
2158 ++ * 4) these pending requests experience a high
2159 ++ * latency because the application is not
2160 ++ * weight-raised while they are pending.
2161 ++ */
2162 ++ bfqq->last_wr_start_finish = jiffies;
2163 ++ bfqq->wr_cur_max_time =
2164 ++ bfqd->bfq_wr_rt_max_time;
2165 ++ }
2166 ++ }
2167 ++ if (old_wr_coeff != bfqq->wr_coeff)
2168 ++ entity->ioprio_changed = 1;
2169 ++add_bfqq_busy:
2170 ++ bfqq->last_idle_bklogged = jiffies;
2171 ++ bfqq->service_from_backlogged = 0;
2172 ++ bfq_clear_bfqq_softrt_update(bfqq);
2173 ++ bfq_add_bfqq_busy(bfqd, bfqq);
2174 ++ } else {
2175 ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
2176 ++ time_is_before_jiffies(
2177 ++ bfqq->last_wr_start_finish +
2178 ++ bfqd->bfq_wr_min_inter_arr_async)) {
2179 ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
2180 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2181 ++
2182 ++ bfqd->wr_busy_queues++;
2183 ++ entity->ioprio_changed = 1;
2184 ++ bfq_log_bfqq(bfqd, bfqq,
2185 ++ "non-idle wrais starting at %lu, rais_max_time %u",
2186 ++ jiffies,
2187 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
2188 ++ }
2189 ++ if (prev != bfqq->next_rq)
2190 ++ bfq_updated_next_req(bfqd, bfqq);
2191 ++ }
2192 ++
2193 ++ if (bfqd->low_latency &&
2194 ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
2195 ++ bfqq->last_wr_start_finish = jiffies;
2196 ++}
2197 ++
2198 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
2199 ++ struct bio *bio)
2200 ++{
2201 ++ struct task_struct *tsk = current;
2202 ++ struct bfq_io_cq *bic;
2203 ++ struct bfq_queue *bfqq;
2204 ++
2205 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
2206 ++ if (bic == NULL)
2207 ++ return NULL;
2208 ++
2209 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2210 ++ if (bfqq != NULL)
2211 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
2212 ++
2213 ++ return NULL;
2214 ++}
2215 ++
2216 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
2217 ++{
2218 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2219 ++
2220 ++ bfqd->rq_in_driver++;
2221 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2222 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
2223 ++ (long long unsigned)bfqd->last_position);
2224 ++}
2225 ++
2226 ++static inline void bfq_deactivate_request(struct request_queue *q,
2227 ++ struct request *rq)
2228 ++{
2229 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2230 ++
2231 ++ BUG_ON(bfqd->rq_in_driver == 0);
2232 ++ bfqd->rq_in_driver--;
2233 ++}
2234 ++
2235 ++static void bfq_remove_request(struct request *rq)
2236 ++{
2237 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2238 ++ struct bfq_data *bfqd = bfqq->bfqd;
2239 ++ const int sync = rq_is_sync(rq);
2240 ++
2241 ++ if (bfqq->next_rq == rq) {
2242 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
2243 ++ bfq_updated_next_req(bfqd, bfqq);
2244 ++ }
2245 ++
2246 ++ if (rq->queuelist.prev != &rq->queuelist)
2247 ++ list_del_init(&rq->queuelist);
2248 ++ BUG_ON(bfqq->queued[sync] == 0);
2249 ++ bfqq->queued[sync]--;
2250 ++ bfqd->queued--;
2251 ++ elv_rb_del(&bfqq->sort_list, rq);
2252 ++
2253 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2254 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
2255 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2256 ++ /*
2257 ++ * Remove queue from request-position tree as it is empty.
2258 ++ */
2259 ++ if (bfqq->pos_root != NULL) {
2260 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
2261 ++ bfqq->pos_root = NULL;
2262 ++ }
2263 ++ }
2264 ++
2265 ++ if (rq->cmd_flags & REQ_META) {
2266 ++ BUG_ON(bfqq->meta_pending == 0);
2267 ++ bfqq->meta_pending--;
2268 ++ }
2269 ++}
2270 ++
2271 ++static int bfq_merge(struct request_queue *q, struct request **req,
2272 ++ struct bio *bio)
2273 ++{
2274 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2275 ++ struct request *__rq;
2276 ++
2277 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
2278 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
2279 ++ *req = __rq;
2280 ++ return ELEVATOR_FRONT_MERGE;
2281 ++ }
2282 ++
2283 ++ return ELEVATOR_NO_MERGE;
2284 ++}
2285 ++
2286 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
2287 ++ int type)
2288 ++{
2289 ++ if (type == ELEVATOR_FRONT_MERGE &&
2290 ++ rb_prev(&req->rb_node) &&
2291 ++ blk_rq_pos(req) <
2292 ++ blk_rq_pos(container_of(rb_prev(&req->rb_node),
2293 ++ struct request, rb_node))) {
2294 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
2295 ++ struct bfq_data *bfqd = bfqq->bfqd;
2296 ++ struct request *prev, *next_rq;
2297 ++
2298 ++ /* Reposition request in its sort_list */
2299 ++ elv_rb_del(&bfqq->sort_list, req);
2300 ++ elv_rb_add(&bfqq->sort_list, req);
2301 ++ /* Choose next request to be served for bfqq */
2302 ++ prev = bfqq->next_rq;
2303 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
2304 ++ bfqd->last_position);
2305 ++ BUG_ON(next_rq == NULL);
2306 ++ bfqq->next_rq = next_rq;
2307 ++ /*
2308 ++ * If next_rq changes, update both the queue's budget to
2309 ++ * fit the new request and the queue's position in its
2310 ++ * rq_pos_tree.
2311 ++ */
2312 ++ if (prev != bfqq->next_rq) {
2313 ++ bfq_updated_next_req(bfqd, bfqq);
2314 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2315 ++ }
2316 ++ }
2317 ++}
2318 ++
2319 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
2320 ++ struct request *next)
2321 ++{
2322 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
2323 ++
2324 ++ /*
2325 ++ * If next and rq belong to the same bfq_queue and next is older
2326 ++ * than rq, then reposition rq in the fifo (by substituting next
2327 ++ * with rq). Otherwise, if next and rq belong to different
2328 ++ * bfq_queues, never reposition rq: in fact, we would have to
2329 ++ * reposition it with respect to next's position in its own fifo,
2330 ++ * which would most certainly be too expensive with respect to
2331 ++ * the benefits.
2332 ++ */
2333 ++ if (bfqq == next_bfqq &&
2334 ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2335 ++ time_before(next->fifo_time, rq->fifo_time)) {
2336 ++ list_del_init(&rq->queuelist);
2337 ++ list_replace_init(&next->queuelist, &rq->queuelist);
2338 ++ rq->fifo_time = next->fifo_time;
2339 ++ }
2340 ++
2341 ++ if (bfqq->next_rq == next)
2342 ++ bfqq->next_rq = rq;
2343 ++
2344 ++ bfq_remove_request(next);
2345 ++}
2346 ++
2347 ++/* Must be called with bfqq != NULL */
2348 ++static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
2349 ++{
2350 ++ BUG_ON(bfqq == NULL);
2351 ++ if (bfq_bfqq_busy(bfqq))
2352 ++ bfqq->bfqd->wr_busy_queues--;
2353 ++ bfqq->wr_coeff = 1;
2354 ++ bfqq->wr_cur_max_time = 0;
2355 ++ /* Trigger a weight change on the next activation of the queue */
2356 ++ bfqq->entity.ioprio_changed = 1;
2357 ++}
2358 ++
2359 ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
2360 ++ struct bfq_group *bfqg)
2361 ++{
2362 ++ int i, j;
2363 ++
2364 ++ for (i = 0; i < 2; i++)
2365 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
2366 ++ if (bfqg->async_bfqq[i][j] != NULL)
2367 ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
2368 ++ if (bfqg->async_idle_bfqq != NULL)
2369 ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
2370 ++}
2371 ++
2372 ++static void bfq_end_wr(struct bfq_data *bfqd)
2373 ++{
2374 ++ struct bfq_queue *bfqq;
2375 ++
2376 ++ spin_lock_irq(bfqd->queue->queue_lock);
2377 ++
2378 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
2379 ++ bfq_bfqq_end_wr(bfqq);
2380 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2381 ++ bfq_bfqq_end_wr(bfqq);
2382 ++ bfq_end_wr_async(bfqd);
2383 ++
2384 ++ spin_unlock_irq(bfqd->queue->queue_lock);
2385 ++}
2386 ++
2387 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
2388 ++ struct bio *bio)
2389 ++{
2390 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2391 ++ struct bfq_io_cq *bic;
2392 ++ struct bfq_queue *bfqq;
2393 ++
2394 ++ /*
2395 ++ * Disallow merge of a sync bio into an async request.
2396 ++ */
2397 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
2398 ++ return 0;
2399 ++
2400 ++ /*
2401 ++ * Lookup the bfqq that this bio will be queued with. Allow
2402 ++ * merge only if rq is queued there.
2403 ++ * Queue lock is held here.
2404 ++ */
2405 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
2406 ++ if (bic == NULL)
2407 ++ return 0;
2408 ++
2409 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2410 ++ return bfqq == RQ_BFQQ(rq);
2411 ++}
2412 ++
2413 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2414 ++ struct bfq_queue *bfqq)
2415 ++{
2416 ++ if (bfqq != NULL) {
2417 ++ bfq_mark_bfqq_must_alloc(bfqq);
2418 ++ bfq_mark_bfqq_budget_new(bfqq);
2419 ++ bfq_clear_bfqq_fifo_expire(bfqq);
2420 ++
2421 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
2422 ++
2423 ++ bfq_log_bfqq(bfqd, bfqq,
2424 ++ "set_in_service_queue, cur-budget = %lu",
2425 ++ bfqq->entity.budget);
2426 ++ }
2427 ++
2428 ++ bfqd->in_service_queue = bfqq;
2429 ++}
2430 ++
2431 ++/*
2432 ++ * Get and set a new queue for service.
2433 ++ */
2434 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
2435 ++ struct bfq_queue *bfqq)
2436 ++{
2437 ++ if (!bfqq)
2438 ++ bfqq = bfq_get_next_queue(bfqd);
2439 ++ else
2440 ++ bfq_get_next_queue_forced(bfqd, bfqq);
2441 ++
2442 ++ __bfq_set_in_service_queue(bfqd, bfqq);
2443 ++ return bfqq;
2444 ++}
2445 ++
2446 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2447 ++ struct request *rq)
2448 ++{
2449 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
2450 ++ return blk_rq_pos(rq) - bfqd->last_position;
2451 ++ else
2452 ++ return bfqd->last_position - blk_rq_pos(rq);
2453 ++}
2454 ++
2455 ++/*
2456 ++ * Return true if bfqq has no request pending and rq is close enough to
2457 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
2458 ++ * bfqq->next_rq
2459 ++ */
2460 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2461 ++{
2462 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2463 ++}
2464 ++
2465 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2466 ++{
2467 ++ struct rb_root *root = &bfqd->rq_pos_tree;
2468 ++ struct rb_node *parent, *node;
2469 ++ struct bfq_queue *__bfqq;
2470 ++ sector_t sector = bfqd->last_position;
2471 ++
2472 ++ if (RB_EMPTY_ROOT(root))
2473 ++ return NULL;
2474 ++
2475 ++ /*
2476 ++ * First, if we find a request starting at the end of the last
2477 ++ * request, choose it.
2478 ++ */
2479 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2480 ++ if (__bfqq != NULL)
2481 ++ return __bfqq;
2482 ++
2483 ++ /*
2484 ++ * If the exact sector wasn't found, the parent of the NULL leaf
2485 ++ * will contain the closest sector (rq_pos_tree sorted by
2486 ++ * next_request position).
2487 ++ */
2488 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2489 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2490 ++ return __bfqq;
2491 ++
2492 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
2493 ++ node = rb_next(&__bfqq->pos_node);
2494 ++ else
2495 ++ node = rb_prev(&__bfqq->pos_node);
2496 ++ if (node == NULL)
2497 ++ return NULL;
2498 ++
2499 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2500 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2501 ++ return __bfqq;
2502 ++
2503 ++ return NULL;
2504 ++}
2505 ++
2506 ++/*
2507 ++ * bfqd - obvious
2508 ++ * cur_bfqq - passed in so that we don't decide that the current queue
2509 ++ * is closely cooperating with itself.
2510 ++ *
2511 ++ * We are assuming that cur_bfqq has dispatched at least one request,
2512 ++ * and that bfqd->last_position reflects a position on the disk associated
2513 ++ * with the I/O issued by cur_bfqq.
2514 ++ */
2515 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2516 ++ struct bfq_queue *cur_bfqq)
2517 ++{
2518 ++ struct bfq_queue *bfqq;
2519 ++
2520 ++ if (bfq_class_idle(cur_bfqq))
2521 ++ return NULL;
2522 ++ if (!bfq_bfqq_sync(cur_bfqq))
2523 ++ return NULL;
2524 ++ if (BFQQ_SEEKY(cur_bfqq))
2525 ++ return NULL;
2526 ++
2527 ++ /* If device has only one backlogged bfq_queue, don't search. */
2528 ++ if (bfqd->busy_queues == 1)
2529 ++ return NULL;
2530 ++
2531 ++ /*
2532 ++ * We should notice if some of the queues are cooperating, e.g.
2533 ++ * working closely on the same area of the disk. In that case,
2534 ++ * we can group them together and don't waste time idling.
2535 ++ */
2536 ++ bfqq = bfqq_close(bfqd);
2537 ++ if (bfqq == NULL || bfqq == cur_bfqq)
2538 ++ return NULL;
2539 ++
2540 ++ /*
2541 ++ * Do not merge queues from different bfq_groups.
2542 ++ */
2543 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2544 ++ return NULL;
2545 ++
2546 ++ /*
2547 ++ * It only makes sense to merge sync queues.
2548 ++ */
2549 ++ if (!bfq_bfqq_sync(bfqq))
2550 ++ return NULL;
2551 ++ if (BFQQ_SEEKY(bfqq))
2552 ++ return NULL;
2553 ++
2554 ++ /*
2555 ++ * Do not merge queues of different priority classes.
2556 ++ */
2557 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2558 ++ return NULL;
2559 ++
2560 ++ return bfqq;
2561 ++}
2562 ++
2563 ++/*
2564 ++ * If enough samples have been computed, return the current max budget
2565 ++ * stored in bfqd, which is dynamically updated according to the
2566 ++ * estimated disk peak rate; otherwise return the default max budget
2567 ++ */
2568 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2569 ++{
2570 ++ if (bfqd->budgets_assigned < 194)
2571 ++ return bfq_default_max_budget;
2572 ++ else
2573 ++ return bfqd->bfq_max_budget;
2574 ++}
2575 ++
2576 ++/*
2577 ++ * Return min budget, which is a fraction of the current or default
2578 ++ * max budget (trying with 1/32)
2579 ++ */
2580 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2581 ++{
2582 ++ if (bfqd->budgets_assigned < 194)
2583 ++ return bfq_default_max_budget / 32;
2584 ++ else
2585 ++ return bfqd->bfq_max_budget / 32;
2586 ++}
2587 ++
2588 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2589 ++{
2590 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
2591 ++ struct bfq_io_cq *bic;
2592 ++ unsigned long sl;
2593 ++
2594 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2595 ++
2596 ++ /* Processes have exited, don't wait. */
2597 ++ bic = bfqd->in_service_bic;
2598 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2599 ++ return;
2600 ++
2601 ++ bfq_mark_bfqq_wait_request(bfqq);
2602 ++
2603 ++ /*
2604 ++ * We don't want to idle for seeks, but we do want to allow
2605 ++ * fair distribution of slice time for a process doing back-to-back
2606 ++	 * seeks. So allow a little bit of time for it to submit a new rq.
2607 ++ *
2608 ++ * To prevent processes with (partly) seeky workloads from
2609 ++ * being too ill-treated, grant them a small fraction of the
2610 ++ * assigned budget before reducing the waiting time to
2611 ++ * BFQ_MIN_TT. This happened to help reduce latency.
2612 ++ */
2613 ++ sl = bfqd->bfq_slice_idle;
2614 ++ /*
2615 ++ * Unless the queue is being weight-raised or the scenario is
2616 ++ * asymmetric, grant only minimum idle time if the queue either
2617 ++ * has been seeky for long enough or has already proved to be
2618 ++ * constantly seeky.
2619 ++ */
2620 ++ if (bfq_sample_valid(bfqq->seek_samples) &&
2621 ++ ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
2622 ++ bfq_max_budget(bfqq->bfqd) / 8) ||
2623 ++ bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
2624 ++ symmetric_scenario)
2625 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2626 ++ else if (bfqq->wr_coeff > 1)
2627 ++ sl = sl * 3;
2628 ++ bfqd->last_idling_start = ktime_get();
2629 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2630 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
2631 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2632 ++}
2633 ++
2634 ++/*
2635 ++ * Set the maximum time for the in-service queue to consume its
2636 ++ * budget. This prevents seeky processes from lowering the disk
2637 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
2638 ++ */
2639 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2640 ++{
2641 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
2642 ++ unsigned int timeout_coeff;
2643 ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
2644 ++ timeout_coeff = 1;
2645 ++ else
2646 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2647 ++
2648 ++ bfqd->last_budget_start = ktime_get();
2649 ++
2650 ++ bfq_clear_bfqq_budget_new(bfqq);
2651 ++ bfqq->budget_timeout = jiffies +
2652 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2653 ++
2654 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2655 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2656 ++ timeout_coeff));
2657 ++}
2658 ++
2659 ++/*
2660 ++ * Move request from internal lists to the request queue dispatch list.
2661 ++ */
2662 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2663 ++{
2664 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2665 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2666 ++
2667 ++ /*
2668 ++ * For consistency, the next instruction should have been executed
2669 ++ * after removing the request from the queue and dispatching it.
2670 ++	 * We instead execute this instruction before bfq_remove_request()
2671 ++	 * (and hence introduce a temporary inconsistency), for efficiency.
2672 ++	 * In fact, in a forced_dispatch, this prevents two counters related
2673 ++	 * to bfqq->dispatched from being uselessly decremented if bfqq
2674 ++	 * is not in service, and then incremented again after
2675 ++	 * incrementing bfqq->dispatched.
2676 ++ */
2677 ++ bfqq->dispatched++;
2678 ++ bfq_remove_request(rq);
2679 ++ elv_dispatch_sort(q, rq);
2680 ++
2681 ++ if (bfq_bfqq_sync(bfqq))
2682 ++ bfqd->sync_flight++;
2683 ++}
2684 ++
2685 ++/*
2686 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
2687 ++ */
2688 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2689 ++{
2690 ++ struct request *rq = NULL;
2691 ++
2692 ++ if (bfq_bfqq_fifo_expire(bfqq))
2693 ++ return NULL;
2694 ++
2695 ++ bfq_mark_bfqq_fifo_expire(bfqq);
2696 ++
2697 ++ if (list_empty(&bfqq->fifo))
2698 ++ return NULL;
2699 ++
2700 ++ rq = rq_entry_fifo(bfqq->fifo.next);
2701 ++
2702 ++ if (time_before(jiffies, rq->fifo_time))
2703 ++ return NULL;
2704 ++
2705 ++ return rq;
2706 ++}
2707 ++
2708 ++/* Must be called with the queue_lock held. */
2709 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
2710 ++{
2711 ++ int process_refs, io_refs;
2712 ++
2713 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2714 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2715 ++ BUG_ON(process_refs < 0);
2716 ++ return process_refs;
2717 ++}
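/*
 * Illustrative note, not part of the BFQ patch: with hypothetical
 * numbers, a queue whose total reference count is 5, with 2 + 1
 * references held by allocated read/write requests and 1 implied by the
 * entity being on a service tree (on_st), has
 * process_refs = 5 - 3 - 1 = 1, i.e. exactly one process still owns it.
 */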
2718 ++
2719 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2720 ++{
2721 ++ int process_refs, new_process_refs;
2722 ++ struct bfq_queue *__bfqq;
2723 ++
2724 ++ /*
2725 ++ * If there are no process references on the new_bfqq, then it is
2726 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2727 ++ * may have dropped their last reference (not just their last process
2728 ++ * reference).
2729 ++ */
2730 ++ if (!bfqq_process_refs(new_bfqq))
2731 ++ return;
2732 ++
2733 ++ /* Avoid a circular list and skip interim queue merges. */
2734 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
2735 ++ if (__bfqq == bfqq)
2736 ++ return;
2737 ++ new_bfqq = __bfqq;
2738 ++ }
2739 ++
2740 ++ process_refs = bfqq_process_refs(bfqq);
2741 ++ new_process_refs = bfqq_process_refs(new_bfqq);
2742 ++ /*
2743 ++ * If the process for the bfqq has gone away, there is no
2744 ++ * sense in merging the queues.
2745 ++ */
2746 ++ if (process_refs == 0 || new_process_refs == 0)
2747 ++ return;
2748 ++
2749 ++ /*
2750 ++ * Merge in the direction of the lesser amount of work.
2751 ++ */
2752 ++ if (new_process_refs >= process_refs) {
2753 ++ bfqq->new_bfqq = new_bfqq;
2754 ++ atomic_add(process_refs, &new_bfqq->ref);
2755 ++ } else {
2756 ++ new_bfqq->new_bfqq = bfqq;
2757 ++ atomic_add(new_process_refs, &bfqq->ref);
2758 ++ }
2759 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2760 ++ new_bfqq->pid);
2761 ++}
2762 ++
2763 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2764 ++{
2765 ++ struct bfq_entity *entity = &bfqq->entity;
2766 ++ return entity->budget - entity->service;
2767 ++}
2768 ++
2769 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2770 ++{
2771 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2772 ++
2773 ++ __bfq_bfqd_reset_in_service(bfqd);
2774 ++
2775 ++ /*
2776 ++ * If this bfqq is shared between multiple processes, check
2777 ++ * to make sure that those processes are still issuing I/Os
2778 ++ * within the mean seek distance. If not, it may be time to
2779 ++ * break the queues apart again.
2780 ++ */
2781 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2782 ++ bfq_mark_bfqq_split_coop(bfqq);
2783 ++
2784 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2785 ++ /*
2786 ++ * Overloading budget_timeout field to store the time
2787 ++ * at which the queue remains with no backlog; used by
2788 ++ * the weight-raising mechanism.
2789 ++ */
2790 ++ bfqq->budget_timeout = jiffies;
2791 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2792 ++ } else {
2793 ++ bfq_activate_bfqq(bfqd, bfqq);
2794 ++ /*
2795 ++ * Resort priority tree of potential close cooperators.
2796 ++ */
2797 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2798 ++ }
2799 ++}
2800 ++
2801 ++/**
2802 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2803 ++ * @bfqd: device data.
2804 ++ * @bfqq: queue to update.
2805 ++ * @reason: reason for expiration.
2806 ++ *
2807 ++ * Handle the feedback on @bfqq budget. See the body for detailed
2808 ++ * comments.
2809 ++ */
2810 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2811 ++ struct bfq_queue *bfqq,
2812 ++ enum bfqq_expiration reason)
2813 ++{
2814 ++ struct request *next_rq;
2815 ++ unsigned long budget, min_budget;
2816 ++
2817 ++ budget = bfqq->max_budget;
2818 ++ min_budget = bfq_min_budget(bfqd);
2819 ++
2820 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2821 ++
2822 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2823 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2824 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2825 ++ budget, bfq_min_budget(bfqd));
2826 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2827 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2828 ++
2829 ++ if (bfq_bfqq_sync(bfqq)) {
2830 ++ switch (reason) {
2831 ++ /*
2832 ++ * Caveat: in all the following cases we trade latency
2833 ++ * for throughput.
2834 ++ */
2835 ++ case BFQ_BFQQ_TOO_IDLE:
2836 ++ /*
2837 ++ * This is the only case where we may reduce
2838 ++ * the budget: if there is no request of the
2839 ++ * process still waiting for completion, then
2840 ++ * we assume (tentatively) that the timer has
2841 ++ * expired because the batch of requests of
2842 ++ * the process could have been served with a
2843 ++			 * smaller budget. Hence, betting that the
2844 ++ * process will behave in the same way when it
2845 ++ * becomes backlogged again, we reduce its
2846 ++ * next budget. As long as we guess right,
2847 ++ * this budget cut reduces the latency
2848 ++ * experienced by the process.
2849 ++ *
2850 ++ * However, if there are still outstanding
2851 ++ * requests, then the process may have not yet
2852 ++ * issued its next request just because it is
2853 ++ * still waiting for the completion of some of
2854 ++ * the still outstanding ones. So in this
2855 ++ * subcase we do not reduce its budget, on the
2856 ++ * contrary we increase it to possibly boost
2857 ++ * the throughput, as discussed in the
2858 ++ * comments to the BUDGET_TIMEOUT case.
2859 ++ */
2860 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
2861 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2862 ++ else {
2863 ++ if (budget > 5 * min_budget)
2864 ++ budget -= 4 * min_budget;
2865 ++ else
2866 ++ budget = min_budget;
2867 ++ }
2868 ++ break;
2869 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
2870 ++ /*
2871 ++ * We double the budget here because: 1) it
2872 ++ * gives the chance to boost the throughput if
2873 ++ * this is not a seeky process (which may have
2874 ++ * bumped into this timeout because of, e.g.,
2875 ++ * ZBR), 2) together with charge_full_budget
2876 ++ * it helps give seeky processes higher
2877 ++ * timestamps, and hence be served less
2878 ++ * frequently.
2879 ++ */
2880 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2881 ++ break;
2882 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2883 ++ /*
2884 ++ * The process still has backlog, and did not
2885 ++ * let either the budget timeout or the disk
2886 ++ * idling timeout expire. Hence it is not
2887 ++ * seeky, has a short thinktime and may be
2888 ++ * happy with a higher budget too. So
2889 ++ * definitely increase the budget of this good
2890 ++ * candidate to boost the disk throughput.
2891 ++ */
2892 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
2893 ++ break;
2894 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
2895 ++ /*
2896 ++ * Leave the budget unchanged.
2897 ++ */
2898 ++ default:
2899 ++ return;
2900 ++ }
2901 ++ } else /* async queue */
2902 ++		/* async queues always get the maximum possible budget
2903 ++ * (their ability to dispatch is limited by
2904 ++ * @bfqd->bfq_max_budget_async_rq).
2905 ++ */
2906 ++ budget = bfqd->bfq_max_budget;
2907 ++
2908 ++ bfqq->max_budget = budget;
2909 ++
2910 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2911 ++ bfqq->max_budget > bfqd->bfq_max_budget)
2912 ++ bfqq->max_budget = bfqd->bfq_max_budget;
2913 ++
2914 ++ /*
2915 ++ * Make sure that we have enough budget for the next request.
2916 ++ * Since the finish time of the bfqq must be kept in sync with
2917 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
2918 ++ * update.
2919 ++ */
2920 ++ next_rq = bfqq->next_rq;
2921 ++ if (next_rq != NULL)
2922 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2923 ++ bfq_serv_to_charge(next_rq, bfqq));
2924 ++ else
2925 ++ bfqq->entity.budget = bfqq->max_budget;
2926 ++
2927 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2928 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2929 ++ bfqq->entity.budget);
2930 ++}
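/*
 * Illustrative note, not part of the BFQ patch: the budget feedback
 * above applied to plain numbers. Hypothetical values:
 * bfq_max_budget = 16384, min_budget = 16384 / 32 = 512, and a queue
 * whose current max_budget is 4096.
 *
 *   TOO_IDLE, no request outstanding:  4096 - 4 * 512        = 2048
 *   TOO_IDLE, requests outstanding:    min(4096 * 2, 16384)  = 8192
 *   BUDGET_TIMEOUT:                    min(4096 * 2, 16384)  = 8192
 *   BUDGET_EXHAUSTED:                  min(4096 * 4, 16384)  = 16384
 *   NO_MORE_REQUESTS:                  4096 (unchanged)
 */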
2931 ++
2932 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2933 ++{
2934 ++ unsigned long max_budget;
2935 ++
2936 ++ /*
2937 ++ * The max_budget calculated when autotuning is equal to the
2938 ++	 * number of sectors transferred in timeout_sync at the
2939 ++ * estimated peak rate.
2940 ++ */
2941 ++ max_budget = (unsigned long)(peak_rate * 1000 *
2942 ++ timeout >> BFQ_RATE_SHIFT);
2943 ++
2944 ++ return max_budget;
2945 ++}
2946 ++
2947 ++/*
2948 ++ * In addition to updating the peak rate, checks whether the process
2949 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
2950 ++ * to the budget timeout, to reduce the amount of service provided to
2951 ++ * seeky processes, and hence reduce their chances of lowering the
2952 ++ * throughput. See the code for more details.
2953 ++ */
2954 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2955 ++ int compensate, enum bfqq_expiration reason)
2956 ++{
2957 ++ u64 bw, usecs, expected, timeout;
2958 ++ ktime_t delta;
2959 ++ int update = 0;
2960 ++
2961 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2962 ++ return 0;
2963 ++
2964 ++ if (compensate)
2965 ++ delta = bfqd->last_idling_start;
2966 ++ else
2967 ++ delta = ktime_get();
2968 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
2969 ++ usecs = ktime_to_us(delta);
2970 ++
2971 ++ /* Don't trust short/unrealistic values. */
2972 ++ if (usecs < 100 || usecs >= LONG_MAX)
2973 ++ return 0;
2974 ++
2975 ++ /*
2976 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
2977 ++ * value to store the peak rate, in sectors per usec in fixed
2978 ++ * point math. We do so to have enough precision in the estimate
2979 ++ * and to avoid overflows.
2980 ++ */
2981 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2982 ++ do_div(bw, (unsigned long)usecs);
2983 ++
2984 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2985 ++
2986 ++ /*
2987 ++ * Use only long (> 20ms) intervals to filter out spikes for
2988 ++ * the peak rate estimation.
2989 ++ */
2990 ++ if (usecs > 20000) {
2991 ++ if (bw > bfqd->peak_rate ||
2992 ++ (!BFQQ_SEEKY(bfqq) &&
2993 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2994 ++ bfq_log(bfqd, "measured bw =%llu", bw);
2995 ++ /*
2996 ++ * To smooth oscillations use a low-pass filter with
2997 ++ * alpha=7/8, i.e.,
2998 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
2999 ++ */
3000 ++ do_div(bw, 8);
3001 ++ if (bw == 0)
3002 ++ return 0;
3003 ++ bfqd->peak_rate *= 7;
3004 ++ do_div(bfqd->peak_rate, 8);
3005 ++ bfqd->peak_rate += bw;
3006 ++ update = 1;
3007 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
3008 ++ }
3009 ++
3010 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
3011 ++
3012 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
3013 ++ bfqd->peak_rate_samples++;
3014 ++
3015 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
3016 ++ update) {
3017 ++ int dev_type = blk_queue_nonrot(bfqd->queue);
3018 ++ if (bfqd->bfq_user_max_budget == 0) {
3019 ++ bfqd->bfq_max_budget =
3020 ++ bfq_calc_max_budget(bfqd->peak_rate,
3021 ++ timeout);
3022 ++ bfq_log(bfqd, "new max_budget=%lu",
3023 ++ bfqd->bfq_max_budget);
3024 ++ }
3025 ++ if (bfqd->device_speed == BFQ_BFQD_FAST &&
3026 ++ bfqd->peak_rate < device_speed_thresh[dev_type]) {
3027 ++ bfqd->device_speed = BFQ_BFQD_SLOW;
3028 ++ bfqd->RT_prod = R_slow[dev_type] *
3029 ++ T_slow[dev_type];
3030 ++ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
3031 ++ bfqd->peak_rate > device_speed_thresh[dev_type]) {
3032 ++ bfqd->device_speed = BFQ_BFQD_FAST;
3033 ++ bfqd->RT_prod = R_fast[dev_type] *
3034 ++ T_fast[dev_type];
3035 ++ }
3036 ++ }
3037 ++ }
3038 ++
3039 ++ /*
3040 ++	 * If the process has been served for too short a time
3041 ++	 * interval to let its possible sequential accesses prevail over
3042 ++	 * the initial seek time needed to move the disk head to the
3043 ++	 * first sector it requested, then give the process a chance
3044 ++ * and for the moment return false.
3045 ++ */
3046 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
3047 ++ return 0;
3048 ++
3049 ++ /*
3050 ++ * A process is considered ``slow'' (i.e., seeky, so that we
3051 ++ * cannot treat it fairly in the service domain, as it would
3052 ++	 * slow down the other processes too much) if, when a slice
3053 ++ * ends for whatever reason, it has received service at a
3054 ++ * rate that would not be high enough to complete the budget
3055 ++ * before the budget timeout expiration.
3056 ++ */
3057 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
3058 ++
3059 ++ /*
3060 ++ * Caveat: processes doing IO in the slower disk zones will
3061 ++ * tend to be slow(er) even if not seeky. And the estimated
3062 ++ * peak rate will actually be an average over the disk
3063 ++ * surface. Hence, to not be too harsh with unlucky processes,
3064 ++ * we keep a budget/3 margin of safety before declaring a
3065 ++ * process slow.
3066 ++ */
3067 ++ return expected > (4 * bfqq->entity.budget) / 3;
3068 ++}
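/*
 * Illustrative sketch, not part of the BFQ patch: the low-pass filter
 * used above for the peak-rate estimate, on plain integers and with the
 * same integer divisions as in the patch:
 * new_rate = (7/8) * old_rate + (1/8) * sample. Values are hypothetical.
 */
#include <stdio.h>

static unsigned long long filter_peak_rate(unsigned long long old_rate,
					   unsigned long long sample)
{
	sample /= 8;			/* (1/8) * sample */
	if (sample == 0)
		return old_rate;	/* discard negligible samples */
	return old_rate * 7 / 8 + sample;
}

int main(void)
{
	/* old estimate 800, new measurement 1600: 700 + 200 = 900 */
	printf("%llu\n", filter_peak_rate(800, 1600));
	return 0;
}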
3069 ++
3070 ++/*
3071 ++ * To be deemed as soft real-time, an application must meet two
3072 ++ * requirements. First, the application must not require an average
3073 ++ * bandwidth higher than the approximate bandwidth required to play back or
3074 ++ * record a compressed high-definition video.
3075 ++ * The next function is invoked on the completion of the last request of a
3076 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
3077 ++ * that, if the next request of the application does not arrive before
3078 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
3079 ++ *
3080 ++ * The second requirement is that the request pattern of the application is
3081 ++ * isochronous, i.e., that, after issuing a request or a batch of requests,
3082 ++ * the application stops issuing new requests until all its pending requests
3083 ++ * have been completed. After that, the application may issue a new batch,
3084 ++ * and so on.
3085 ++ * For this reason the next function is invoked to compute
3086 ++ * soft_rt_next_start only for applications that meet this requirement,
3087 ++ * whereas soft_rt_next_start is set to infinity for applications that do
3088 ++ * not.
3089 ++ *
3090 ++ * Unfortunately, even a greedy application may happen to behave in an
3091 ++ * isochronous way if the CPU load is high. In fact, the application may
3092 ++ * stop issuing requests while the CPUs are busy serving other processes,
3093 ++ * then restart, then stop again for a while, and so on. In addition, if
3094 ++ * the disk achieves a low enough throughput with the request pattern
3095 ++ * issued by the application (e.g., because the request pattern is random
3096 ++ * and/or the device is slow), then the application may meet the above
3097 ++ * bandwidth requirement too. To prevent such a greedy application from
3098 ++ * being deemed soft real-time, a further rule is used in the computation of
3099 ++ * soft_rt_next_start: soft_rt_next_start must be higher than the current
3100 ++ * time plus the maximum time the scheduler waits for the arrival of a new
3101 ++ * request when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
3102 ++ * This filters out greedy applications, as the latter issue instead their
3103 ++ * next request as soon as possible after the last one has been completed
3104 ++ * (in contrast, when a batch of requests is completed, a soft real-time
3105 ++ * application spends some time processing data).
3106 ++ *
3107 ++ * Unfortunately, the last filter may easily generate false positives if
3108 ++ * only bfqd->bfq_slice_idle is used as a reference time interval and one
3109 ++ * or both the following cases occur:
3110 ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
3111 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
3112 ++ * HZ=100.
3113 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
3114 ++ * for a while, then suddenly 'jump' by several units to recover the lost
3115 ++ * increments. This seems to happen, e.g., inside virtual machines.
3116 ++ * To address this issue, we do not use as a reference time interval just
3117 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
3118 ++ * particular we add the minimum number of jiffies for which the filter
3119 ++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
3120 ++ * machines.
3121 ++ */
3122 ++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
3123 ++ struct bfq_queue *bfqq)
3124 ++{
3125 ++ return max(bfqq->last_idle_bklogged +
3126 ++ HZ * bfqq->service_from_backlogged /
3127 ++ bfqd->bfq_wr_max_softrt_rate,
3128 ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
3129 ++}
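
As a concrete illustration of the formula in bfq_bfqq_softrt_next_start() above, the sketch below plugs in made-up numbers: HZ, the soft real-time rate cap, the idle slice and the jiffy values are all assumptions chosen only to show how the bandwidth bound and the greedy-application bound combine through max().

#include <stdio.h>

#define HZ 250UL	/* assumed tick rate for this example */

/*
 * Sketch of the soft_rt_next_start computation: the next batch must not
 * start before the instant at which the bandwidth consumed since the queue
 * last became idle-and-backlogged drops to the soft real-time threshold,
 * and never before "now + idle slice + a few ticks".
 */
static unsigned long softrt_next_start(unsigned long jiffies_now,
				       unsigned long last_idle_bklogged,
				       unsigned long service_sectors,
				       unsigned long max_softrt_rate, /* sectors/sec */
				       unsigned long slice_idle)      /* jiffies */
{
	unsigned long bw_bound = last_idle_bklogged +
		HZ * service_sectors / max_softrt_rate;
	unsigned long greedy_bound = jiffies_now + slice_idle + 4;

	return bw_bound > greedy_bound ? bw_bound : greedy_bound;
}

int main(void)
{
	/* Hypothetical numbers: 2000 sectors served since jiffy 10000,
	 * 7000 sectors/sec cap, 2-jiffy idle slice, now = jiffy 10020. */
	unsigned long next = softrt_next_start(10020, 10000, 2000, 7000, 2);

	/* 10000 + 250*2000/7000 = 10071 > 10020 + 2 + 4 = 10026 */
	printf("soft_rt_next_start = %lu\n", next);
	return 0;
}

With these invented numbers the bandwidth bound (jiffy 10071) dominates, so the application keeps its soft real-time status only if it stays quiet until then.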
3130 ++
3131 ++/*
3132 ++ * Return the largest-possible time instant such that, for as long as possible,
3133 ++ * the current time will be lower than this time instant according to the macro
3134 ++ * time_is_before_jiffies().
3135 ++ */
3136 ++static inline unsigned long bfq_infinity_from_now(unsigned long now)
3137 ++{
3138 ++ return now + ULONG_MAX / 2;
3139 ++}
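
bfq_infinity_from_now() relies on the kernel's wraparound-safe jiffies comparisons, which subtract the two values and look at the sign of the result; now + ULONG_MAX / 2 is therefore the farthest instant that still compares as being in the future. The user-space sketch below uses a simplified copy of the time_after() idea to show why that value behaves as infinity; like the kernel, it assumes the usual two's-complement conversion.

#include <stdio.h>
#include <limits.h>

/* Simplified form of the kernel's wraparound-safe comparison from
 * <linux/jiffies.h>: true if a is later than b, even across wraparound. */
#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long now = 123456UL;
	unsigned long inf = now + ULONG_MAX / 2;	/* bfq_infinity_from_now(now) */

	/* 'inf' keeps comparing as "still in the future" for almost
	 * ULONG_MAX / 2 ticks, which is why it works as infinity here. */
	printf("%d\n", time_after(now, inf));				/* 0 */
	printf("%d\n", time_after(now + ULONG_MAX / 4, inf));		/* 0 */
	printf("%d\n", time_after(now + ULONG_MAX / 2 + 1, inf));	/* 1 */
	return 0;
}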
3140 ++
3141 ++/**
3142 ++ * bfq_bfqq_expire - expire a queue.
3143 ++ * @bfqd: device owning the queue.
3144 ++ * @bfqq: the queue to expire.
3145 ++ * @compensate: if true, compensate for the time spent idling.
3146 ++ * @reason: the reason causing the expiration.
3147 ++ *
3149 ++ * If the process associated with the queue is slow (i.e., seeky), or in
3150 ++ * case of budget timeout, or, finally, if it is async, we
3151 ++ * artificially charge it an entire budget (independently of the
3152 ++ * actual service it received). As a consequence, the queue will get
3153 ++ * higher timestamps than the correct ones upon reactivation, and
3154 ++ * hence it will be rescheduled as if it had received more service
3155 ++ * than what it actually received. In the end, this class of processes
3156 ++ * will receive less service in proportion to how slowly they consume
3157 ++ * their budgets (and hence how seriously they tend to lower the
3158 ++ * throughput).
3159 ++ *
3160 ++ * In contrast, when a queue expires because it has been idling for
3161 ++ * too long or because it has exhausted its budget, we do not touch the
3162 ++ * amount of service it has received. Hence, when the queue is
3163 ++ * reactivated and its timestamps are updated, the latter will be in sync
3164 ++ * with the actual service received by the queue until expiration.
3165 ++ *
3166 ++ * Charging a full budget to the first type of queues and the exact
3167 ++ * service to the others has the effect of using the WF2Q+ policy to
3168 ++ * schedule the former on a timeslice basis, without violating the
3169 ++ * service domain guarantees of the latter.
3170 ++ */
3171 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
3172 ++ struct bfq_queue *bfqq,
3173 ++ int compensate,
3174 ++ enum bfqq_expiration reason)
3175 ++{
3176 ++ int slow;
3177 ++ BUG_ON(bfqq != bfqd->in_service_queue);
3178 ++
3179 ++ /* Update disk peak rate for autotuning and check whether the
3180 ++ * process is slow (see bfq_update_peak_rate).
3181 ++ */
3182 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
3183 ++
3184 ++ /*
3185 ++	 * As explained above, 'punish' slow (i.e., seeky), timed-out
3186 ++ * and async queues, to favor sequential sync workloads.
3187 ++ *
3188 ++ * Processes doing I/O in the slower disk zones will tend to be
3189 ++ * slow(er) even if not seeky. Hence, since the estimated peak
3190 ++ * rate is actually an average over the disk surface, these
3191 ++ * processes may timeout just for bad luck. To avoid punishing
3192 ++ * them we do not charge a full budget to a process that
3193 ++ * succeeded in consuming at least 2/3 of its budget.
3194 ++ */
3195 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
3196 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
3197 ++ bfq_bfqq_charge_full_budget(bfqq);
3198 ++
3199 ++ bfqq->service_from_backlogged += bfqq->entity.service;
3200 ++
3201 ++ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
3202 ++ !bfq_bfqq_constantly_seeky(bfqq)) {
3203 ++ bfq_mark_bfqq_constantly_seeky(bfqq);
3204 ++ if (!blk_queue_nonrot(bfqd->queue))
3205 ++ bfqd->const_seeky_busy_in_flight_queues++;
3206 ++ }
3207 ++
3208 ++ if (reason == BFQ_BFQQ_TOO_IDLE &&
3209 ++	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
3210 ++ bfq_clear_bfqq_IO_bound(bfqq);
3211 ++
3212 ++ if (bfqd->low_latency && bfqq->wr_coeff == 1)
3213 ++ bfqq->last_wr_start_finish = jiffies;
3214 ++
3215 ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
3216 ++ RB_EMPTY_ROOT(&bfqq->sort_list)) {
3217 ++ /*
3218 ++ * If we get here, and there are no outstanding requests,
3219 ++ * then the request pattern is isochronous (see the comments
3220 ++ * to the function bfq_bfqq_softrt_next_start()). Hence we
3221 ++ * can compute soft_rt_next_start. If, instead, the queue
3222 ++ * still has outstanding requests, then we have to wait
3223 ++ * for the completion of all the outstanding requests to
3224 ++ * discover whether the request pattern is actually
3225 ++ * isochronous.
3226 ++ */
3227 ++ if (bfqq->dispatched == 0)
3228 ++ bfqq->soft_rt_next_start =
3229 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3230 ++ else {
3231 ++ /*
3232 ++ * The application is still waiting for the
3233 ++ * completion of one or more requests:
3234 ++ * prevent it from possibly being incorrectly
3235 ++ * deemed as soft real-time by setting its
3236 ++ * soft_rt_next_start to infinity. In fact,
3237 ++ * without this assignment, the application
3238 ++ * would be incorrectly deemed as soft
3239 ++ * real-time if:
3240 ++ * 1) it issued a new request before the
3241 ++ * completion of all its in-flight
3242 ++ * requests, and
3243 ++ * 2) at that time, its soft_rt_next_start
3244 ++ * happened to be in the past.
3245 ++ */
3246 ++ bfqq->soft_rt_next_start =
3247 ++ bfq_infinity_from_now(jiffies);
3248 ++ /*
3249 ++ * Schedule an update of soft_rt_next_start to when
3250 ++ * the task may be discovered to be isochronous.
3251 ++ */
3252 ++ bfq_mark_bfqq_softrt_update(bfqq);
3253 ++ }
3254 ++ }
3255 ++
3256 ++ bfq_log_bfqq(bfqd, bfqq,
3257 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
3258 ++ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
3259 ++
3260 ++ /*
3261 ++ * Increase, decrease or leave budget unchanged according to
3262 ++ * reason.
3263 ++ */
3264 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
3265 ++ __bfq_bfqq_expire(bfqd, bfqq);
3266 ++}
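
The charging policy described in the comment above can be pictured with the basic WF2Q+ timestamping rule: a queue's virtual finish time advances by the charged service divided by its weight, so charging a full budget instead of the little service actually received pushes the queue further into the virtual future. The snippet below is only a schematic illustration of that effect, with invented numbers; it is not the patch's timestamp code.

#include <stdio.h>

struct queue_stamp {
	unsigned long start;	/* virtual start time */
	unsigned long finish;	/* virtual finish time */
	unsigned int weight;
};

/* Advance the finish timestamp by the charged service, weighted. */
static void charge(struct queue_stamp *q, unsigned long charged_service)
{
	q->finish = q->start + charged_service / q->weight;
}

int main(void)
{
	struct queue_stamp fair = { 0, 0, 10 };	/* charged what it consumed */
	struct queue_stamp slow = { 0, 0, 10 };	/* charged its full budget  */
	unsigned long budget = 8192, received = 1024;

	charge(&fair, received);	/* finish = 0 + 1024/10 = 102 */
	charge(&slow, budget);		/* finish = 0 + 8192/10 = 819 */

	printf("finish(actual service)=%lu finish(full budget)=%lu\n",
	       fair.finish, slow.finish);
	return 0;
}

In this toy example the slow queue's next activation lands roughly eight times further into the virtual future than the fairly charged one's, which is exactly the "less service in proportion to how slowly they consume their budgets" effect described above.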
3267 ++
3268 ++/*
3269 ++ * Budget timeout is not implemented through a dedicated timer, but
3270 ++ * just checked on request arrivals and completions, as well as on
3271 ++ * idle timer expirations.
3272 ++ */
3273 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
3274 ++{
3275 ++ if (bfq_bfqq_budget_new(bfqq) ||
3276 ++ time_before(jiffies, bfqq->budget_timeout))
3277 ++ return 0;
3278 ++ return 1;
3279 ++}
3280 ++
3281 ++/*
3282 ++ * If we expire a queue that is waiting for the arrival of a new
3283 ++ * request, we may prevent the fictitious timestamp back-shifting that
3284 ++ * allows the guarantees of the queue to be preserved (see [1] for
3285 ++ * this tricky aspect). Hence we return true only if this condition
3286 ++ * does not hold, or if the queue is slow enough to deserve only to be
3287 ++ * kicked off for preserving a high throughput.
3288 ++ */
3289 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
3290 ++{
3291 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
3292 ++ "may_budget_timeout: wait_request %d left %d timeout %d",
3293 ++ bfq_bfqq_wait_request(bfqq),
3294 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
3295 ++ bfq_bfqq_budget_timeout(bfqq));
3296 ++
3297 ++ return (!bfq_bfqq_wait_request(bfqq) ||
3298 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
3299 ++ &&
3300 ++ bfq_bfqq_budget_timeout(bfqq);
3301 ++}
3302 ++
3303 ++/*
3304 ++ * Device idling is allowed only for the queues for which this function
3305 ++ * returns true. For this reason, the return value of this function plays a
3306 ++ * critical role for both throughput boosting and service guarantees. The
3307 ++ * return value is computed through a logical expression. In this rather
3308 ++ * long comment, we try to briefly describe all the details and motivations
3309 ++ * behind the components of this logical expression.
3310 ++ *
3311 ++ * First, the expression is false if bfqq is not sync, or if: bfqq happened
3312 ++ * to become active during a large burst of queue activations, and the
3313 ++ * pattern of requests bfqq contains boosts the throughput if bfqq is
3314 ++ * expired. In fact, queues that became active during a large burst benefit
3315 ++ * only from throughput, as discussed in the comments to bfq_handle_burst.
3316 ++ * In this respect, expiring bfqq certainly boosts the throughput on NCQ-
3317 ++ * capable flash-based devices, whereas, on rotational devices, it boosts
3318 ++ * the throughput only if bfqq contains random requests.
3319 ++ *
3320 ++ * On the opposite end, if (a) bfqq is sync, (b) the above burst-related
3321 ++ * condition does not hold, and (c) bfqq is being weight-raised, then the
3322 ++ * expression always evaluates to true, as device idling is instrumental
3323 ++ * for preserving low-latency guarantees (see [1]). If, instead, conditions
3324 ++ * (a) and (b) do hold, but (c) does not, then the expression evaluates to
3325 ++ * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and
3326 ++ * (2) at least one of the following two conditions holds.
3327 ++ * The first condition is that the device is not performing NCQ, because
3328 ++ * idling the device most certainly boosts the throughput if this condition
3329 ++ * holds and bfqq is I/O-bound and has been granted a non-null idle window.
3330 ++ * The second compound condition is made of the logical AND of two components.
3331 ++ *
3332 ++ * The first component is true only if there is no weight-raised busy
3333 ++ * queue. This guarantees that the device is not idled for a sync non-
3334 ++ * weight-raised queue when there are busy weight-raised queues. The former
3335 ++ * is then expired immediately if empty. Combined with the timestamping
3336 ++ * rules of BFQ (see [1] for details), this causes sync non-weight-raised
3337 ++ * queues to get a lower number of requests served, and hence to ask for a
3338 ++ * lower number of requests from the request pool, before the busy weight-
3339 ++ * raised queues get served again.
3340 ++ *
3341 ++ * This is beneficial for the processes associated with weight-raised
3342 ++ * queues, when the request pool is saturated (e.g., in the presence of
3343 ++ * write hogs). In fact, if the processes associated with the other queues
3344 ++ * ask for requests at a lower rate, then weight-raised processes have a
3345 ++ * higher probability to get a request from the pool immediately (or at
3346 ++ * least soon) when they need one. Hence they have a higher probability to
3347 ++ * actually get a fraction of the disk throughput proportional to their
3348 ++ * high weight. This is especially true with NCQ-capable drives, which
3349 ++ * enqueue several requests in advance and further reorder internally-
3350 ++ * queued requests.
3351 ++ *
3352 ++ * In the end, mistreating non-weight-raised queues when there are busy
3353 ++ * weight-raised queues seems to mitigate starvation problems in the
3354 ++ * presence of heavy write workloads and NCQ, and hence to guarantee a
3355 ++ * higher application and system responsiveness in these hostile scenarios.
3356 ++ *
3357 ++ * If the first component of the compound condition is instead true, i.e.,
3358 ++ * there is no weight-raised busy queue, then the second component of the
3359 ++ * compound condition takes into account service-guarantee and throughput
3360 ++ * issues related to NCQ (recall that the compound condition is evaluated
3361 ++ * only if the device is detected as supporting NCQ).
3362 ++ *
3363 ++ * As for service guarantees, allowing the drive to enqueue more than one
3364 ++ * request at a time, and hence delegating de facto final scheduling
3365 ++ * decisions to the drive's internal scheduler, causes loss of control on
3366 ++ * the actual request service order. In this respect, when the drive is
3367 ++ * allowed to enqueue more than one request at a time, the service
3368 ++ * distribution enforced by the drive's internal scheduler is likely to
3369 ++ * coincide with the desired device-throughput distribution only in the
3370 ++ * following, perfectly symmetric, scenario:
3371 ++ * 1) all active queues have the same weight,
3372 ++ * 2) all active groups at the same level in the groups tree have the same
3373 ++ * weight,
3374 ++ * 3) all active groups at the same level in the groups tree have the same
3375 ++ * number of children.
3376 ++ *
3377 ++ * Even in such a scenario, sequential I/O may still receive a preferential
3378 ++ * treatment, but this is not likely to be a big issue with flash-based
3379 ++ * devices, because of their non-dramatic loss of throughput with random
3380 ++ * I/O. Things do differ with HDDs, for which additional care is taken, as
3381 ++ * explained after completing the discussion for flash-based devices.
3382 ++ *
3383 ++ * Unfortunately, keeping the necessary state for evaluating exactly the
3384 ++ * above symmetry conditions would be quite complex and time-consuming.
3385 ++ * Therefore BFQ evaluates instead the following stronger sub-conditions,
3386 ++ * for which it is much easier to maintain the needed state:
3387 ++ * 1) all active queues have the same weight,
3388 ++ * 2) all active groups have the same weight,
3389 ++ * 3) all active groups have at most one active child each.
3390 ++ * In particular, the last two conditions are always true if hierarchical
3391 ++ * support and the cgroups interface are not enabled, hence no state needs
3392 ++ * to be maintained in this case.
3393 ++ *
3394 ++ * According to the above considerations, the second component of the
3395 ++ * compound condition evaluates to true if any of the above symmetry
3396 ++ * sub-conditions does not hold, or the device is not flash-based. Therefore,
3397 ++ * if also the first component is true, then idling is allowed for a sync
3398 ++ * queue. These are the only sub-conditions considered if the device is
3399 ++ * flash-based, as, for such a device, it is sensible to force idling only
3400 ++ * for service-guarantee issues. In fact, as for throughput, idling
3401 ++ * NCQ-capable flash-based devices would not boost the throughput even
3402 ++ * with sequential I/O; rather it would lower the throughput in proportion
3403 ++ * to how fast the device is. In the end, (only) if all three
3404 ++ * sub-conditions hold and the device is flash-based, the compound
3405 ++ * condition evaluates to false and therefore no idling is performed.
3406 ++ *
3407 ++ * As already said, things change with a rotational device, where idling
3408 ++ * boosts the throughput with sequential I/O (even with NCQ). Hence, for
3409 ++ * such a device the second component of the compound condition evaluates
3410 ++ * to true also if the following additional sub-condition does not hold:
3411 ++ * the queue is constantly seeky. Unfortunately, this different behavior
3412 ++ * with respect to flash-based devices causes an additional asymmetry: if
3413 ++ * some sync queues enjoy idling and some other sync queues do not, then
3414 ++ * the latter get a low share of the device throughput, simply because the
3415 ++ * former get many requests served after being set as in service, whereas
3416 ++ * the latter do not. As a consequence, to guarantee the desired throughput
3417 ++ * distribution, on HDDs the compound expression evaluates to true (and
3418 ++ * hence device idling is performed) also if the following last symmetry
3419 ++ * condition does not hold: no other queue is benefiting from idling. Also
3420 ++ * this last condition is actually replaced with a simpler-to-maintain and
3421 ++ * stronger condition: there is no busy queue which is not constantly seeky
3422 ++ * (and hence may also benefit from idling).
3423 ++ *
3424 ++ * To sum up, when all the required symmetry and throughput-boosting
3425 ++ * sub-conditions hold, the second component of the compound condition
3426 ++ * evaluates to false, and hence no idling is performed. This helps to
3427 ++ * keep the drives' internal queues full on NCQ-capable devices, and hence
3428 ++ * to boost the throughput, without causing 'almost' any loss of service
3429 ++ * guarantees. The 'almost' follows from the fact that, if the internal
3430 ++ * queue of one such device is filled while all the sub-conditions hold,
3431 ++ * but at some point in time some sub-condition ceases to hold, then it may
3432 ++ * become impossible to let requests be served in the new desired order
3433 ++ * until all the requests already queued in the device have been served.
3434 ++ */
3435 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
3436 ++{
3437 ++ struct bfq_data *bfqd = bfqq->bfqd;
3438 ++#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \
3439 ++ bfqd->busy_in_flight_queues == \
3440 ++ bfqd->const_seeky_busy_in_flight_queues)
3441 ++
3442 ++#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \
3443 ++ bfqd->hw_tag && \
3444 ++ (blk_queue_nonrot(bfqd->queue) || \
3445 ++ bfq_bfqq_constantly_seeky(bfqq)))
3446 ++
3447 ++/*
3448 ++ * Condition for expiring a non-weight-raised queue (and hence not idling
3449 ++ * the device).
3450 ++ */
3451 ++#define cond_for_expiring_non_wr (bfqd->hw_tag && \
3452 ++ (bfqd->wr_busy_queues > 0 || \
3453 ++ (blk_queue_nonrot(bfqd->queue) || \
3454 ++ cond_for_seeky_on_ncq_hdd)))
3455 ++
3456 ++ return bfq_bfqq_sync(bfqq) &&
3457 ++ !cond_for_expiring_in_burst &&
3458 ++ (bfqq->wr_coeff > 1 || !symmetric_scenario ||
3459 ++ (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&
3460 ++ !cond_for_expiring_non_wr)
3461 ++ );
3462 ++}
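
Because the expression above is assembled from macros, its overall shape is easy to lose. The sketch below flattens it into a pure boolean function whose parameter names are illustrative restatements of the macro conditions (symmetric_scenario itself is computed elsewhere in the patch), and evaluates it for three representative scenarios.

#include <stdbool.h>
#include <stdio.h>

/*
 * Flattened restatement of the idling decision above: every input that the
 * real code reads from bfqd/bfqq state is passed in as a plain boolean.
 */
static bool must_not_expire(bool sync, bool in_large_burst_and_boosts_thr,
			    bool weight_raised, bool symmetric_scenario,
			    bool io_bound, bool idle_window,
			    bool expiring_non_wr_pays_off)
{
	if (!sync || in_large_burst_and_boosts_thr)
		return false;
	return weight_raised ||
	       !symmetric_scenario ||
	       (io_bound && idle_window && !expiring_non_wr_pays_off);
}

int main(void)
{
	/* Weight-raised sync queue: always idle for it. */
	printf("%d\n", must_not_expire(true, false, true, true, true, true, true));  /* 1 */
	/* Sync, not raised, fully symmetric scenario where expiring pays off:
	 * no idling. */
	printf("%d\n", must_not_expire(true, false, false, true, true, true, true)); /* 0 */
	/* Asymmetric scenario: idle to preserve service guarantees. */
	printf("%d\n", must_not_expire(true, false, false, false, true, true, true));/* 1 */
	return 0;
}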
3463 ++
3464 ++/*
3465 ++ * If the in-service queue is empty but sync, and the function
3466 ++ * bfq_bfqq_must_not_expire returns true, then:
3467 ++ * 1) the queue must remain in service and cannot be expired, and
3468 ++ * 2) the disk must be idled to wait for the possible arrival of a new
3469 ++ * request for the queue.
3470 ++ * See the comments to the function bfq_bfqq_must_not_expire for the reasons
3471 ++ * why performing device idling is the best choice to boost the throughput
3472 ++ * and preserve service guarantees when bfq_bfqq_must_not_expire itself
3473 ++ * returns true.
3474 ++ */
3475 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
3476 ++{
3477 ++ struct bfq_data *bfqd = bfqq->bfqd;
3478 ++
3479 ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
3480 ++ bfq_bfqq_must_not_expire(bfqq);
3481 ++}
3482 ++
3483 ++/*
3484 ++ * Select a queue for service. If we have a current queue in service,
3485 ++ * check whether to continue servicing it, or retrieve and set a new one.
3486 ++ */
3487 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3488 ++{
3489 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
3490 ++ struct request *next_rq;
3491 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3492 ++
3493 ++ bfqq = bfqd->in_service_queue;
3494 ++ if (bfqq == NULL)
3495 ++ goto new_queue;
3496 ++
3497 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3498 ++
3499 ++ /*
3500 ++ * If another queue has a request waiting within our mean seek
3501 ++ * distance, let it run. The expire code will check for close
3502 ++ * cooperators and put the close queue at the front of the
3503 ++ * service tree. If possible, merge the expiring queue with the
3504 ++ * new bfqq.
3505 ++ */
3506 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
3507 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
3508 ++ bfq_setup_merge(bfqq, new_bfqq);
3509 ++
3510 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
3511 ++ !timer_pending(&bfqd->idle_slice_timer) &&
3512 ++ !bfq_bfqq_must_idle(bfqq))
3513 ++ goto expire;
3514 ++
3515 ++ next_rq = bfqq->next_rq;
3516 ++ /*
3517 ++ * If bfqq has requests queued and it has enough budget left to
3518 ++ * serve them, keep the queue, otherwise expire it.
3519 ++ */
3520 ++ if (next_rq != NULL) {
3521 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
3522 ++ bfq_bfqq_budget_left(bfqq)) {
3523 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
3524 ++ goto expire;
3525 ++ } else {
3526 ++ /*
3527 ++ * The idle timer may be pending because we may
3528 ++ * not disable disk idling even when a new request
3529 ++ * arrives.
3530 ++ */
3531 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
3532 ++ /*
3533 ++ * If we get here: 1) at least a new request
3534 ++ * has arrived but we have not disabled the
3535 ++ * timer because the request was too small,
3536 ++ * 2) then the block layer has unplugged
3537 ++ * the device, causing the dispatch to be
3538 ++ * invoked.
3539 ++ *
3540 ++ * Since the device is unplugged, now the
3541 ++ * requests are probably large enough to
3542 ++ * provide a reasonable throughput.
3543 ++ * So we disable idling.
3544 ++ */
3545 ++ bfq_clear_bfqq_wait_request(bfqq);
3546 ++ del_timer(&bfqd->idle_slice_timer);
3547 ++ }
3548 ++ if (new_bfqq == NULL)
3549 ++ goto keep_queue;
3550 ++ else
3551 ++ goto expire;
3552 ++ }
3553 ++ }
3554 ++
3555 ++ /*
3556 ++ * No requests pending. However, if the in-service queue is idling
3557 ++ * for a new request, or has requests waiting for a completion and
3558 ++ * may idle after their completion, then keep it anyway.
3559 ++ */
3560 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
3561 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
3562 ++ bfqq = NULL;
3563 ++ goto keep_queue;
3564 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
3565 ++ /*
3566 ++ * Expiring the queue because there is a close cooperator,
3567 ++ * cancel timer.
3568 ++ */
3569 ++ bfq_clear_bfqq_wait_request(bfqq);
3570 ++ del_timer(&bfqd->idle_slice_timer);
3571 ++ }
3572 ++
3573 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
3574 ++expire:
3575 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
3576 ++new_queue:
3577 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
3578 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
3579 ++ bfqq != NULL ? bfqq->pid : 0);
3580 ++keep_queue:
3581 ++ return bfqq;
3582 ++}
3583 ++
3584 ++static void bfq_update_wr_data(struct bfq_data *bfqd,
3585 ++ struct bfq_queue *bfqq)
3586 ++{
3587 ++ if (bfqq->wr_coeff > 1) { /* queue is being boosted */
3588 ++ struct bfq_entity *entity = &bfqq->entity;
3589 ++
3590 ++ bfq_log_bfqq(bfqd, bfqq,
3591 ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
3592 ++ jiffies_to_msecs(jiffies -
3593 ++ bfqq->last_wr_start_finish),
3594 ++ jiffies_to_msecs(bfqq->wr_cur_max_time),
3595 ++ bfqq->wr_coeff,
3596 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
3597 ++
3598 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
3599 ++ entity->orig_weight * bfqq->wr_coeff);
3600 ++ if (entity->ioprio_changed)
3601 ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
3602 ++ /*
3603 ++ * If the queue was activated in a burst, or
3604 ++ * too much time has elapsed from the beginning
3605 ++ * of this weight-raising, then end weight raising.
3606 ++ */
3607 ++ if (bfq_bfqq_in_large_burst(bfqq) ||
3608 ++ time_is_before_jiffies(bfqq->last_wr_start_finish +
3609 ++ bfqq->wr_cur_max_time)) {
3610 ++ bfqq->last_wr_start_finish = jiffies;
3611 ++ bfq_log_bfqq(bfqd, bfqq,
3612 ++ "wrais ending at %lu, rais_max_time %u",
3613 ++ bfqq->last_wr_start_finish,
3614 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
3615 ++ bfq_bfqq_end_wr(bfqq);
3616 ++ __bfq_entity_update_weight_prio(
3617 ++ bfq_entity_service_tree(entity),
3618 ++ entity);
3619 ++ }
3620 ++ }
3621 ++}
3622 ++
3623 ++/*
3624 ++ * Dispatch one request from bfqq, moving it to the request queue
3625 ++ * dispatch list.
3626 ++ */
3627 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
3628 ++ struct bfq_queue *bfqq)
3629 ++{
3630 ++ int dispatched = 0;
3631 ++ struct request *rq;
3632 ++ unsigned long service_to_charge;
3633 ++
3634 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
3635 ++
3636 ++ /* Follow expired path, else get first next available. */
3637 ++ rq = bfq_check_fifo(bfqq);
3638 ++ if (rq == NULL)
3639 ++ rq = bfqq->next_rq;
3640 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
3641 ++
3642 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
3643 ++ /*
3644 ++ * This may happen if the next rq is chosen in fifo order
3645 ++ * instead of sector order. The budget is properly
3646 ++ * dimensioned to be always sufficient to serve the next
3647 ++ * request only if it is chosen in sector order. The reason
3648 ++		 * is that it would be quite inefficient, and of little use, to
3649 ++ * to always make sure that the budget is large enough to
3650 ++ * serve even the possible next rq in fifo order.
3651 ++ * In fact, requests are seldom served in fifo order.
3652 ++ *
3653 ++ * Expire the queue for budget exhaustion, and make sure
3654 ++ * that the next act_budget is enough to serve the next
3655 ++ * request, even if it comes from the fifo expired path.
3656 ++ */
3657 ++ bfqq->next_rq = rq;
3658 ++ /*
3659 ++		 * Since this dispatch has failed, make sure that
3660 ++ * a new one will be performed
3661 ++ */
3662 ++ if (!bfqd->rq_in_driver)
3663 ++ bfq_schedule_dispatch(bfqd);
3664 ++ goto expire;
3665 ++ }
3666 ++
3667 ++ /* Finally, insert request into driver dispatch list. */
3668 ++ bfq_bfqq_served(bfqq, service_to_charge);
3669 ++ bfq_dispatch_insert(bfqd->queue, rq);
3670 ++
3671 ++ bfq_update_wr_data(bfqd, bfqq);
3672 ++
3673 ++ bfq_log_bfqq(bfqd, bfqq,
3674 ++ "dispatched %u sec req (%llu), budg left %lu",
3675 ++ blk_rq_sectors(rq),
3676 ++ (long long unsigned)blk_rq_pos(rq),
3677 ++ bfq_bfqq_budget_left(bfqq));
3678 ++
3679 ++ dispatched++;
3680 ++
3681 ++ if (bfqd->in_service_bic == NULL) {
3682 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3683 ++ bfqd->in_service_bic = RQ_BIC(rq);
3684 ++ }
3685 ++
3686 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3687 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3688 ++ bfq_class_idle(bfqq)))
3689 ++ goto expire;
3690 ++
3691 ++ return dispatched;
3692 ++
3693 ++expire:
3694 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3695 ++ return dispatched;
3696 ++}
3697 ++
3698 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3699 ++{
3700 ++ int dispatched = 0;
3701 ++
3702 ++ while (bfqq->next_rq != NULL) {
3703 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3704 ++ dispatched++;
3705 ++ }
3706 ++
3707 ++ BUG_ON(!list_empty(&bfqq->fifo));
3708 ++ return dispatched;
3709 ++}
3710 ++
3711 ++/*
3712 ++ * Drain our current requests.
3713 ++ * Used for barriers and when switching io schedulers on-the-fly.
3714 ++ */
3715 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
3716 ++{
3717 ++ struct bfq_queue *bfqq, *n;
3718 ++ struct bfq_service_tree *st;
3719 ++ int dispatched = 0;
3720 ++
3721 ++ bfqq = bfqd->in_service_queue;
3722 ++ if (bfqq != NULL)
3723 ++ __bfq_bfqq_expire(bfqd, bfqq);
3724 ++
3725 ++ /*
3726 ++ * Loop through classes, and be careful to leave the scheduler
3727 ++ * in a consistent state, as feedback mechanisms and vtime
3728 ++ * updates cannot be disabled during the process.
3729 ++ */
3730 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3731 ++ st = bfq_entity_service_tree(&bfqq->entity);
3732 ++
3733 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3734 ++ bfqq->max_budget = bfq_max_budget(bfqd);
3735 ++
3736 ++ bfq_forget_idle(st);
3737 ++ }
3738 ++
3739 ++ BUG_ON(bfqd->busy_queues != 0);
3740 ++
3741 ++ return dispatched;
3742 ++}
3743 ++
3744 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
3745 ++{
3746 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3747 ++ struct bfq_queue *bfqq;
3748 ++ int max_dispatch;
3749 ++
3750 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3751 ++ if (bfqd->busy_queues == 0)
3752 ++ return 0;
3753 ++
3754 ++ if (unlikely(force))
3755 ++ return bfq_forced_dispatch(bfqd);
3756 ++
3757 ++ bfqq = bfq_select_queue(bfqd);
3758 ++ if (bfqq == NULL)
3759 ++ return 0;
3760 ++
3761 ++ if (bfq_class_idle(bfqq))
3762 ++ max_dispatch = 1;
3763 ++
3764 ++ if (!bfq_bfqq_sync(bfqq))
3765 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
3766 ++
3767 ++ if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {
3768 ++ if (bfqd->busy_queues > 1)
3769 ++ return 0;
3770 ++ if (bfqq->dispatched >= 4 * max_dispatch)
3771 ++ return 0;
3772 ++ }
3773 ++
3774 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3775 ++ return 0;
3776 ++
3777 ++ bfq_clear_bfqq_wait_request(bfqq);
3778 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3779 ++
3780 ++ if (!bfq_dispatch_request(bfqd, bfqq))
3781 ++ return 0;
3782 ++
3783 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
3784 ++ bfq_bfqq_sync(bfqq) ? "sync" : "async");
3785 ++
3786 ++ return 1;
3787 ++}
3788 ++
3789 ++/*
3790 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
3791 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
3792 ++ *
3793 ++ * Queue lock must be held here.
3794 ++ */
3795 ++static void bfq_put_queue(struct bfq_queue *bfqq)
3796 ++{
3797 ++ struct bfq_data *bfqd = bfqq->bfqd;
3798 ++
3799 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3800 ++
3801 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3802 ++ atomic_read(&bfqq->ref));
3803 ++ if (!atomic_dec_and_test(&bfqq->ref))
3804 ++ return;
3805 ++
3806 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3807 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3808 ++ BUG_ON(bfqq->entity.tree != NULL);
3809 ++ BUG_ON(bfq_bfqq_busy(bfqq));
3810 ++ BUG_ON(bfqd->in_service_queue == bfqq);
3811 ++
3812 ++ if (bfq_bfqq_sync(bfqq))
3813 ++ /*
3814 ++ * The fact that this queue is being destroyed does not
3815 ++ * invalidate the fact that this queue may have been
3816 ++ * activated during the current burst. As a consequence,
3817 ++	 * although the queue no longer exists, and hence
3818 ++	 * needs to be removed from the burst list if present there,
3819 ++	 * the burst size must not be decremented.
3820 ++ */
3821 ++ hlist_del_init(&bfqq->burst_list_node);
3822 ++
3823 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3824 ++
3825 ++ kmem_cache_free(bfq_pool, bfqq);
3826 ++}
3827 ++
3828 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
3829 ++{
3830 ++ struct bfq_queue *__bfqq, *next;
3831 ++
3832 ++ /*
3833 ++ * If this queue was scheduled to merge with another queue, be
3834 ++ * sure to drop the reference taken on that queue (and others in
3835 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3836 ++ */
3837 ++ __bfqq = bfqq->new_bfqq;
3838 ++ while (__bfqq) {
3839 ++ if (__bfqq == bfqq)
3840 ++ break;
3841 ++ next = __bfqq->new_bfqq;
3842 ++ bfq_put_queue(__bfqq);
3843 ++ __bfqq = next;
3844 ++ }
3845 ++}
3846 ++
3847 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3848 ++{
3849 ++ if (bfqq == bfqd->in_service_queue) {
3850 ++ __bfq_bfqq_expire(bfqd, bfqq);
3851 ++ bfq_schedule_dispatch(bfqd);
3852 ++ }
3853 ++
3854 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3855 ++ atomic_read(&bfqq->ref));
3856 ++
3857 ++ bfq_put_cooperator(bfqq);
3858 ++
3859 ++ bfq_put_queue(bfqq);
3860 ++}
3861 ++
3862 ++static inline void bfq_init_icq(struct io_cq *icq)
3863 ++{
3864 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3865 ++
3866 ++ bic->ttime.last_end_request = jiffies;
3867 ++}
3868 ++
3869 ++static void bfq_exit_icq(struct io_cq *icq)
3870 ++{
3871 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3872 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
3873 ++
3874 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
3875 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3876 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
3877 ++ }
3878 ++
3879 ++ if (bic->bfqq[BLK_RW_SYNC]) {
3880 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3881 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
3882 ++ }
3883 ++}
3884 ++
3885 ++/*
3886 ++ * Update the entity prio values; note that the new values will not
3887 ++ * be used until the next (re)activation.
3888 ++ */
3889 ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3890 ++{
3891 ++ struct task_struct *tsk = current;
3892 ++ int ioprio_class;
3893 ++
3894 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3895 ++ switch (ioprio_class) {
3896 ++ default:
3897 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3898 ++ "bfq: bad prio class %d\n", ioprio_class);
3899 ++ case IOPRIO_CLASS_NONE:
3900 ++ /*
3901 ++ * No prio set, inherit CPU scheduling settings.
3902 ++ */
3903 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3904 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3905 ++ break;
3906 ++ case IOPRIO_CLASS_RT:
3907 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3908 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3909 ++ break;
3910 ++ case IOPRIO_CLASS_BE:
3911 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3912 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3913 ++ break;
3914 ++ case IOPRIO_CLASS_IDLE:
3915 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3916 ++ bfqq->entity.new_ioprio = 7;
3917 ++ bfq_clear_bfqq_idle_window(bfqq);
3918 ++ break;
3919 ++ }
3920 ++
3921 ++ if (bfqq->entity.new_ioprio < 0 ||
3922 ++ bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {
3923 ++ printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",
3924 ++ bfqq->entity.new_ioprio);
3925 ++ BUG();
3926 ++ }
3927 ++
3928 ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio);
3929 ++ bfqq->entity.ioprio_changed = 1;
3930 ++}
3931 ++
3932 ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic)
3933 ++{
3934 ++ struct bfq_data *bfqd;
3935 ++ struct bfq_queue *bfqq, *new_bfqq;
3936 ++ struct bfq_group *bfqg;
3937 ++ unsigned long uninitialized_var(flags);
3938 ++ int ioprio = bic->icq.ioc->ioprio;
3939 ++
3940 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3941 ++ &flags);
3942 ++ /*
3943 ++ * This condition may trigger on a newly created bic, be sure to
3944 ++ * drop the lock before returning.
3945 ++ */
3946 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3947 ++ goto out;
3948 ++
3949 ++ bic->ioprio = ioprio;
3950 ++
3951 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
3952 ++ if (bfqq != NULL) {
3953 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3954 ++ sched_data);
3955 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3956 ++ GFP_ATOMIC);
3957 ++ if (new_bfqq != NULL) {
3958 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3959 ++ bfq_log_bfqq(bfqd, bfqq,
3960 ++ "check_ioprio_change: bfqq %p %d",
3961 ++ bfqq, atomic_read(&bfqq->ref));
3962 ++ bfq_put_queue(bfqq);
3963 ++ }
3964 ++ }
3965 ++
3966 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
3967 ++ if (bfqq != NULL)
3968 ++ bfq_set_next_ioprio_data(bfqq, bic);
3969 ++
3970 ++out:
3971 ++ bfq_put_bfqd_unlock(bfqd, &flags);
3972 ++}
3973 ++
3974 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3975 ++ struct bfq_io_cq *bic, pid_t pid, int is_sync)
3976 ++{
3977 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3978 ++ INIT_LIST_HEAD(&bfqq->fifo);
3979 ++ INIT_HLIST_NODE(&bfqq->burst_list_node);
3980 ++
3981 ++ atomic_set(&bfqq->ref, 0);
3982 ++ bfqq->bfqd = bfqd;
3983 ++
3984 ++ if (bic)
3985 ++ bfq_set_next_ioprio_data(bfqq, bic);
3986 ++
3987 ++ if (is_sync) {
3988 ++ if (!bfq_class_idle(bfqq))
3989 ++ bfq_mark_bfqq_idle_window(bfqq);
3990 ++ bfq_mark_bfqq_sync(bfqq);
3991 ++ }
3992 ++ bfq_mark_bfqq_IO_bound(bfqq);
3993 ++
3994 ++ /* Tentative initial value to trade off between thr and lat */
3995 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3996 ++ bfqq->pid = pid;
3997 ++
3998 ++ bfqq->wr_coeff = 1;
3999 ++ bfqq->last_wr_start_finish = 0;
4000 ++ /*
4001 ++ * Set to the value for which bfqq will not be deemed as
4002 ++ * soft rt when it becomes backlogged.
4003 ++ */
4004 ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
4005 ++}
4006 ++
4007 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
4008 ++ struct bfq_group *bfqg,
4009 ++ int is_sync,
4010 ++ struct bfq_io_cq *bic,
4011 ++ gfp_t gfp_mask)
4012 ++{
4013 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
4014 ++
4015 ++retry:
4016 ++ /* bic always exists here */
4017 ++ bfqq = bic_to_bfqq(bic, is_sync);
4018 ++
4019 ++ /*
4020 ++ * Always try a new alloc if we fall back to the OOM bfqq
4021 ++ * originally, since it should just be a temporary situation.
4022 ++ */
4023 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
4024 ++ bfqq = NULL;
4025 ++ if (new_bfqq != NULL) {
4026 ++ bfqq = new_bfqq;
4027 ++ new_bfqq = NULL;
4028 ++ } else if (gfp_mask & __GFP_WAIT) {
4029 ++ spin_unlock_irq(bfqd->queue->queue_lock);
4030 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
4031 ++ gfp_mask | __GFP_ZERO,
4032 ++ bfqd->queue->node);
4033 ++ spin_lock_irq(bfqd->queue->queue_lock);
4034 ++ if (new_bfqq != NULL)
4035 ++ goto retry;
4036 ++ } else {
4037 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
4038 ++ gfp_mask | __GFP_ZERO,
4039 ++ bfqd->queue->node);
4040 ++ }
4041 ++
4042 ++ if (bfqq != NULL) {
4043 ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
4044 ++ is_sync);
4045 ++ bfq_init_entity(&bfqq->entity, bfqg);
4046 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
4047 ++ } else {
4048 ++ bfqq = &bfqd->oom_bfqq;
4049 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
4050 ++ }
4051 ++ }
4052 ++
4053 ++ if (new_bfqq != NULL)
4054 ++ kmem_cache_free(bfq_pool, new_bfqq);
4055 ++
4056 ++ return bfqq;
4057 ++}
4058 ++
4059 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
4060 ++ struct bfq_group *bfqg,
4061 ++ int ioprio_class, int ioprio)
4062 ++{
4063 ++ switch (ioprio_class) {
4064 ++ case IOPRIO_CLASS_RT:
4065 ++ return &bfqg->async_bfqq[0][ioprio];
4066 ++ case IOPRIO_CLASS_NONE:
4067 ++ ioprio = IOPRIO_NORM;
4068 ++ /* fall through */
4069 ++ case IOPRIO_CLASS_BE:
4070 ++ return &bfqg->async_bfqq[1][ioprio];
4071 ++ case IOPRIO_CLASS_IDLE:
4072 ++ return &bfqg->async_idle_bfqq;
4073 ++ default:
4074 ++ BUG();
4075 ++ }
4076 ++}
4077 ++
4078 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
4079 ++ struct bfq_group *bfqg, int is_sync,
4080 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
4081 ++{
4082 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
4083 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
4084 ++ struct bfq_queue **async_bfqq = NULL;
4085 ++ struct bfq_queue *bfqq = NULL;
4086 ++
4087 ++ if (!is_sync) {
4088 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
4089 ++ ioprio);
4090 ++ bfqq = *async_bfqq;
4091 ++ }
4092 ++
4093 ++ if (bfqq == NULL)
4094 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
4095 ++
4096 ++ /*
4097 ++ * Pin the queue now that it's allocated, scheduler exit will
4098 ++ * prune it.
4099 ++ */
4100 ++ if (!is_sync && *async_bfqq == NULL) {
4101 ++ atomic_inc(&bfqq->ref);
4102 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
4103 ++ bfqq, atomic_read(&bfqq->ref));
4104 ++ *async_bfqq = bfqq;
4105 ++ }
4106 ++
4107 ++ atomic_inc(&bfqq->ref);
4108 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
4109 ++ atomic_read(&bfqq->ref));
4110 ++ return bfqq;
4111 ++}
4112 ++
4113 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
4114 ++ struct bfq_io_cq *bic)
4115 ++{
4116 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
4117 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
4118 ++
4119 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
4120 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
4121 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
4122 ++ bic->ttime.ttime_samples;
4123 ++}
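
The update above is a fixed-point exponentially weighted moving average: the sample count saturates towards 256, the running total keeps 7/8 of its previous value and adds the new think time scaled by 256, and the mean is the rounded ratio of the two. A self-contained sketch with invented think times (in jiffies):

#include <stdio.h>

struct ttime_stats {
	unsigned long samples;
	unsigned long total;
	unsigned long mean;
};

/* Same fixed-point EWMA update sequence as in the function above. */
static void update_thinktime(struct ttime_stats *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * ttime) / 8;
	t->mean    = (t->total + 128) / t->samples;
}

int main(void)
{
	struct ttime_stats t = { 0, 0, 0 };
	unsigned long think_times[] = { 2, 2, 2, 40, 2, 2 };
	unsigned int i;

	for (i = 0; i < sizeof(think_times) / sizeof(think_times[0]); i++) {
		update_thinktime(&t, think_times[i]);
		printf("sample %u: mean=%lu\n", i, t.mean);
	}
	return 0;
}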
4124 ++
4125 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
4126 ++ struct bfq_queue *bfqq,
4127 ++ struct request *rq)
4128 ++{
4129 ++ sector_t sdist;
4130 ++ u64 total;
4131 ++
4132 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
4133 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
4134 ++ else
4135 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
4136 ++
4137 ++ /*
4138 ++ * Don't allow the seek distance to get too large from the
4139 ++ * odd fragment, pagein, etc.
4140 ++ */
4141 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
4142 ++ sdist = 0;
4143 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
4144 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
4145 ++ else
4146 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
4147 ++
4148 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
4149 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
4150 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
4151 ++ do_div(total, bfqq->seek_samples);
4152 ++ bfqq->seek_mean = (sector_t)total;
4153 ++
4154 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
4155 ++ (u64)bfqq->seek_mean);
4156 ++}
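
The seek statistics use the same 7/8 weighting, with an extra clamp so that a single pathological jump (an odd fragment or page-in at the far end of the disk) cannot blow up the running mean. The sketch below mirrors that clamp-then-average sequence with invented request positions; the 2*1024*1024 and 2*1024*64 bounds are copied from the code above, and sector_t is redefined locally only to keep the sketch self-contained.

#include <stdio.h>

typedef unsigned long long sector_t;

struct seek_stats {
	unsigned long samples;
	unsigned long long total;
	sector_t mean;
};

/* Clamp the new seek distance, then fold it into the weighted mean. */
static void update_seek(struct seek_stats *s, sector_t last_pos, sector_t pos)
{
	sector_t sdist = pos > last_pos ? pos - last_pos : last_pos - pos;
	sector_t cap;

	if (s->samples == 0) {			/* first request: not a seek */
		sdist = 0;
	} else {
		cap = s->mean * 4 +
		      (s->samples <= 60 ? 2 * 1024 * 1024 : 2 * 1024 * 64);
		if (sdist > cap)
			sdist = cap;
	}

	s->samples = (7 * s->samples + 256) / 8;
	s->total   = (7 * s->total + 256ULL * sdist) / 8;
	s->mean    = (s->total + s->samples / 2) / s->samples;
}

int main(void)
{
	struct seek_stats s = { 0, 0, 0 };

	update_seek(&s, 0, 1000);		/* first request, distance forced to 0 */
	update_seek(&s, 1000, 1008);		/* nearly sequential */
	update_seek(&s, 1008, 900000000ULL);	/* huge jump, clamped */
	printf("seek mean = %llu sectors\n", (unsigned long long)s.mean);
	return 0;
}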
4157 ++
4158 ++/*
4159 ++ * Disable idle window if the process thinks too long or seeks so much that
4160 ++ * it doesn't matter.
4161 ++ */
4162 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
4163 ++ struct bfq_queue *bfqq,
4164 ++ struct bfq_io_cq *bic)
4165 ++{
4166 ++ int enable_idle;
4167 ++
4168 ++ /* Don't idle for async or idle io prio class. */
4169 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
4170 ++ return;
4171 ++
4172 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
4173 ++
4174 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
4175 ++ bfqd->bfq_slice_idle == 0 ||
4176 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
4177 ++ bfqq->wr_coeff == 1))
4178 ++ enable_idle = 0;
4179 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
4180 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
4181 ++ bfqq->wr_coeff == 1)
4182 ++ enable_idle = 0;
4183 ++ else
4184 ++ enable_idle = 1;
4185 ++ }
4186 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
4187 ++ enable_idle);
4188 ++
4189 ++ if (enable_idle)
4190 ++ bfq_mark_bfqq_idle_window(bfqq);
4191 ++ else
4192 ++ bfq_clear_bfqq_idle_window(bfqq);
4193 ++}
4194 ++
4195 ++/*
4196 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
4197 ++ * something we should do about it.
4198 ++ */
4199 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4200 ++ struct request *rq)
4201 ++{
4202 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
4203 ++
4204 ++ if (rq->cmd_flags & REQ_META)
4205 ++ bfqq->meta_pending++;
4206 ++
4207 ++ bfq_update_io_thinktime(bfqd, bic);
4208 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
4209 ++ if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
4210 ++ bfq_clear_bfqq_constantly_seeky(bfqq);
4211 ++ if (!blk_queue_nonrot(bfqd->queue)) {
4212 ++ BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);
4213 ++ bfqd->const_seeky_busy_in_flight_queues--;
4214 ++ }
4215 ++ }
4216 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
4217 ++ !BFQQ_SEEKY(bfqq))
4218 ++ bfq_update_idle_window(bfqd, bfqq, bic);
4219 ++
4220 ++ bfq_log_bfqq(bfqd, bfqq,
4221 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
4222 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
4223 ++ (long long unsigned)bfqq->seek_mean);
4224 ++
4225 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4226 ++
4227 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
4228 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
4229 ++ blk_rq_sectors(rq) < 32;
4230 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
4231 ++
4232 ++ /*
4233 ++ * There is just this request queued: if the request
4234 ++ * is small and the queue is not to be expired, then
4235 ++ * just exit.
4236 ++ *
4237 ++ * In this way, if the disk is being idled to wait for
4238 ++ * a new request from the in-service queue, we avoid
4239 ++ * unplugging the device and committing the disk to serve
4240 ++ * just a small request. On the contrary, we wait for
4241 ++ * the block layer to decide when to unplug the device:
4242 ++ * hopefully, new requests will be merged to this one
4243 ++ * quickly, then the device will be unplugged and
4244 ++ * larger requests will be dispatched.
4245 ++ */
4246 ++ if (small_req && !budget_timeout)
4247 ++ return;
4248 ++
4249 ++ /*
4250 ++ * A large enough request arrived, or the queue is to
4251 ++ * be expired: in both cases disk idling is to be
4252 ++ * stopped, so clear wait_request flag and reset
4253 ++ * timer.
4254 ++ */
4255 ++ bfq_clear_bfqq_wait_request(bfqq);
4256 ++ del_timer(&bfqd->idle_slice_timer);
4257 ++
4258 ++ /*
4259 ++ * The queue is not empty, because a new request just
4260 ++ * arrived. Hence we can safely expire the queue, in
4261 ++ * case of budget timeout, without risking that the
4262 ++ * timestamps of the queue are not updated correctly.
4263 ++ * See [1] for more details.
4264 ++ */
4265 ++ if (budget_timeout)
4266 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
4267 ++
4268 ++ /*
4269 ++ * Let the request rip immediately, or let a new queue be
4270 ++ * selected if bfqq has just been expired.
4271 ++ */
4272 ++ __blk_run_queue(bfqd->queue);
4273 ++ }
4274 ++}
4275 ++
4276 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
4277 ++{
4278 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
4279 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
4280 ++
4281 ++ assert_spin_locked(bfqd->queue->queue_lock);
4282 ++
4283 ++ bfq_add_request(rq);
4284 ++
4285 ++ rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
4286 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
4287 ++
4288 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
4289 ++}
4290 ++
4291 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
4292 ++{
4293 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
4294 ++ bfqd->rq_in_driver);
4295 ++
4296 ++ if (bfqd->hw_tag == 1)
4297 ++ return;
4298 ++
4299 ++ /*
4300 ++ * This sample is valid if the number of outstanding requests
4301 ++ * is large enough to allow a queueing behavior. Note that the
4302 ++ * sum is not exact, as it's not taking into account deactivated
4303 ++ * requests.
4304 ++ */
4305 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
4306 ++ return;
4307 ++
4308 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
4309 ++ return;
4310 ++
4311 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
4312 ++ bfqd->max_rq_in_driver = 0;
4313 ++ bfqd->hw_tag_samples = 0;
4314 ++}
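
The NCQ-detection heuristic above only counts a sample when there is enough outstanding work to keep a queueing-capable device busy, and after a fixed number of such samples decides whether the driver ever held more than the threshold number of requests at once. A user-space sketch follows, with placeholder values for the two constants (the real BFQ_HW_QUEUE_THRESHOLD and BFQ_HW_QUEUE_SAMPLES are defined earlier in the patch):

#include <stdio.h>

/* Placeholder constants, not the patch's definitions. */
#define HW_QUEUE_THRESHOLD	4
#define HW_QUEUE_SAMPLES	32

struct hwtag_state {
	int hw_tag;		/* -1 unknown, 0 no queueing, 1 queueing detected */
	int max_in_driver;
	int samples;
};

/* Record load peaks while enough work is queued; decide after enough samples. */
static void update_hw_tag(struct hwtag_state *h, int in_driver, int queued)
{
	if (in_driver > h->max_in_driver)
		h->max_in_driver = in_driver;

	if (h->hw_tag == 1)
		return;

	if (in_driver + queued < HW_QUEUE_THRESHOLD)
		return;				/* not enough load: sample not valid */

	if (h->samples++ < HW_QUEUE_SAMPLES)
		return;

	h->hw_tag = h->max_in_driver > HW_QUEUE_THRESHOLD;
	h->max_in_driver = 0;
	h->samples = 0;
}

int main(void)
{
	struct hwtag_state h = { -1, 0, 0 };
	int i;

	for (i = 0; i <= HW_QUEUE_SAMPLES; i++)
		update_hw_tag(&h, 8, 4);	/* device keeps 8 requests in flight */
	printf("hw_tag = %d\n", h.hw_tag);	/* 1: looks queueing-capable */
	return 0;
}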
4315 ++
4316 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
4317 ++{
4318 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
4319 ++ struct bfq_data *bfqd = bfqq->bfqd;
4320 ++ bool sync = bfq_bfqq_sync(bfqq);
4321 ++
4322 ++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",
4323 ++ blk_rq_sectors(rq), sync);
4324 ++
4325 ++ bfq_update_hw_tag(bfqd);
4326 ++
4327 ++ BUG_ON(!bfqd->rq_in_driver);
4328 ++ BUG_ON(!bfqq->dispatched);
4329 ++ bfqd->rq_in_driver--;
4330 ++ bfqq->dispatched--;
4331 ++
4332 ++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
4333 ++ bfq_weights_tree_remove(bfqd, &bfqq->entity,
4334 ++ &bfqd->queue_weights_tree);
4335 ++ if (!blk_queue_nonrot(bfqd->queue)) {
4336 ++ BUG_ON(!bfqd->busy_in_flight_queues);
4337 ++ bfqd->busy_in_flight_queues--;
4338 ++ if (bfq_bfqq_constantly_seeky(bfqq)) {
4339 ++ BUG_ON(!bfqd->
4340 ++ const_seeky_busy_in_flight_queues);
4341 ++ bfqd->const_seeky_busy_in_flight_queues--;
4342 ++ }
4343 ++ }
4344 ++ }
4345 ++
4346 ++ if (sync) {
4347 ++ bfqd->sync_flight--;
4348 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
4349 ++ }
4350 ++
4351 ++ /*
4352 ++ * If we are waiting to discover whether the request pattern of the
4353 ++ * task associated with the queue is actually isochronous, and
4354 ++ * both requisites for this condition to hold are satisfied, then
4355 ++ * compute soft_rt_next_start (see the comments to the function
4356 ++ * bfq_bfqq_softrt_next_start()).
4357 ++ */
4358 ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
4359 ++ RB_EMPTY_ROOT(&bfqq->sort_list))
4360 ++ bfqq->soft_rt_next_start =
4361 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
4362 ++
4363 ++ /*
4364 ++ * If this is the in-service queue, check if it needs to be expired,
4365 ++ * or if we want to idle in case it has no pending requests.
4366 ++ */
4367 ++ if (bfqd->in_service_queue == bfqq) {
4368 ++ if (bfq_bfqq_budget_new(bfqq))
4369 ++ bfq_set_budget_timeout(bfqd);
4370 ++
4371 ++ if (bfq_bfqq_must_idle(bfqq)) {
4372 ++ bfq_arm_slice_timer(bfqd);
4373 ++ goto out;
4374 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
4375 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
4376 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
4377 ++ (bfqq->dispatched == 0 ||
4378 ++ !bfq_bfqq_must_not_expire(bfqq)))
4379 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
4380 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
4381 ++ }
4382 ++
4383 ++ if (!bfqd->rq_in_driver)
4384 ++ bfq_schedule_dispatch(bfqd);
4385 ++
4386 ++out:
4387 ++ return;
4388 ++}
4389 ++
4390 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
4391 ++{
4392 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
4393 ++ bfq_clear_bfqq_must_alloc(bfqq);
4394 ++ return ELV_MQUEUE_MUST;
4395 ++ }
4396 ++
4397 ++ return ELV_MQUEUE_MAY;
4398 ++}
4399 ++
4400 ++static int bfq_may_queue(struct request_queue *q, int rw)
4401 ++{
4402 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
4403 ++ struct task_struct *tsk = current;
4404 ++ struct bfq_io_cq *bic;
4405 ++ struct bfq_queue *bfqq;
4406 ++
4407 ++ /*
4408 ++ * Don't force setup of a queue from here, as a call to may_queue
4409 ++ * does not necessarily imply that a request actually will be
4410 ++ * queued. So just lookup a possibly existing queue, or return
4411 ++ * 'may queue' if that fails.
4412 ++ */
4413 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
4414 ++ if (bic == NULL)
4415 ++ return ELV_MQUEUE_MAY;
4416 ++
4417 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
4418 ++ if (bfqq != NULL)
4419 ++ return __bfq_may_queue(bfqq);
4420 ++
4421 ++ return ELV_MQUEUE_MAY;
4422 ++}
4423 ++
4424 ++/*
4425 ++ * Queue lock held here.
4426 ++ */
4427 ++static void bfq_put_request(struct request *rq)
4428 ++{
4429 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
4430 ++
4431 ++ if (bfqq != NULL) {
4432 ++ const int rw = rq_data_dir(rq);
4433 ++
4434 ++ BUG_ON(!bfqq->allocated[rw]);
4435 ++ bfqq->allocated[rw]--;
4436 ++
4437 ++ rq->elv.priv[0] = NULL;
4438 ++ rq->elv.priv[1] = NULL;
4439 ++
4440 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
4441 ++ bfqq, atomic_read(&bfqq->ref));
4442 ++ bfq_put_queue(bfqq);
4443 ++ }
4444 ++}
4445 ++
4446 ++static struct bfq_queue *
4447 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
4448 ++ struct bfq_queue *bfqq)
4449 ++{
4450 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
4451 ++ (long unsigned)bfqq->new_bfqq->pid);
4452 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
4453 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
4454 ++ bfq_put_queue(bfqq);
4455 ++ return bic_to_bfqq(bic, 1);
4456 ++}
4457 ++
4458 ++/*
4459 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
4460 ++ * was the last process referring to said bfqq.
4461 ++ */
4462 ++static struct bfq_queue *
4463 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
4464 ++{
4465 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
4466 ++ if (bfqq_process_refs(bfqq) == 1) {
4467 ++ bfqq->pid = current->pid;
4468 ++ bfq_clear_bfqq_coop(bfqq);
4469 ++ bfq_clear_bfqq_split_coop(bfqq);
4470 ++ return bfqq;
4471 ++ }
4472 ++
4473 ++ bic_set_bfqq(bic, NULL, 1);
4474 ++
4475 ++ bfq_put_cooperator(bfqq);
4476 ++
4477 ++ bfq_put_queue(bfqq);
4478 ++ return NULL;
4479 ++}
4480 ++
4481 ++/*
4482 ++ * Allocate bfq data structures associated with this request.
4483 ++ */
4484 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
4485 ++ struct bio *bio, gfp_t gfp_mask)
4486 ++{
4487 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
4488 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
4489 ++ const int rw = rq_data_dir(rq);
4490 ++ const int is_sync = rq_is_sync(rq);
4491 ++ struct bfq_queue *bfqq;
4492 ++ struct bfq_group *bfqg;
4493 ++ unsigned long flags;
4494 ++
4495 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
4496 ++
4497 ++ bfq_check_ioprio_change(bic);
4498 ++
4499 ++ spin_lock_irqsave(q->queue_lock, flags);
4500 ++
4501 ++ if (bic == NULL)
4502 ++ goto queue_fail;
4503 ++
4504 ++ bfqg = bfq_bic_update_cgroup(bic);
4505 ++
4506 ++new_queue:
4507 ++ bfqq = bic_to_bfqq(bic, is_sync);
4508 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
4509 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
4510 ++ bic_set_bfqq(bic, bfqq, is_sync);
4511 ++ } else {
4512 ++ /*
4513 ++ * If the queue was seeky for too long, break it apart.
4514 ++ */
4515 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4516 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4517 ++ bfqq = bfq_split_bfqq(bic, bfqq);
4518 ++ if (!bfqq)
4519 ++ goto new_queue;
4520 ++ }
4521 ++
4522 ++ /*
4523 ++ * Check to see if this queue is scheduled to merge with
4524 ++ * another closely cooperating queue. The merging of queues
4525 ++ * happens here as it must be done in process context.
4526 ++ * The reference on new_bfqq was taken in merge_bfqqs.
4527 ++ */
4528 ++ if (bfqq->new_bfqq != NULL)
4529 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
4530 ++ }
4531 ++
4532 ++ bfqq->allocated[rw]++;
4533 ++ atomic_inc(&bfqq->ref);
4534 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
4535 ++ atomic_read(&bfqq->ref));
4536 ++
4537 ++ rq->elv.priv[0] = bic;
4538 ++ rq->elv.priv[1] = bfqq;
4539 ++
4540 ++ spin_unlock_irqrestore(q->queue_lock, flags);
4541 ++
4542 ++ return 0;
4543 ++
4544 ++queue_fail:
4545 ++ bfq_schedule_dispatch(bfqd);
4546 ++ spin_unlock_irqrestore(q->queue_lock, flags);
4547 ++
4548 ++ return 1;
4549 ++}
4550 ++
4551 ++static void bfq_kick_queue(struct work_struct *work)
4552 ++{
4553 ++ struct bfq_data *bfqd =
4554 ++ container_of(work, struct bfq_data, unplug_work);
4555 ++ struct request_queue *q = bfqd->queue;
4556 ++
4557 ++ spin_lock_irq(q->queue_lock);
4558 ++ __blk_run_queue(q);
4559 ++ spin_unlock_irq(q->queue_lock);
4560 ++}
4561 ++
4562 ++/*
4563 ++ * Handler of the expiration of the timer running if the in-service queue
4564 ++ * is idling inside its time slice.
4565 ++ */
4566 ++static void bfq_idle_slice_timer(unsigned long data)
4567 ++{
4568 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
4569 ++ struct bfq_queue *bfqq;
4570 ++ unsigned long flags;
4571 ++ enum bfqq_expiration reason;
4572 ++
4573 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
4574 ++
4575 ++ bfqq = bfqd->in_service_queue;
4576 ++ /*
4577 ++ * Theoretical race here: the in-service queue can be NULL or
4578 ++ * different from the queue that was idling if the timer handler
4579 ++	 * spins on the queue_lock, a new request arrives for the
4580 ++	 * current queue, and a full dispatch cycle changes
4581 ++	 * the in-service queue. This is unlikely, but in the worst
4582 ++ * case we just expire a queue too early.
4583 ++ */
4584 ++ if (bfqq != NULL) {
4585 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
4586 ++ if (bfq_bfqq_budget_timeout(bfqq))
4587 ++ /*
4588 ++			 * Here, too, the queue can be safely expired
4589 ++			 * for budget timeout without wasting
4590 ++			 * guarantees.
4591 ++ */
4592 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
4593 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4594 ++ /*
4595 ++ * The queue may not be empty upon timer expiration,
4596 ++ * because we may not disable the timer when the
4597 ++ * first request of the in-service queue arrives
4598 ++ * during disk idling.
4599 ++ */
4600 ++ reason = BFQ_BFQQ_TOO_IDLE;
4601 ++ else
4602 ++ goto schedule_dispatch;
4603 ++
4604 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
4605 ++ }
4606 ++
4607 ++schedule_dispatch:
4608 ++ bfq_schedule_dispatch(bfqd);
4609 ++
4610 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
4611 ++}
4612 ++
4613 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
4614 ++{
4615 ++ del_timer_sync(&bfqd->idle_slice_timer);
4616 ++ cancel_work_sync(&bfqd->unplug_work);
4617 ++}
4618 ++
4619 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4620 ++ struct bfq_queue **bfqq_ptr)
4621 ++{
4622 ++ struct bfq_group *root_group = bfqd->root_group;
4623 ++ struct bfq_queue *bfqq = *bfqq_ptr;
4624 ++
4625 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4626 ++ if (bfqq != NULL) {
4627 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
4628 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4629 ++ bfqq, atomic_read(&bfqq->ref));
4630 ++ bfq_put_queue(bfqq);
4631 ++ *bfqq_ptr = NULL;
4632 ++ }
4633 ++}
4634 ++
4635 ++/*
4636 ++ * Release all the bfqg references to its async queues. If we are
4637 ++ * deallocating the group these queues may still contain requests, so
4638 ++ * we reparent them to the root cgroup (i.e., the only one that will
4639 ++ * exist for sure until all the requests on a device are gone).
4640 ++ */
4641 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4642 ++{
4643 ++ int i, j;
4644 ++
4645 ++ for (i = 0; i < 2; i++)
4646 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
4647 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4648 ++
4649 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4650 ++}
4651 ++
4652 ++static void bfq_exit_queue(struct elevator_queue *e)
4653 ++{
4654 ++ struct bfq_data *bfqd = e->elevator_data;
4655 ++ struct request_queue *q = bfqd->queue;
4656 ++ struct bfq_queue *bfqq, *n;
4657 ++
4658 ++ bfq_shutdown_timer_wq(bfqd);
4659 ++
4660 ++ spin_lock_irq(q->queue_lock);
4661 ++
4662 ++ BUG_ON(bfqd->in_service_queue != NULL);
4663 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4664 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
4665 ++
4666 ++ bfq_disconnect_groups(bfqd);
4667 ++ spin_unlock_irq(q->queue_lock);
4668 ++
4669 ++ bfq_shutdown_timer_wq(bfqd);
4670 ++
4671 ++ synchronize_rcu();
4672 ++
4673 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
4674 ++
4675 ++ bfq_free_root_group(bfqd);
4676 ++ kfree(bfqd);
4677 ++}
4678 ++
4679 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4680 ++{
4681 ++ struct bfq_group *bfqg;
4682 ++ struct bfq_data *bfqd;
4683 ++ struct elevator_queue *eq;
4684 ++
4685 ++ eq = elevator_alloc(q, e);
4686 ++ if (eq == NULL)
4687 ++ return -ENOMEM;
4688 ++
4689 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4690 ++ if (bfqd == NULL) {
4691 ++ kobject_put(&eq->kobj);
4692 ++ return -ENOMEM;
4693 ++ }
4694 ++ eq->elevator_data = bfqd;
4695 ++
4696 ++ /*
4697 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4698 ++ * Grab a permanent reference to it, so that the normal code flow
4699 ++ * will not attempt to free it.
4700 ++ */
4701 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
4702 ++ atomic_inc(&bfqd->oom_bfqq.ref);
4703 ++ bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
4704 ++ bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;
4705 ++ bfqd->oom_bfqq.entity.new_weight =
4706 ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio);
4707 ++ /*
4708 ++ * Trigger weight initialization, according to ioprio, at the
4709 ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
4710 ++ * class won't be changed any more.
4711 ++ */
4712 ++ bfqd->oom_bfqq.entity.ioprio_changed = 1;
4713 ++
4714 ++ bfqd->queue = q;
4715 ++
4716 ++ spin_lock_irq(q->queue_lock);
4717 ++ q->elevator = eq;
4718 ++ spin_unlock_irq(q->queue_lock);
4719 ++
4720 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
4721 ++ if (bfqg == NULL) {
4722 ++ kfree(bfqd);
4723 ++ kobject_put(&eq->kobj);
4724 ++ return -ENOMEM;
4725 ++ }
4726 ++
4727 ++ bfqd->root_group = bfqg;
4728 ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
4729 ++#ifdef CONFIG_CGROUP_BFQIO
4730 ++ bfqd->active_numerous_groups = 0;
4731 ++#endif
4732 ++
4733 ++ init_timer(&bfqd->idle_slice_timer);
4734 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4735 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4736 ++
4737 ++ bfqd->rq_pos_tree = RB_ROOT;
4738 ++ bfqd->queue_weights_tree = RB_ROOT;
4739 ++ bfqd->group_weights_tree = RB_ROOT;
4740 ++
4741 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4742 ++
4743 ++ INIT_LIST_HEAD(&bfqd->active_list);
4744 ++ INIT_LIST_HEAD(&bfqd->idle_list);
4745 ++ INIT_HLIST_HEAD(&bfqd->burst_list);
4746 ++
4747 ++ bfqd->hw_tag = -1;
4748 ++
4749 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
4750 ++
4751 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4752 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4753 ++ bfqd->bfq_back_max = bfq_back_max;
4754 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
4755 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
4756 ++ bfqd->bfq_class_idle_last_service = 0;
4757 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4758 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4759 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4760 ++
4761 ++ bfqd->bfq_coop_thresh = 2;
4762 ++ bfqd->bfq_failed_cooperations = 7000;
4763 ++ bfqd->bfq_requests_within_timer = 120;
4764 ++
4765 ++ bfqd->bfq_large_burst_thresh = 11;
4766 ++ bfqd->bfq_burst_interval = msecs_to_jiffies(500);
4767 ++
4768 ++ bfqd->low_latency = true;
4769 ++
4770 ++ bfqd->bfq_wr_coeff = 20;
4771 ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
4772 ++ bfqd->bfq_wr_max_time = 0;
4773 ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
4774 ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
4775 ++ bfqd->bfq_wr_max_softrt_rate = 7000; /*
4776 ++ * Approximate rate required
4777 ++ * to playback or record a
4778 ++ * high-definition compressed
4779 ++ * video.
4780 ++ */
4781 ++ bfqd->wr_busy_queues = 0;
4782 ++ bfqd->busy_in_flight_queues = 0;
4783 ++ bfqd->const_seeky_busy_in_flight_queues = 0;
4784 ++
4785 ++ /*
4786 ++ * Begin by assuming, optimistically, that the device peak rate is
4787 ++ * equal to the highest reference rate.
4788 ++ */
4789 ++ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
4790 ++ T_fast[blk_queue_nonrot(bfqd->queue)];
4791 ++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];
4792 ++ bfqd->device_speed = BFQ_BFQD_FAST;
4793 ++
4794 ++ return 0;
4795 ++}
4796 ++
4797 ++static void bfq_slab_kill(void)
4798 ++{
4799 ++ if (bfq_pool != NULL)
4800 ++ kmem_cache_destroy(bfq_pool);
4801 ++}
4802 ++
4803 ++static int __init bfq_slab_setup(void)
4804 ++{
4805 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4806 ++ if (bfq_pool == NULL)
4807 ++ return -ENOMEM;
4808 ++ return 0;
4809 ++}
4810 ++
4811 ++static ssize_t bfq_var_show(unsigned int var, char *page)
4812 ++{
4813 ++ return sprintf(page, "%d\n", var);
4814 ++}
4815 ++
4816 ++static ssize_t bfq_var_store(unsigned long *var, const char *page,
4817 ++ size_t count)
4818 ++{
4819 ++ unsigned long new_val;
4820 ++ int ret = kstrtoul(page, 10, &new_val);
4821 ++
4822 ++ if (ret == 0)
4823 ++ *var = new_val;
4824 ++
4825 ++ return count;
4826 ++}
4827 ++
4828 ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
4829 ++{
4830 ++ struct bfq_data *bfqd = e->elevator_data;
4831 ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
4832 ++ jiffies_to_msecs(bfqd->bfq_wr_max_time) :
4833 ++ jiffies_to_msecs(bfq_wr_duration(bfqd)));
4834 ++}
4835 ++
4836 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4837 ++{
4838 ++ struct bfq_queue *bfqq;
4839 ++ struct bfq_data *bfqd = e->elevator_data;
4840 ++ ssize_t num_char = 0;
4841 ++
4842 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4843 ++ bfqd->queued);
4844 ++
4845 ++ spin_lock_irq(bfqd->queue->queue_lock);
4846 ++
4847 ++ num_char += sprintf(page + num_char, "Active:\n");
4848 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4849 ++ num_char += sprintf(page + num_char,
4850 ++ "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",
4851 ++ bfqq->pid,
4852 ++ bfqq->entity.weight,
4853 ++ bfqq->queued[0],
4854 ++ bfqq->queued[1],
4855 ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
4856 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
4857 ++ }
4858 ++
4859 ++ num_char += sprintf(page + num_char, "Idle:\n");
4860 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4861 ++ num_char += sprintf(page + num_char,
4862 ++ "pid%d: weight %hu, dur %d/%u\n",
4863 ++ bfqq->pid,
4864 ++ bfqq->entity.weight,
4865 ++ jiffies_to_msecs(jiffies -
4866 ++ bfqq->last_wr_start_finish),
4867 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
4868 ++ }
4869 ++
4870 ++ spin_unlock_irq(bfqd->queue->queue_lock);
4871 ++
4872 ++ return num_char;
4873 ++}
4874 ++
4875 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4876 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4877 ++{ \
4878 ++ struct bfq_data *bfqd = e->elevator_data; \
4879 ++ unsigned int __data = __VAR; \
4880 ++ if (__CONV) \
4881 ++ __data = jiffies_to_msecs(__data); \
4882 ++ return bfq_var_show(__data, (page)); \
4883 ++}
4884 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4885 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4886 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4887 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4888 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4889 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4890 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show,
4891 ++ bfqd->bfq_max_budget_async_rq, 0);
4892 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4893 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4894 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4895 ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
4896 ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
4897 ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
4898 ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
4899 ++ 1);
4900 ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
4901 ++#undef SHOW_FUNCTION
4902 ++
4903 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4904 ++static ssize_t \
4905 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4906 ++{ \
4907 ++ struct bfq_data *bfqd = e->elevator_data; \
4908 ++ unsigned long uninitialized_var(__data); \
4909 ++ int ret = bfq_var_store(&__data, (page), count); \
4910 ++ if (__data < (MIN)) \
4911 ++ __data = (MIN); \
4912 ++ else if (__data > (MAX)) \
4913 ++ __data = (MAX); \
4914 ++ if (__CONV) \
4915 ++ *(__PTR) = msecs_to_jiffies(__data); \
4916 ++ else \
4917 ++ *(__PTR) = __data; \
4918 ++ return ret; \
4919 ++}
4920 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4921 ++ INT_MAX, 1);
4922 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4923 ++ INT_MAX, 1);
4924 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4925 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4926 ++ INT_MAX, 0);
4927 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4928 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4929 ++ 1, INT_MAX, 0);
4930 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4931 ++ INT_MAX, 1);
4932 ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
4933 ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
4934 ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
4935 ++ 1);
4936 ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
4937 ++ INT_MAX, 1);
4938 ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
4939 ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
4940 ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
4941 ++ INT_MAX, 0);
4942 ++#undef STORE_FUNCTION
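++
++/*
++ * A rough, illustrative sketch of what one expansion of the macro above
++ * produces (the uninitialized_var() wrapper and the no-op lower-bound
++ * check for MIN == 0 are dropped for readability). For
++ * STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1)
++ * the generated function is roughly:
++ *
++ *	static ssize_t bfq_slice_idle_store(struct elevator_queue *e,
++ *					    const char *page, size_t count)
++ *	{
++ *		struct bfq_data *bfqd = e->elevator_data;
++ *		unsigned long __data;
++ *		int ret = bfq_var_store(&__data, page, count);
++ *
++ *		if (__data > INT_MAX)
++ *			__data = INT_MAX;
++ *		bfqd->bfq_slice_idle = msecs_to_jiffies(__data);
++ *		return ret;
++ *	}
++ */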
4943 ++
4944 ++/* do nothing for the moment */
4945 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
4946 ++ const char *page, size_t count)
4947 ++{
4948 ++ return count;
4949 ++}
4950 ++
4951 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4952 ++{
4953 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4954 ++
4955 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4956 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4957 ++ else
4958 ++ return bfq_default_max_budget;
4959 ++}
4960 ++
4961 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4962 ++ const char *page, size_t count)
4963 ++{
4964 ++ struct bfq_data *bfqd = e->elevator_data;
4965 ++ unsigned long uninitialized_var(__data);
4966 ++ int ret = bfq_var_store(&__data, (page), count);
4967 ++
4968 ++ if (__data == 0)
4969 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4970 ++ else {
4971 ++ if (__data > INT_MAX)
4972 ++ __data = INT_MAX;
4973 ++ bfqd->bfq_max_budget = __data;
4974 ++ }
4975 ++
4976 ++ bfqd->bfq_user_max_budget = __data;
4977 ++
4978 ++ return ret;
4979 ++}
4980 ++
4981 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4982 ++ const char *page, size_t count)
4983 ++{
4984 ++ struct bfq_data *bfqd = e->elevator_data;
4985 ++ unsigned long uninitialized_var(__data);
4986 ++ int ret = bfq_var_store(&__data, (page), count);
4987 ++
4988 ++ if (__data < 1)
4989 ++ __data = 1;
4990 ++ else if (__data > INT_MAX)
4991 ++ __data = INT_MAX;
4992 ++
4993 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4994 ++ if (bfqd->bfq_user_max_budget == 0)
4995 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4996 ++
4997 ++ return ret;
4998 ++}
4999 ++
5000 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
5001 ++ const char *page, size_t count)
5002 ++{
5003 ++ struct bfq_data *bfqd = e->elevator_data;
5004 ++ unsigned long uninitialized_var(__data);
5005 ++ int ret = bfq_var_store(&__data, (page), count);
5006 ++
5007 ++ if (__data > 1)
5008 ++ __data = 1;
5009 ++ if (__data == 0 && bfqd->low_latency != 0)
5010 ++ bfq_end_wr(bfqd);
5011 ++ bfqd->low_latency = __data;
5012 ++
5013 ++ return ret;
5014 ++}
5015 ++
5016 ++#define BFQ_ATTR(name) \
5017 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
5018 ++
5019 ++static struct elv_fs_entry bfq_attrs[] = {
5020 ++ BFQ_ATTR(fifo_expire_sync),
5021 ++ BFQ_ATTR(fifo_expire_async),
5022 ++ BFQ_ATTR(back_seek_max),
5023 ++ BFQ_ATTR(back_seek_penalty),
5024 ++ BFQ_ATTR(slice_idle),
5025 ++ BFQ_ATTR(max_budget),
5026 ++ BFQ_ATTR(max_budget_async_rq),
5027 ++ BFQ_ATTR(timeout_sync),
5028 ++ BFQ_ATTR(timeout_async),
5029 ++ BFQ_ATTR(low_latency),
5030 ++ BFQ_ATTR(wr_coeff),
5031 ++ BFQ_ATTR(wr_max_time),
5032 ++ BFQ_ATTR(wr_rt_max_time),
5033 ++ BFQ_ATTR(wr_min_idle_time),
5034 ++ BFQ_ATTR(wr_min_inter_arr_async),
5035 ++ BFQ_ATTR(wr_max_softrt_rate),
5036 ++ BFQ_ATTR(weights),
5037 ++ __ATTR_NULL
5038 ++};
5039 ++
5040 ++static struct elevator_type iosched_bfq = {
5041 ++ .ops = {
5042 ++ .elevator_merge_fn = bfq_merge,
5043 ++ .elevator_merged_fn = bfq_merged_request,
5044 ++ .elevator_merge_req_fn = bfq_merged_requests,
5045 ++ .elevator_allow_merge_fn = bfq_allow_merge,
5046 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
5047 ++ .elevator_add_req_fn = bfq_insert_request,
5048 ++ .elevator_activate_req_fn = bfq_activate_request,
5049 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
5050 ++ .elevator_completed_req_fn = bfq_completed_request,
5051 ++ .elevator_former_req_fn = elv_rb_former_request,
5052 ++ .elevator_latter_req_fn = elv_rb_latter_request,
5053 ++ .elevator_init_icq_fn = bfq_init_icq,
5054 ++ .elevator_exit_icq_fn = bfq_exit_icq,
5055 ++ .elevator_set_req_fn = bfq_set_request,
5056 ++ .elevator_put_req_fn = bfq_put_request,
5057 ++ .elevator_may_queue_fn = bfq_may_queue,
5058 ++ .elevator_init_fn = bfq_init_queue,
5059 ++ .elevator_exit_fn = bfq_exit_queue,
5060 ++ },
5061 ++ .icq_size = sizeof(struct bfq_io_cq),
5062 ++ .icq_align = __alignof__(struct bfq_io_cq),
5063 ++ .elevator_attrs = bfq_attrs,
5064 ++ .elevator_name = "bfq",
5065 ++ .elevator_owner = THIS_MODULE,
5066 ++};
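++
++/*
++ * Usage sketch (assuming CONFIG_IOSCHED_BFQ is enabled and the usual
++ * legacy single-queue sysfs layout): once registered, the elevator above
++ * can be selected per device, and the tunables exported through bfq_attrs
++ * appear under the iosched directory, e.g.
++ *
++ *	echo bfq > /sys/block/sda/queue/scheduler
++ *	cat /sys/block/sda/queue/iosched/slice_idle
++ *	echo 0 > /sys/block/sda/queue/iosched/low_latency
++ *
++ * Reads go through the SHOW_FUNCTION() helpers and writes through the
++ * STORE_FUNCTION() helpers defined earlier in this file.
++ */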
5067 ++
5068 ++static int __init bfq_init(void)
5069 ++{
5070 ++ /*
5071 ++ * Can be 0 on HZ < 1000 setups.
5072 ++ */
5073 ++ if (bfq_slice_idle == 0)
5074 ++ bfq_slice_idle = 1;
5075 ++
5076 ++ if (bfq_timeout_async == 0)
5077 ++ bfq_timeout_async = 1;
5078 ++
5079 ++ if (bfq_slab_setup())
5080 ++ return -ENOMEM;
5081 ++
5082 ++ /*
5083 ++ * Times to load large popular applications for the typical systems
5084 ++ * installed on the reference devices (see the comments before the
5085 ++ * definitions of the two arrays).
5086 ++ */
5087 ++ T_slow[0] = msecs_to_jiffies(2600);
5088 ++ T_slow[1] = msecs_to_jiffies(1000);
5089 ++ T_fast[0] = msecs_to_jiffies(5500);
5090 ++ T_fast[1] = msecs_to_jiffies(2000);
5091 ++
5092 ++ /*
5093 ++ * Thresholds that determine the switch between speed classes (see
5094 ++ * the comments before the definition of the array).
5095 ++ */
5096 ++ device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
5097 ++ device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
5098 ++
5099 ++ elv_register(&iosched_bfq);
5100 ++	pr_info("BFQ I/O-scheduler: v7r8\n");
5101 ++
5102 ++ return 0;
5103 ++}
5104 ++
5105 ++static void __exit bfq_exit(void)
5106 ++{
5107 ++ elv_unregister(&iosched_bfq);
5108 ++ bfq_slab_kill();
5109 ++}
5110 ++
5111 ++module_init(bfq_init);
5112 ++module_exit(bfq_exit);
5113 ++
5114 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
5115 ++MODULE_LICENSE("GPL");
5116 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
5117 +new file mode 100644
5118 +index 0000000..c343099
5119 +--- /dev/null
5120 ++++ b/block/bfq-sched.c
5121 +@@ -0,0 +1,1208 @@
5122 ++/*
5123 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
5124 ++ *
5125 ++ * Based on ideas and code from CFQ:
5126 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5127 ++ *
5128 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5129 ++ * Paolo Valente <paolo.valente@×××××××.it>
5130 ++ *
5131 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5132 ++ */
5133 ++
5134 ++#ifdef CONFIG_CGROUP_BFQIO
5135 ++#define for_each_entity(entity) \
5136 ++ for (; entity != NULL; entity = entity->parent)
5137 ++
5138 ++#define for_each_entity_safe(entity, parent) \
5139 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
5140 ++
5141 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5142 ++ int extract,
5143 ++ struct bfq_data *bfqd);
5144 ++
5145 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
5146 ++{
5147 ++ struct bfq_entity *bfqg_entity;
5148 ++ struct bfq_group *bfqg;
5149 ++ struct bfq_sched_data *group_sd;
5150 ++
5151 ++ BUG_ON(next_in_service == NULL);
5152 ++
5153 ++ group_sd = next_in_service->sched_data;
5154 ++
5155 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
5156 ++ /*
5157 ++ * bfq_group's my_entity field is not NULL only if the group
5158 ++ * is not the root group. We must not touch the root entity
5159 ++ * as it must never become an in-service entity.
5160 ++ */
5161 ++ bfqg_entity = bfqg->my_entity;
5162 ++ if (bfqg_entity != NULL)
5163 ++ bfqg_entity->budget = next_in_service->budget;
5164 ++}
5165 ++
5166 ++static int bfq_update_next_in_service(struct bfq_sched_data *sd)
5167 ++{
5168 ++ struct bfq_entity *next_in_service;
5169 ++
5170 ++ if (sd->in_service_entity != NULL)
5171 ++ /* will update/requeue at the end of service */
5172 ++ return 0;
5173 ++
5174 ++ /*
5175 ++ * NOTE: this can be improved in many ways, such as returning
5176 ++ * 1 (and thus propagating upwards the update) only when the
5177 ++ * budget changes, or caching the bfqq that will be scheduled
5178 ++	 * next from this subtree. For now we worry more about
5179 ++ * correctness than about performance...
5180 ++ */
5181 ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
5182 ++ sd->next_in_service = next_in_service;
5183 ++
5184 ++ if (next_in_service != NULL)
5185 ++ bfq_update_budget(next_in_service);
5186 ++
5187 ++ return 1;
5188 ++}
5189 ++
5190 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
5191 ++ struct bfq_entity *entity)
5192 ++{
5193 ++ BUG_ON(sd->next_in_service != entity);
5194 ++}
5195 ++#else
5196 ++#define for_each_entity(entity) \
5197 ++ for (; entity != NULL; entity = NULL)
5198 ++
5199 ++#define for_each_entity_safe(entity, parent) \
5200 ++ for (parent = NULL; entity != NULL; entity = parent)
5201 ++
5202 ++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
5203 ++{
5204 ++ return 0;
5205 ++}
5206 ++
5207 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
5208 ++ struct bfq_entity *entity)
5209 ++{
5210 ++}
5211 ++
5212 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
5213 ++{
5214 ++}
5215 ++#endif
5216 ++
5217 ++/*
5218 ++ * Shift for timestamp calculations. This actually limits the maximum
5219 ++ * service allowed in one timestamp delta (small shift values increase it),
5220 ++ * the maximum total weight that can be used for the queues in the system
5221 ++ * (big shift values increase it), and the period of virtual time
5222 ++ * wraparounds.
5223 ++ */
5224 ++#define WFQ_SERVICE_SHIFT 22
5225 ++
5226 ++/**
5227 ++ * bfq_gt - compare two timestamps.
5228 ++ * @a: first ts.
5229 ++ * @b: second ts.
5230 ++ *
5231 ++ * Return @a > @b, dealing with wrapping correctly.
5232 ++ */
5233 ++static inline int bfq_gt(u64 a, u64 b)
5234 ++{
5235 ++ return (s64)(a - b) > 0;
5236 ++}
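++
++/*
++ * Quick illustrative check of the wraparound handling (made-up values):
++ * with a = 5 and b = ULLONG_MAX - 2, the unsigned difference a - b is 8,
++ * which is positive when reinterpreted as s64, so bfq_gt(a, b) correctly
++ * reports that a comes after b in the wrapped timeline, even though a < b
++ * as a plain u64. A plain "a > b" comparison would get this wrong as soon
++ * as the virtual time wraps around.
++ */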
5237 ++
5238 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
5239 ++{
5240 ++ struct bfq_queue *bfqq = NULL;
5241 ++
5242 ++ BUG_ON(entity == NULL);
5243 ++
5244 ++ if (entity->my_sched_data == NULL)
5245 ++ bfqq = container_of(entity, struct bfq_queue, entity);
5246 ++
5247 ++ return bfqq;
5248 ++}
5249 ++
5250 ++
5251 ++/**
5252 ++ * bfq_delta - map service into the virtual time domain.
5253 ++ * @service: amount of service.
5254 ++ * @weight: scale factor (weight of an entity or weight sum).
5255 ++ */
5256 ++static inline u64 bfq_delta(unsigned long service,
5257 ++ unsigned long weight)
5258 ++{
5259 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
5260 ++
5261 ++ do_div(d, weight);
5262 ++ return d;
5263 ++}
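++
++/*
++ * Numeric sanity check (illustrative values only): with
++ * WFQ_SERVICE_SHIFT == 22, bfq_delta(1000, 10) = (1000 << 22) / 10 =
++ * 419430400, while bfq_delta(1000, 20) = 209715200. Doubling the weight
++ * halves the virtual-time advance charged for the same amount of service,
++ * which is how entities with larger weights end up receiving a
++ * proportionally larger share of the device throughput.
++ */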
5264 ++
5265 ++/**
5266 ++ * bfq_calc_finish - assign the finish time to an entity.
5267 ++ * @entity: the entity to act upon.
5268 ++ * @service: the service to be charged to the entity.
5269 ++ */
5270 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
5271 ++ unsigned long service)
5272 ++{
5273 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5274 ++
5275 ++ BUG_ON(entity->weight == 0);
5276 ++
5277 ++ entity->finish = entity->start +
5278 ++ bfq_delta(service, entity->weight);
5279 ++
5280 ++ if (bfqq != NULL) {
5281 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
5282 ++ "calc_finish: serv %lu, w %d",
5283 ++ service, entity->weight);
5284 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
5285 ++ "calc_finish: start %llu, finish %llu, delta %llu",
5286 ++ entity->start, entity->finish,
5287 ++ bfq_delta(service, entity->weight));
5288 ++ }
5289 ++}
5290 ++
5291 ++/**
5292 ++ * bfq_entity_of - get an entity from a node.
5293 ++ * @node: the node field of the entity.
5294 ++ *
5295 ++ * Convert a node pointer to the corresponding entity. This is used only
5296 ++ * to simplify the logic of some functions and not as the generic
5297 ++ * conversion mechanism because, e.g., in the tree walking functions,
5298 ++ * the check for a %NULL value would be redundant.
5299 ++ */
5300 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
5301 ++{
5302 ++ struct bfq_entity *entity = NULL;
5303 ++
5304 ++ if (node != NULL)
5305 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
5306 ++
5307 ++ return entity;
5308 ++}
5309 ++
5310 ++/**
5311 ++ * bfq_extract - remove an entity from a tree.
5312 ++ * @root: the tree root.
5313 ++ * @entity: the entity to remove.
5314 ++ */
5315 ++static inline void bfq_extract(struct rb_root *root,
5316 ++ struct bfq_entity *entity)
5317 ++{
5318 ++ BUG_ON(entity->tree != root);
5319 ++
5320 ++ entity->tree = NULL;
5321 ++ rb_erase(&entity->rb_node, root);
5322 ++}
5323 ++
5324 ++/**
5325 ++ * bfq_idle_extract - extract an entity from the idle tree.
5326 ++ * @st: the service tree of the owning @entity.
5327 ++ * @entity: the entity being removed.
5328 ++ */
5329 ++static void bfq_idle_extract(struct bfq_service_tree *st,
5330 ++ struct bfq_entity *entity)
5331 ++{
5332 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5333 ++ struct rb_node *next;
5334 ++
5335 ++ BUG_ON(entity->tree != &st->idle);
5336 ++
5337 ++ if (entity == st->first_idle) {
5338 ++ next = rb_next(&entity->rb_node);
5339 ++ st->first_idle = bfq_entity_of(next);
5340 ++ }
5341 ++
5342 ++ if (entity == st->last_idle) {
5343 ++ next = rb_prev(&entity->rb_node);
5344 ++ st->last_idle = bfq_entity_of(next);
5345 ++ }
5346 ++
5347 ++ bfq_extract(&st->idle, entity);
5348 ++
5349 ++ if (bfqq != NULL)
5350 ++ list_del(&bfqq->bfqq_list);
5351 ++}
5352 ++
5353 ++/**
5354 ++ * bfq_insert - generic tree insertion.
5355 ++ * @root: tree root.
5356 ++ * @entity: entity to insert.
5357 ++ *
5358 ++ * This is used for the idle and the active tree, since they are both
5359 ++ * ordered by finish time.
5360 ++ */
5361 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
5362 ++{
5363 ++ struct bfq_entity *entry;
5364 ++ struct rb_node **node = &root->rb_node;
5365 ++ struct rb_node *parent = NULL;
5366 ++
5367 ++ BUG_ON(entity->tree != NULL);
5368 ++
5369 ++ while (*node != NULL) {
5370 ++ parent = *node;
5371 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
5372 ++
5373 ++ if (bfq_gt(entry->finish, entity->finish))
5374 ++ node = &parent->rb_left;
5375 ++ else
5376 ++ node = &parent->rb_right;
5377 ++ }
5378 ++
5379 ++ rb_link_node(&entity->rb_node, parent, node);
5380 ++ rb_insert_color(&entity->rb_node, root);
5381 ++
5382 ++ entity->tree = root;
5383 ++}
5384 ++
5385 ++/**
5386 ++ * bfq_update_min - update the min_start field of a entity.
5387 ++ * @entity: the entity to update.
5388 ++ * @node: one of its children.
5389 ++ *
5390 ++ * This function is called when @entity may store an invalid value for
5391 ++ * min_start due to updates to the active tree. The function assumes
5392 ++ * that the subtree rooted at @node (which may be its left or its right
5393 ++ * child) has a valid min_start value.
5394 ++ */
5395 ++static inline void bfq_update_min(struct bfq_entity *entity,
5396 ++ struct rb_node *node)
5397 ++{
5398 ++ struct bfq_entity *child;
5399 ++
5400 ++ if (node != NULL) {
5401 ++ child = rb_entry(node, struct bfq_entity, rb_node);
5402 ++ if (bfq_gt(entity->min_start, child->min_start))
5403 ++ entity->min_start = child->min_start;
5404 ++ }
5405 ++}
5406 ++
5407 ++/**
5408 ++ * bfq_update_active_node - recalculate min_start.
5409 ++ * @node: the node to update.
5410 ++ *
5411 ++ * @node may have changed position or one of its children may have moved;
5412 ++ * this function updates its min_start value. The left and right subtrees
5413 ++ * are assumed to hold a correct min_start value.
5414 ++ */
5415 ++static inline void bfq_update_active_node(struct rb_node *node)
5416 ++{
5417 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
5418 ++
5419 ++ entity->min_start = entity->start;
5420 ++ bfq_update_min(entity, node->rb_right);
5421 ++ bfq_update_min(entity, node->rb_left);
5422 ++}
5423 ++
5424 ++/**
5425 ++ * bfq_update_active_tree - update min_start for the whole active tree.
5426 ++ * @node: the starting node.
5427 ++ *
5428 ++ * @node must be the deepest modified node after an update. This function
5429 ++ * updates its min_start using the values held by its children, assuming
5430 ++ * that they did not change, and then updates all the nodes that may have
5431 ++ * changed in the path to the root. The only nodes that may have changed
5432 ++ * are the ones in the path or their siblings.
5433 ++ */
5434 ++static void bfq_update_active_tree(struct rb_node *node)
5435 ++{
5436 ++ struct rb_node *parent;
5437 ++
5438 ++up:
5439 ++ bfq_update_active_node(node);
5440 ++
5441 ++ parent = rb_parent(node);
5442 ++ if (parent == NULL)
5443 ++ return;
5444 ++
5445 ++ if (node == parent->rb_left && parent->rb_right != NULL)
5446 ++ bfq_update_active_node(parent->rb_right);
5447 ++ else if (parent->rb_left != NULL)
5448 ++ bfq_update_active_node(parent->rb_left);
5449 ++
5450 ++ node = parent;
5451 ++ goto up;
5452 ++}
5453 ++
5454 ++static void bfq_weights_tree_add(struct bfq_data *bfqd,
5455 ++ struct bfq_entity *entity,
5456 ++ struct rb_root *root);
5457 ++
5458 ++static void bfq_weights_tree_remove(struct bfq_data *bfqd,
5459 ++ struct bfq_entity *entity,
5460 ++ struct rb_root *root);
5461 ++
5462 ++
5463 ++/**
5464 ++ * bfq_active_insert - insert an entity in the active tree of its
5465 ++ * group/device.
5466 ++ * @st: the service tree of the entity.
5467 ++ * @entity: the entity being inserted.
5468 ++ *
5469 ++ * The active tree is ordered by finish time, but an extra key is kept
5470 ++ * in each node, containing the minimum value for the start times of
5471 ++ * its children (and the node itself), so it's possible to search for
5472 ++ * the eligible node with the lowest finish time in logarithmic time.
5473 ++ */
5474 ++static void bfq_active_insert(struct bfq_service_tree *st,
5475 ++ struct bfq_entity *entity)
5476 ++{
5477 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5478 ++ struct rb_node *node = &entity->rb_node;
5479 ++#ifdef CONFIG_CGROUP_BFQIO
5480 ++ struct bfq_sched_data *sd = NULL;
5481 ++ struct bfq_group *bfqg = NULL;
5482 ++ struct bfq_data *bfqd = NULL;
5483 ++#endif
5484 ++
5485 ++ bfq_insert(&st->active, entity);
5486 ++
5487 ++ if (node->rb_left != NULL)
5488 ++ node = node->rb_left;
5489 ++ else if (node->rb_right != NULL)
5490 ++ node = node->rb_right;
5491 ++
5492 ++ bfq_update_active_tree(node);
5493 ++
5494 ++#ifdef CONFIG_CGROUP_BFQIO
5495 ++ sd = entity->sched_data;
5496 ++ bfqg = container_of(sd, struct bfq_group, sched_data);
5497 ++ BUG_ON(!bfqg);
5498 ++ bfqd = (struct bfq_data *)bfqg->bfqd;
5499 ++#endif
5500 ++ if (bfqq != NULL)
5501 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
5502 ++#ifdef CONFIG_CGROUP_BFQIO
5503 ++ else { /* bfq_group */
5504 ++ BUG_ON(!bfqd);
5505 ++ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
5506 ++ }
5507 ++ if (bfqg != bfqd->root_group) {
5508 ++ BUG_ON(!bfqg);
5509 ++ BUG_ON(!bfqd);
5510 ++ bfqg->active_entities++;
5511 ++ if (bfqg->active_entities == 2)
5512 ++ bfqd->active_numerous_groups++;
5513 ++ }
5514 ++#endif
5515 ++}
5516 ++
5517 ++/**
5518 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
5519 ++ * @ioprio: the ioprio value to convert.
5520 ++ */
5521 ++static inline unsigned short bfq_ioprio_to_weight(int ioprio)
5522 ++{
5523 ++ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
5524 ++ return IOPRIO_BE_NR - ioprio;
5525 ++}
5526 ++
5527 ++/**
5528 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
5529 ++ * @weight: the weight value to convert.
5530 ++ *
5531 ++ * To preserve as much as possible the old only-ioprio user interface,
5532 ++ * 0 is used as an escape ioprio value for weights (numerically) equal to
5533 ++ * or larger than IOPRIO_BE_NR.
5534 ++ */
5535 ++static inline unsigned short bfq_weight_to_ioprio(int weight)
5536 ++{
5537 ++ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
5538 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
5539 ++}
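++
++/*
++ * Example of the two mappings above, assuming the mainline value
++ * IOPRIO_BE_NR == 8: ioprio 0 maps to weight 8, ioprio 4 to weight 4 and
++ * ioprio 7 to weight 1, i.e. numerically lower (higher-priority) ioprios
++ * get larger weights. In the opposite direction, any weight >= 8 collapses
++ * to the escape ioprio 0, e.g. bfq_weight_to_ioprio(100) == 0.
++ */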
5540 ++
5541 ++static inline void bfq_get_entity(struct bfq_entity *entity)
5542 ++{
5543 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5544 ++
5545 ++ if (bfqq != NULL) {
5546 ++ atomic_inc(&bfqq->ref);
5547 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
5548 ++ bfqq, atomic_read(&bfqq->ref));
5549 ++ }
5550 ++}
5551 ++
5552 ++/**
5553 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
5554 ++ * @node: the node being removed.
5555 ++ *
5556 ++ * Do the first step of an extraction in an rb tree, looking for the
5557 ++ * node that will replace @node, and returning the deepest node that
5558 ++ * the following modifications to the tree can touch. If @node is the
5559 ++ * last node in the tree return %NULL.
5560 ++ */
5561 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
5562 ++{
5563 ++ struct rb_node *deepest;
5564 ++
5565 ++ if (node->rb_right == NULL && node->rb_left == NULL)
5566 ++ deepest = rb_parent(node);
5567 ++ else if (node->rb_right == NULL)
5568 ++ deepest = node->rb_left;
5569 ++ else if (node->rb_left == NULL)
5570 ++ deepest = node->rb_right;
5571 ++ else {
5572 ++ deepest = rb_next(node);
5573 ++ if (deepest->rb_right != NULL)
5574 ++ deepest = deepest->rb_right;
5575 ++ else if (rb_parent(deepest) != node)
5576 ++ deepest = rb_parent(deepest);
5577 ++ }
5578 ++
5579 ++ return deepest;
5580 ++}
5581 ++
5582 ++/**
5583 ++ * bfq_active_extract - remove an entity from the active tree.
5584 ++ * @st: the service_tree containing the tree.
5585 ++ * @entity: the entity being removed.
5586 ++ */
5587 ++static void bfq_active_extract(struct bfq_service_tree *st,
5588 ++ struct bfq_entity *entity)
5589 ++{
5590 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5591 ++ struct rb_node *node;
5592 ++#ifdef CONFIG_CGROUP_BFQIO
5593 ++ struct bfq_sched_data *sd = NULL;
5594 ++ struct bfq_group *bfqg = NULL;
5595 ++ struct bfq_data *bfqd = NULL;
5596 ++#endif
5597 ++
5598 ++ node = bfq_find_deepest(&entity->rb_node);
5599 ++ bfq_extract(&st->active, entity);
5600 ++
5601 ++ if (node != NULL)
5602 ++ bfq_update_active_tree(node);
5603 ++
5604 ++#ifdef CONFIG_CGROUP_BFQIO
5605 ++ sd = entity->sched_data;
5606 ++ bfqg = container_of(sd, struct bfq_group, sched_data);
5607 ++ BUG_ON(!bfqg);
5608 ++ bfqd = (struct bfq_data *)bfqg->bfqd;
5609 ++#endif
5610 ++ if (bfqq != NULL)
5611 ++ list_del(&bfqq->bfqq_list);
5612 ++#ifdef CONFIG_CGROUP_BFQIO
5613 ++ else { /* bfq_group */
5614 ++ BUG_ON(!bfqd);
5615 ++ bfq_weights_tree_remove(bfqd, entity,
5616 ++ &bfqd->group_weights_tree);
5617 ++ }
5618 ++ if (bfqg != bfqd->root_group) {
5619 ++ BUG_ON(!bfqg);
5620 ++ BUG_ON(!bfqd);
5621 ++ BUG_ON(!bfqg->active_entities);
5622 ++ bfqg->active_entities--;
5623 ++ if (bfqg->active_entities == 1) {
5624 ++ BUG_ON(!bfqd->active_numerous_groups);
5625 ++ bfqd->active_numerous_groups--;
5626 ++ }
5627 ++ }
5628 ++#endif
5629 ++}
5630 ++
5631 ++/**
5632 ++ * bfq_idle_insert - insert an entity into the idle tree.
5633 ++ * @st: the service tree containing the tree.
5634 ++ * @entity: the entity to insert.
5635 ++ */
5636 ++static void bfq_idle_insert(struct bfq_service_tree *st,
5637 ++ struct bfq_entity *entity)
5638 ++{
5639 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5640 ++ struct bfq_entity *first_idle = st->first_idle;
5641 ++ struct bfq_entity *last_idle = st->last_idle;
5642 ++
5643 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
5644 ++ st->first_idle = entity;
5645 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
5646 ++ st->last_idle = entity;
5647 ++
5648 ++ bfq_insert(&st->idle, entity);
5649 ++
5650 ++ if (bfqq != NULL)
5651 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
5652 ++}
5653 ++
5654 ++/**
5655 ++ * bfq_forget_entity - remove an entity from the wfq trees.
5656 ++ * @st: the service tree.
5657 ++ * @entity: the entity being removed.
5658 ++ *
5659 ++ * Update the device status and forget everything about @entity, releasing
5660 ++ * the device's reference to it if it is a queue. Entities belonging to
5661 ++ * groups are not refcounted.
5662 ++ */
5663 ++static void bfq_forget_entity(struct bfq_service_tree *st,
5664 ++ struct bfq_entity *entity)
5665 ++{
5666 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5667 ++ struct bfq_sched_data *sd;
5668 ++
5669 ++ BUG_ON(!entity->on_st);
5670 ++
5671 ++ entity->on_st = 0;
5672 ++ st->wsum -= entity->weight;
5673 ++ if (bfqq != NULL) {
5674 ++ sd = entity->sched_data;
5675 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
5676 ++ bfqq, atomic_read(&bfqq->ref));
5677 ++ bfq_put_queue(bfqq);
5678 ++ }
5679 ++}
5680 ++
5681 ++/**
5682 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
5683 ++ * @st: service tree for the entity.
5684 ++ * @entity: the entity being released.
5685 ++ */
5686 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
5687 ++ struct bfq_entity *entity)
5688 ++{
5689 ++ bfq_idle_extract(st, entity);
5690 ++ bfq_forget_entity(st, entity);
5691 ++}
5692 ++
5693 ++/**
5694 ++ * bfq_forget_idle - update the idle tree if necessary.
5695 ++ * @st: the service tree to act upon.
5696 ++ *
5697 ++ * To preserve the global O(log N) complexity we only remove one entry here;
5698 ++ * as the idle tree will not grow indefinitely this can be done safely.
5699 ++ */
5700 ++static void bfq_forget_idle(struct bfq_service_tree *st)
5701 ++{
5702 ++ struct bfq_entity *first_idle = st->first_idle;
5703 ++ struct bfq_entity *last_idle = st->last_idle;
5704 ++
5705 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
5706 ++ !bfq_gt(last_idle->finish, st->vtime)) {
5707 ++ /*
5708 ++ * Forget the whole idle tree, increasing the vtime past
5709 ++ * the last finish time of idle entities.
5710 ++ */
5711 ++ st->vtime = last_idle->finish;
5712 ++ }
5713 ++
5714 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
5715 ++ bfq_put_idle_entity(st, first_idle);
5716 ++}
5717 ++
5718 ++static struct bfq_service_tree *
5719 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
5720 ++ struct bfq_entity *entity)
5721 ++{
5722 ++ struct bfq_service_tree *new_st = old_st;
5723 ++
5724 ++ if (entity->ioprio_changed) {
5725 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5726 ++ unsigned short prev_weight, new_weight;
5727 ++ struct bfq_data *bfqd = NULL;
5728 ++ struct rb_root *root;
5729 ++#ifdef CONFIG_CGROUP_BFQIO
5730 ++ struct bfq_sched_data *sd;
5731 ++ struct bfq_group *bfqg;
5732 ++#endif
5733 ++
5734 ++ if (bfqq != NULL)
5735 ++ bfqd = bfqq->bfqd;
5736 ++#ifdef CONFIG_CGROUP_BFQIO
5737 ++ else {
5738 ++ sd = entity->my_sched_data;
5739 ++ bfqg = container_of(sd, struct bfq_group, sched_data);
5740 ++ BUG_ON(!bfqg);
5741 ++ bfqd = (struct bfq_data *)bfqg->bfqd;
5742 ++ BUG_ON(!bfqd);
5743 ++ }
5744 ++#endif
5745 ++
5746 ++ BUG_ON(old_st->wsum < entity->weight);
5747 ++ old_st->wsum -= entity->weight;
5748 ++
5749 ++ if (entity->new_weight != entity->orig_weight) {
5750 ++ if (entity->new_weight < BFQ_MIN_WEIGHT ||
5751 ++ entity->new_weight > BFQ_MAX_WEIGHT) {
5752 ++ printk(KERN_CRIT "update_weight_prio: "
5753 ++ "new_weight %d\n",
5754 ++ entity->new_weight);
5755 ++ BUG();
5756 ++ }
5757 ++ entity->orig_weight = entity->new_weight;
5758 ++ entity->ioprio =
5759 ++ bfq_weight_to_ioprio(entity->orig_weight);
5760 ++ }
5761 ++
5762 ++ entity->ioprio_class = entity->new_ioprio_class;
5763 ++ entity->ioprio_changed = 0;
5764 ++
5765 ++ /*
5766 ++ * NOTE: here we may be changing the weight too early,
5767 ++		 * which will cause unfairness. The correct approach
5768 ++ * would have required additional complexity to defer
5769 ++ * weight changes to the proper time instants (i.e.,
5770 ++ * when entity->finish <= old_st->vtime).
5771 ++ */
5772 ++ new_st = bfq_entity_service_tree(entity);
5773 ++
5774 ++ prev_weight = entity->weight;
5775 ++ new_weight = entity->orig_weight *
5776 ++ (bfqq != NULL ? bfqq->wr_coeff : 1);
5777 ++ /*
5778 ++ * If the weight of the entity changes, remove the entity
5779 ++ * from its old weight counter (if there is a counter
5780 ++ * associated with the entity), and add it to the counter
5781 ++ * associated with its new weight.
5782 ++ */
5783 ++ if (prev_weight != new_weight) {
5784 ++ root = bfqq ? &bfqd->queue_weights_tree :
5785 ++ &bfqd->group_weights_tree;
5786 ++ bfq_weights_tree_remove(bfqd, entity, root);
5787 ++ }
5788 ++ entity->weight = new_weight;
5789 ++ /*
5790 ++ * Add the entity to its weights tree only if it is
5791 ++ * not associated with a weight-raised queue.
5792 ++ */
5793 ++ if (prev_weight != new_weight &&
5794 ++ (bfqq ? bfqq->wr_coeff == 1 : 1))
5795 ++ /* If we get here, root has been initialized. */
5796 ++ bfq_weights_tree_add(bfqd, entity, root);
5797 ++
5798 ++ new_st->wsum += entity->weight;
5799 ++
5800 ++ if (new_st != old_st)
5801 ++ entity->start = new_st->vtime;
5802 ++ }
5803 ++
5804 ++ return new_st;
5805 ++}
5806 ++
5807 ++/**
5808 ++ * bfq_bfqq_served - update the scheduler status after selection for
5809 ++ * service.
5810 ++ * @bfqq: the queue being served.
5811 ++ * @served: bytes to transfer.
5812 ++ *
5813 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
5814 ++ * are synchronized every time a new bfqq is selected for service. For now,
5815 ++ * we keep it to better check consistency.
5816 ++ */
5817 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
5818 ++{
5819 ++ struct bfq_entity *entity = &bfqq->entity;
5820 ++ struct bfq_service_tree *st;
5821 ++
5822 ++ for_each_entity(entity) {
5823 ++ st = bfq_entity_service_tree(entity);
5824 ++
5825 ++ entity->service += served;
5826 ++ BUG_ON(entity->service > entity->budget);
5827 ++ BUG_ON(st->wsum == 0);
5828 ++
5829 ++ st->vtime += bfq_delta(served, st->wsum);
5830 ++ bfq_forget_idle(st);
5831 ++ }
5832 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
5833 ++}
5834 ++
5835 ++/**
5836 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
5837 ++ * @bfqq: the queue that needs a service update.
5838 ++ *
5839 ++ * When it's not possible to be fair in the service domain, because
5840 ++ * a queue is not consuming its budget fast enough (the meaning of
5841 ++ * fast depends on the timeout parameter), we charge it a full
5842 ++ * budget. In this way we should obtain a sort of time-domain
5843 ++ * fairness among all the seeky/slow queues.
5844 ++ */
5845 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5846 ++{
5847 ++ struct bfq_entity *entity = &bfqq->entity;
5848 ++
5849 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5850 ++
5851 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5852 ++}
5853 ++
5854 ++/**
5855 ++ * __bfq_activate_entity - activate an entity.
5856 ++ * @entity: the entity being activated.
5857 ++ *
5858 ++ * Called whenever an entity is activated, i.e., it is not active and one
5859 ++ * of its children receives a new request, or has to be reactivated due to
5860 ++ * budget exhaustion. It uses the current budget of the entity (and the
5861 ++ * service received if @entity is active) of the queue to calculate its
5862 ++ * timestamps.
5863 ++ */
5864 ++static void __bfq_activate_entity(struct bfq_entity *entity)
5865 ++{
5866 ++ struct bfq_sched_data *sd = entity->sched_data;
5867 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5868 ++
5869 ++ if (entity == sd->in_service_entity) {
5870 ++ BUG_ON(entity->tree != NULL);
5871 ++ /*
5872 ++ * If we are requeueing the current entity we have
5873 ++ * to take care of not charging to it service it has
5874 ++ * not received.
5875 ++ */
5876 ++ bfq_calc_finish(entity, entity->service);
5877 ++ entity->start = entity->finish;
5878 ++ sd->in_service_entity = NULL;
5879 ++ } else if (entity->tree == &st->active) {
5880 ++ /*
5881 ++ * Requeueing an entity due to a change of some
5882 ++ * next_in_service entity below it. We reuse the
5883 ++ * old start time.
5884 ++ */
5885 ++ bfq_active_extract(st, entity);
5886 ++ } else if (entity->tree == &st->idle) {
5887 ++ /*
5888 ++ * Must be on the idle tree, bfq_idle_extract() will
5889 ++ * check for that.
5890 ++ */
5891 ++ bfq_idle_extract(st, entity);
5892 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
5893 ++ st->vtime : entity->finish;
5894 ++ } else {
5895 ++ /*
5896 ++ * The finish time of the entity may be invalid, and
5897 ++ * it is in the past for sure, otherwise the queue
5898 ++ * would have been on the idle tree.
5899 ++ */
5900 ++ entity->start = st->vtime;
5901 ++ st->wsum += entity->weight;
5902 ++ bfq_get_entity(entity);
5903 ++
5904 ++ BUG_ON(entity->on_st);
5905 ++ entity->on_st = 1;
5906 ++ }
5907 ++
5908 ++ st = __bfq_entity_update_weight_prio(st, entity);
5909 ++ bfq_calc_finish(entity, entity->budget);
5910 ++ bfq_active_insert(st, entity);
5911 ++}
5912 ++
5913 ++/**
5914 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5915 ++ * @entity: the entity to activate.
5916 ++ *
5917 ++ * Activate @entity and all the entities on the path from it to the root.
5918 ++ */
5919 ++static void bfq_activate_entity(struct bfq_entity *entity)
5920 ++{
5921 ++ struct bfq_sched_data *sd;
5922 ++
5923 ++ for_each_entity(entity) {
5924 ++ __bfq_activate_entity(entity);
5925 ++
5926 ++ sd = entity->sched_data;
5927 ++ if (!bfq_update_next_in_service(sd))
5928 ++ /*
5929 ++ * No need to propagate the activation to the
5930 ++ * upper entities, as they will be updated when
5931 ++ * the in-service entity is rescheduled.
5932 ++ */
5933 ++ break;
5934 ++ }
5935 ++}
5936 ++
5937 ++/**
5938 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5939 ++ * @entity: the entity to deactivate.
5940 ++ * @requeue: if false, the entity will not be put into the idle tree.
5941 ++ *
5942 ++ * Deactivate an entity, independently of its previous state. If the
5943 ++ * entity was not on a service tree just return; otherwise, if it is on
5944 ++ * any scheduler tree, extract it from that tree and, if the caller
5945 ++ * specified @requeue and it is necessary, put it on the idle tree.
5946 ++ *
5947 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
5948 ++ * if the entity was in service or if it was the next_in_service for
5949 ++ * its sched_data; return %0 otherwise.
5950 ++ */
5951 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5952 ++{
5953 ++ struct bfq_sched_data *sd = entity->sched_data;
5954 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5955 ++ int was_in_service = entity == sd->in_service_entity;
5956 ++ int ret = 0;
5957 ++
5958 ++ if (!entity->on_st)
5959 ++ return 0;
5960 ++
5961 ++ BUG_ON(was_in_service && entity->tree != NULL);
5962 ++
5963 ++ if (was_in_service) {
5964 ++ bfq_calc_finish(entity, entity->service);
5965 ++ sd->in_service_entity = NULL;
5966 ++ } else if (entity->tree == &st->active)
5967 ++ bfq_active_extract(st, entity);
5968 ++ else if (entity->tree == &st->idle)
5969 ++ bfq_idle_extract(st, entity);
5970 ++ else if (entity->tree != NULL)
5971 ++ BUG();
5972 ++
5973 ++ if (was_in_service || sd->next_in_service == entity)
5974 ++ ret = bfq_update_next_in_service(sd);
5975 ++
5976 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5977 ++ bfq_forget_entity(st, entity);
5978 ++ else
5979 ++ bfq_idle_insert(st, entity);
5980 ++
5981 ++ BUG_ON(sd->in_service_entity == entity);
5982 ++ BUG_ON(sd->next_in_service == entity);
5983 ++
5984 ++ return ret;
5985 ++}
5986 ++
5987 ++/**
5988 ++ * bfq_deactivate_entity - deactivate an entity.
5989 ++ * @entity: the entity to deactivate.
5990 ++ * @requeue: true if the entity can be put on the idle tree
5991 ++ */
5992 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5993 ++{
5994 ++ struct bfq_sched_data *sd;
5995 ++ struct bfq_entity *parent;
5996 ++
5997 ++ for_each_entity_safe(entity, parent) {
5998 ++ sd = entity->sched_data;
5999 ++
6000 ++ if (!__bfq_deactivate_entity(entity, requeue))
6001 ++ /*
6002 ++ * The parent entity is still backlogged, and
6003 ++ * we don't need to update it as it is still
6004 ++ * in service.
6005 ++ */
6006 ++ break;
6007 ++
6008 ++ if (sd->next_in_service != NULL)
6009 ++ /*
6010 ++ * The parent entity is still backlogged and
6011 ++ * the budgets on the path towards the root
6012 ++ * need to be updated.
6013 ++ */
6014 ++ goto update;
6015 ++
6016 ++ /*
6017 ++	 * If we get here, the parent is no longer backlogged and
6018 ++ * we want to propagate the dequeue upwards.
6019 ++ */
6020 ++ requeue = 1;
6021 ++ }
6022 ++
6023 ++ return;
6024 ++
6025 ++update:
6026 ++ entity = parent;
6027 ++ for_each_entity(entity) {
6028 ++ __bfq_activate_entity(entity);
6029 ++
6030 ++ sd = entity->sched_data;
6031 ++ if (!bfq_update_next_in_service(sd))
6032 ++ break;
6033 ++ }
6034 ++}
6035 ++
6036 ++/**
6037 ++ * bfq_update_vtime - update vtime if necessary.
6038 ++ * @st: the service tree to act upon.
6039 ++ *
6040 ++ * If necessary update the service tree vtime to have at least one
6041 ++ * eligible entity, skipping to its start time. Assumes that the
6042 ++ * active tree of the device is not empty.
6043 ++ *
6044 ++ * NOTE: this hierarchical implementation updates vtimes quite often,
6045 ++ * we may end up with reactivated processes getting timestamps after a
6046 ++ * vtime skip done because we needed a ->first_active entity on some
6047 ++ * intermediate node.
6048 ++ */
6049 ++static void bfq_update_vtime(struct bfq_service_tree *st)
6050 ++{
6051 ++ struct bfq_entity *entry;
6052 ++ struct rb_node *node = st->active.rb_node;
6053 ++
6054 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
6055 ++ if (bfq_gt(entry->min_start, st->vtime)) {
6056 ++ st->vtime = entry->min_start;
6057 ++ bfq_forget_idle(st);
6058 ++ }
6059 ++}
6060 ++
6061 ++/**
6062 ++ * bfq_first_active_entity - find the eligible entity with
6063 ++ * the smallest finish time
6064 ++ * @st: the service tree to select from.
6065 ++ *
6066 ++ * This function searches for the first schedulable entity, starting from
6067 ++ * the root of the tree and going left whenever the left subtree contains
6068 ++ * at least one eligible (start <= vtime) entity. The path on
6069 ++ * the right is followed only if a) the left subtree contains no eligible
6070 ++ * entities and b) no eligible entity has been found yet.
6071 ++ */
6072 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
6073 ++{
6074 ++ struct bfq_entity *entry, *first = NULL;
6075 ++ struct rb_node *node = st->active.rb_node;
6076 ++
6077 ++ while (node != NULL) {
6078 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
6079 ++left:
6080 ++ if (!bfq_gt(entry->start, st->vtime))
6081 ++ first = entry;
6082 ++
6083 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
6084 ++
6085 ++ if (node->rb_left != NULL) {
6086 ++ entry = rb_entry(node->rb_left,
6087 ++ struct bfq_entity, rb_node);
6088 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
6089 ++ node = node->rb_left;
6090 ++ goto left;
6091 ++ }
6092 ++ }
6093 ++ if (first != NULL)
6094 ++ break;
6095 ++ node = node->rb_right;
6096 ++ }
6097 ++
6098 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
6099 ++ return first;
6100 ++}
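++
++/*
++ * Worked example of the min_start augmentation used by the lookup above
++ * (made-up timestamps). Suppose the active tree, ordered by finish time,
++ * is:
++ *
++ *                A (start 30, finish 50)
++ *               /                       \
++ *   B (start 10, finish 40)       C (start 60, finish 70)
++ *
++ * Then min_start(B) = 10, min_start(C) = 60 and
++ * min_start(A) = min(30, 10, 60) = 10. With vtime = 20 the root A is not
++ * eligible (start 30 > 20), but its left child advertises
++ * min_start 10 <= 20, so the search descends to B and returns it, while
++ * the whole right subtree is skipped because min_start(C) = 60 > 20.
++ * This is what keeps the eligible-entity lookup logarithmic.
++ */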
6101 ++
6102 ++/**
6103 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
6104 ++ * @st: the service tree.
6105 ++ *
6106 ++ * Update the virtual time in @st and return the first eligible entity
6107 ++ * it contains.
6108 ++ */
6109 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
6110 ++ bool force)
6111 ++{
6112 ++ struct bfq_entity *entity, *new_next_in_service = NULL;
6113 ++
6114 ++ if (RB_EMPTY_ROOT(&st->active))
6115 ++ return NULL;
6116 ++
6117 ++ bfq_update_vtime(st);
6118 ++ entity = bfq_first_active_entity(st);
6119 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
6120 ++
6121 ++ /*
6122 ++	 * If the chosen entity does not match the sched_data's
6123 ++	 * next_in_service and we are forcibly serving the IDLE priority
6124 ++	 * class tree, bubble up the budget update.
6125 ++ */
6126 ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
6127 ++ new_next_in_service = entity;
6128 ++ for_each_entity(new_next_in_service)
6129 ++ bfq_update_budget(new_next_in_service);
6130 ++ }
6131 ++
6132 ++ return entity;
6133 ++}
6134 ++
6135 ++/**
6136 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
6137 ++ * @sd: the sched_data.
6138 ++ * @extract: if true the returned entity will be also extracted from @sd.
6139 ++ *
6140 ++ * NOTE: since we cache the next_in_service entity at each level of the
6141 ++ * hierarchy, the complexity of the lookup can be decreased with
6142 ++ * absolutely no effort by just returning the cached next_in_service value;
6143 ++ * we prefer to do full lookups to test the consistency of the data
6144 ++ * structures.
6145 ++ */
6146 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
6147 ++ int extract,
6148 ++ struct bfq_data *bfqd)
6149 ++{
6150 ++ struct bfq_service_tree *st = sd->service_tree;
6151 ++ struct bfq_entity *entity;
6152 ++ int i = 0;
6153 ++
6154 ++ BUG_ON(sd->in_service_entity != NULL);
6155 ++
6156 ++ if (bfqd != NULL &&
6157 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
6158 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
6159 ++ true);
6160 ++ if (entity != NULL) {
6161 ++ i = BFQ_IOPRIO_CLASSES - 1;
6162 ++ bfqd->bfq_class_idle_last_service = jiffies;
6163 ++ sd->next_in_service = entity;
6164 ++ }
6165 ++ }
6166 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
6167 ++ entity = __bfq_lookup_next_entity(st + i, false);
6168 ++ if (entity != NULL) {
6169 ++ if (extract) {
6170 ++ bfq_check_next_in_service(sd, entity);
6171 ++ bfq_active_extract(st + i, entity);
6172 ++ sd->in_service_entity = entity;
6173 ++ sd->next_in_service = NULL;
6174 ++ }
6175 ++ break;
6176 ++ }
6177 ++ }
6178 ++
6179 ++ return entity;
6180 ++}
6181 ++
6182 ++/*
6183 ++ * Get next queue for service.
6184 ++ */
6185 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
6186 ++{
6187 ++ struct bfq_entity *entity = NULL;
6188 ++ struct bfq_sched_data *sd;
6189 ++ struct bfq_queue *bfqq;
6190 ++
6191 ++ BUG_ON(bfqd->in_service_queue != NULL);
6192 ++
6193 ++ if (bfqd->busy_queues == 0)
6194 ++ return NULL;
6195 ++
6196 ++ sd = &bfqd->root_group->sched_data;
6197 ++ for (; sd != NULL; sd = entity->my_sched_data) {
6198 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
6199 ++ BUG_ON(entity == NULL);
6200 ++ entity->service = 0;
6201 ++ }
6202 ++
6203 ++ bfqq = bfq_entity_to_bfqq(entity);
6204 ++ BUG_ON(bfqq == NULL);
6205 ++
6206 ++ return bfqq;
6207 ++}
6208 ++
6209 ++/*
6210 ++ * Forced extraction of the given queue.
6211 ++ */
6212 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
6213 ++ struct bfq_queue *bfqq)
6214 ++{
6215 ++ struct bfq_entity *entity;
6216 ++ struct bfq_sched_data *sd;
6217 ++
6218 ++ BUG_ON(bfqd->in_service_queue != NULL);
6219 ++
6220 ++ entity = &bfqq->entity;
6221 ++ /*
6222 ++ * Bubble up extraction/update from the leaf to the root.
6223 ++ */
6224 ++ for_each_entity(entity) {
6225 ++ sd = entity->sched_data;
6226 ++ bfq_update_budget(entity);
6227 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
6228 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
6229 ++ sd->in_service_entity = entity;
6230 ++ sd->next_in_service = NULL;
6231 ++ entity->service = 0;
6232 ++ }
6233 ++
6234 ++ return;
6235 ++}
6236 ++
6237 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
6238 ++{
6239 ++ if (bfqd->in_service_bic != NULL) {
6240 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
6241 ++ bfqd->in_service_bic = NULL;
6242 ++ }
6243 ++
6244 ++ bfqd->in_service_queue = NULL;
6245 ++ del_timer(&bfqd->idle_slice_timer);
6246 ++}
6247 ++
6248 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6249 ++ int requeue)
6250 ++{
6251 ++ struct bfq_entity *entity = &bfqq->entity;
6252 ++
6253 ++ if (bfqq == bfqd->in_service_queue)
6254 ++ __bfq_bfqd_reset_in_service(bfqd);
6255 ++
6256 ++ bfq_deactivate_entity(entity, requeue);
6257 ++}
6258 ++
6259 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6260 ++{
6261 ++ struct bfq_entity *entity = &bfqq->entity;
6262 ++
6263 ++ bfq_activate_entity(entity);
6264 ++}
6265 ++
6266 ++/*
6267 ++ * Called when the bfqq no longer has requests pending; remove it from
6268 ++ * the service tree.
6269 ++ */
6270 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6271 ++ int requeue)
6272 ++{
6273 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
6274 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
6275 ++
6276 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
6277 ++
6278 ++ bfq_clear_bfqq_busy(bfqq);
6279 ++
6280 ++ BUG_ON(bfqd->busy_queues == 0);
6281 ++ bfqd->busy_queues--;
6282 ++
6283 ++ if (!bfqq->dispatched) {
6284 ++ bfq_weights_tree_remove(bfqd, &bfqq->entity,
6285 ++ &bfqd->queue_weights_tree);
6286 ++ if (!blk_queue_nonrot(bfqd->queue)) {
6287 ++ BUG_ON(!bfqd->busy_in_flight_queues);
6288 ++ bfqd->busy_in_flight_queues--;
6289 ++ if (bfq_bfqq_constantly_seeky(bfqq)) {
6290 ++ BUG_ON(!bfqd->
6291 ++ const_seeky_busy_in_flight_queues);
6292 ++ bfqd->const_seeky_busy_in_flight_queues--;
6293 ++ }
6294 ++ }
6295 ++ }
6296 ++ if (bfqq->wr_coeff > 1)
6297 ++ bfqd->wr_busy_queues--;
6298 ++
6299 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
6300 ++}
6301 ++
6302 ++/*
6303 ++ * Called when an inactive queue receives a new request.
6304 ++ */
6305 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6306 ++{
6307 ++ BUG_ON(bfq_bfqq_busy(bfqq));
6308 ++ BUG_ON(bfqq == bfqd->in_service_queue);
6309 ++
6310 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
6311 ++
6312 ++ bfq_activate_bfqq(bfqd, bfqq);
6313 ++
6314 ++ bfq_mark_bfqq_busy(bfqq);
6315 ++ bfqd->busy_queues++;
6316 ++
6317 ++ if (!bfqq->dispatched) {
6318 ++ if (bfqq->wr_coeff == 1)
6319 ++ bfq_weights_tree_add(bfqd, &bfqq->entity,
6320 ++ &bfqd->queue_weights_tree);
6321 ++ if (!blk_queue_nonrot(bfqd->queue)) {
6322 ++ bfqd->busy_in_flight_queues++;
6323 ++ if (bfq_bfqq_constantly_seeky(bfqq))
6324 ++ bfqd->const_seeky_busy_in_flight_queues++;
6325 ++ }
6326 ++ }
6327 ++ if (bfqq->wr_coeff > 1)
6328 ++ bfqd->wr_busy_queues++;
6329 ++}
6330 +diff --git a/block/bfq.h b/block/bfq.h
6331 +new file mode 100644
6332 +index 0000000..e350b5f
6333 +--- /dev/null
6334 ++++ b/block/bfq.h
6335 +@@ -0,0 +1,771 @@
6336 ++/*
6337 ++ * BFQ-v7r8 for 4.3.0: data structures and common function prototypes.
6338 ++ *
6339 ++ * Based on ideas and code from CFQ:
6340 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6341 ++ *
6342 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6343 ++ * Paolo Valente <paolo.valente@×××××××.it>
6344 ++ *
6345 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6346 ++ */
6347 ++
6348 ++#ifndef _BFQ_H
6349 ++#define _BFQ_H
6350 ++
6351 ++#include <linux/blktrace_api.h>
6352 ++#include <linux/hrtimer.h>
6353 ++#include <linux/ioprio.h>
6354 ++#include <linux/rbtree.h>
6355 ++
6356 ++#define BFQ_IOPRIO_CLASSES 3
6357 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
6358 ++
6359 ++#define BFQ_MIN_WEIGHT 1
6360 ++#define BFQ_MAX_WEIGHT 1000
6361 ++
6362 ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4
6363 ++
6364 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
6365 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
6366 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
6367 ++
6368 ++struct bfq_entity;
6369 ++
6370 ++/**
6371 ++ * struct bfq_service_tree - per ioprio_class service tree.
6372 ++ * @active: tree for active entities (i.e., those backlogged).
6373 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
6374 ++ * @first_idle: idle entity with minimum F_i.
6375 ++ * @last_idle: idle entity with maximum F_i.
6376 ++ * @vtime: scheduler virtual time.
6377 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
6378 ++ *
6379 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
6380 ++ * ioprio_class has its own independent scheduler, and so its own
6381 ++ * bfq_service_tree. All the fields are protected by the queue lock
6382 ++ * of the containing bfqd.
6383 ++ */
6384 ++struct bfq_service_tree {
6385 ++ struct rb_root active;
6386 ++ struct rb_root idle;
6387 ++
6388 ++ struct bfq_entity *first_idle;
6389 ++ struct bfq_entity *last_idle;
6390 ++
6391 ++ u64 vtime;
6392 ++ unsigned long wsum;
6393 ++};
6394 ++
6395 ++/**
6396 ++ * struct bfq_sched_data - multi-class scheduler.
6397 ++ * @in_service_entity: entity in service.
6398 ++ * @next_in_service: head-of-the-line entity in the scheduler.
6399 ++ * @service_tree: array of service trees, one per ioprio_class.
6400 ++ *
6401 ++ * bfq_sched_data is the basic scheduler queue. It supports three
6402 ++ * ioprio_classes, and can be used either as a toplevel queue or as
6403 ++ * an intermediate queue on a hierarchical setup.
6404 ++ * @next_in_service points to the active entity of the sched_data
6405 ++ * service trees that will be scheduled next.
6406 ++ *
6407 ++ * The supported ioprio_classes are the same as in CFQ, in descending
6408 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
6409 ++ * Requests from higher priority queues are served before all the
6410 ++ * requests from lower priority queues; among requests of the same
6411 ++ * queue requests are served according to B-WF2Q+.
6412 ++ * All the fields are protected by the queue lock of the containing bfqd.
6413 ++ */
6414 ++struct bfq_sched_data {
6415 ++ struct bfq_entity *in_service_entity;
6416 ++ struct bfq_entity *next_in_service;
6417 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
6418 ++};
6419 ++
6420 ++/**
6421 ++ * struct bfq_weight_counter - counter of the number of all active entities
6422 ++ * with a given weight.
6423 ++ * @weight: weight of the entities that this counter refers to.
6424 ++ * @num_active: number of active entities with this weight.
6425 ++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree
6426 ++ * and @group_weights_tree).
6427 ++ */
6428 ++struct bfq_weight_counter {
6429 ++ short int weight;
6430 ++ unsigned int num_active;
6431 ++ struct rb_node weights_node;
6432 ++};
6433 ++
6434 ++/**
6435 ++ * struct bfq_entity - schedulable entity.
6436 ++ * @rb_node: service_tree member.
6437 ++ * @weight_counter: pointer to the weight counter associated with this entity.
6438 ++ * @on_st: flag, true if the entity is on a tree (either the active or
6439 ++ * the idle one of its service_tree).
6440 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
6441 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
6442 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
6443 ++ * @min_start: minimum start time of the (active) subtree rooted at
6444 ++ * this entity; used for O(log N) lookups into active trees.
6445 ++ * @service: service received during the last round of service.
6446 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
6447 ++ * @weight: weight of the queue
6448 ++ * @parent: parent entity, for hierarchical scheduling.
6449 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
6450 ++ * associated scheduler queue, %NULL on leaf nodes.
6451 ++ * @sched_data: the scheduler queue this entity belongs to.
6452 ++ * @ioprio: the ioprio in use.
6453 ++ * @new_weight: when a weight change is requested, the new weight value.
6454 ++ * @orig_weight: original weight, used to implement weight boosting
6455 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
6456 ++ * @ioprio_class: the ioprio_class in use.
6457 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
6458 ++ * ioprio_class value.
6459 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
6460 ++ * ioprio_class change.
6461 ++ *
6462 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
6463 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
6464 ++ * entity belongs to the sched_data of the parent group in the cgroup
6465 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
6466 ++ * in @my_sched_data.
6467 ++ *
6468 ++ * Each entity stores independently its priority values; this would
6469 ++ * allow different weights on different devices, but this
6470 ++ * functionality is not exported to userspace as of now. Priorities and
6471 ++ * weights are updated lazily, first storing the new values into the
6472 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
6473 ++ * there is a transition in the entity state that allows the priority
6474 ++ * update to take place the effective and the requested priority
6475 ++ * values are synchronized.
6476 ++ *
6477 ++ * Unless cgroups are used, the weight value is calculated from the
6478 ++ * ioprio to export the same interface as CFQ. When dealing with
6479 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
6480 ++ * time to consume their budget and have true sequential behavior, and
6481 ++ * when there are no external factors breaking anticipation) the
6482 ++ * relative weights at each level of the cgroups hierarchy should be
6483 ++ * guaranteed. All the fields are protected by the queue lock of the
6484 ++ * containing bfqd.
6485 ++ */
6486 ++struct bfq_entity {
6487 ++ struct rb_node rb_node;
6488 ++ struct bfq_weight_counter *weight_counter;
6489 ++
6490 ++ int on_st;
6491 ++
6492 ++ u64 finish;
6493 ++ u64 start;
6494 ++
6495 ++ struct rb_root *tree;
6496 ++
6497 ++ u64 min_start;
6498 ++
6499 ++ unsigned long service, budget;
6500 ++ unsigned short weight, new_weight;
6501 ++ unsigned short orig_weight;
6502 ++
6503 ++ struct bfq_entity *parent;
6504 ++
6505 ++ struct bfq_sched_data *my_sched_data;
6506 ++ struct bfq_sched_data *sched_data;
6507 ++
6508 ++ unsigned short ioprio, new_ioprio;
6509 ++ unsigned short ioprio_class, new_ioprio_class;
6510 ++
6511 ++ int ioprio_changed;
6512 ++};
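As the comment block above describes, weight and ioprio changes are applied lazily. A minimal editorial sketch of the writer side of that protocol follows (illustration only, not part of the patch; the code that later folds the new_* fields into the effective ones lives elsewhere in the scheduler):

	/* request a weight change: only record the new value for now */
	entity->new_weight = 300;	/* 300 is an arbitrary example value */
	entity->ioprio_changed = 1;
	/*
	 * entity->weight itself is synchronized later, on a state
	 * transition of the entity (e.g., its next activation).
	 */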
6513 ++
6514 ++struct bfq_group;
6515 ++
6516 ++/**
6517 ++ * struct bfq_queue - leaf schedulable entity.
6518 ++ * @ref: reference counter.
6519 ++ * @bfqd: parent bfq_data.
6520 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
6521 ++ * one or more other queues.
6522 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
6523 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
6524 ++ * @sort_list: sorted list of pending requests.
6525 ++ * @next_rq: if fifo isn't expired, next request to serve.
6526 ++ * @queued: nr of requests queued in @sort_list.
6527 ++ * @allocated: currently allocated requests.
6528 ++ * @meta_pending: pending metadata requests.
6529 ++ * @fifo: fifo list of requests in sort_list.
6530 ++ * @entity: entity representing this queue in the scheduler.
6531 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
6532 ++ * @budget_timeout: budget expiration (in jiffies).
6533 ++ * @dispatched: number of requests on the dispatch list or inside driver.
6534 ++ * @flags: status flags.
6535 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
6536 ++ * @burst_list_node: node for the device's burst list.
6537 ++ * @seek_samples: number of seeks sampled
6538 ++ * @seek_total: sum of the distances of the seeks sampled
6539 ++ * @seek_mean: mean seek distance
6540 ++ * @last_request_pos: position of the last request enqueued
6541 ++ * @requests_within_timer: number of consecutive pairs of request completion
6542 ++ * and arrival, such that the queue becomes idle
6543 ++ * after the completion, but the next request arrives
6544 ++ * within an idle time slice; used only if the queue's
6545 ++ * IO_bound has been cleared.
6546 ++ * @pid: pid of the process owning the queue, used for logging purposes.
6547 ++ * @last_wr_start_finish: start time of the current weight-raising period if
6548 ++ * the @bfq_queue is being weight-raised, otherwise
6549 ++ * finish time of the last weight-raising period
6550 ++ * @wr_cur_max_time: current max raising time for this queue
6551 ++ * @soft_rt_next_start: minimum time instant such that, only if a new
6552 ++ * request is enqueued after this time instant in an
6553 ++ * idle @bfq_queue with no outstanding requests, then
6554 ++ * the task associated with the queue is deemed
6555 ++ * soft real-time (see the comments to the function
6556 ++ * bfq_bfqq_softrt_next_start()).
6557 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
6558 ++ * idle to backlogged
6559 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
6560 ++ * since the last transition from idle to
6561 ++ * backlogged
6562 ++ *
6563 ++ * A bfq_queue is a leaf request queue; it can be associated with one or more
6564 ++ * io_contexts, if it is async or shared between cooperating processes. @cgroup
6565 ++ * holds a reference to the cgroup, to be sure that it does not disappear while
6566 ++ * a bfqq still references it (mostly to avoid races between request issuing and
6567 ++ * task migration followed by cgroup destruction).
6568 ++ * All the fields are protected by the queue lock of the containing bfqd.
6569 ++ */
6570 ++struct bfq_queue {
6571 ++ atomic_t ref;
6572 ++ struct bfq_data *bfqd;
6573 ++
6574 ++ /* fields for cooperating queues handling */
6575 ++ struct bfq_queue *new_bfqq;
6576 ++ struct rb_node pos_node;
6577 ++ struct rb_root *pos_root;
6578 ++
6579 ++ struct rb_root sort_list;
6580 ++ struct request *next_rq;
6581 ++ int queued[2];
6582 ++ int allocated[2];
6583 ++ int meta_pending;
6584 ++ struct list_head fifo;
6585 ++
6586 ++ struct bfq_entity entity;
6587 ++
6588 ++ unsigned long max_budget;
6589 ++ unsigned long budget_timeout;
6590 ++
6591 ++ int dispatched;
6592 ++
6593 ++ unsigned int flags;
6594 ++
6595 ++ struct list_head bfqq_list;
6596 ++
6597 ++ struct hlist_node burst_list_node;
6598 ++
6599 ++ unsigned int seek_samples;
6600 ++ u64 seek_total;
6601 ++ sector_t seek_mean;
6602 ++ sector_t last_request_pos;
6603 ++
6604 ++ unsigned int requests_within_timer;
6605 ++
6606 ++ pid_t pid;
6607 ++
6608 ++ /* weight-raising fields */
6609 ++ unsigned long wr_cur_max_time;
6610 ++ unsigned long soft_rt_next_start;
6611 ++ unsigned long last_wr_start_finish;
6612 ++ unsigned int wr_coeff;
6613 ++ unsigned long last_idle_bklogged;
6614 ++ unsigned long service_from_backlogged;
6615 ++};
6616 ++
6617 ++/**
6618 ++ * struct bfq_ttime - per process thinktime stats.
6619 ++ * @ttime_total: total process thinktime
6620 ++ * @ttime_samples: number of thinktime samples
6621 ++ * @ttime_mean: average process thinktime
6622 ++ */
6623 ++struct bfq_ttime {
6624 ++ unsigned long last_end_request;
6625 ++
6626 ++ unsigned long ttime_total;
6627 ++ unsigned long ttime_samples;
6628 ++ unsigned long ttime_mean;
6629 ++};
6630 ++
6631 ++/**
6632 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
6633 ++ * @icq: associated io_cq structure
6634 ++ * @bfqq: array of two process queues, the sync and the async
6635 ++ * @ttime: associated @bfq_ttime struct
6636 ++ */
6637 ++struct bfq_io_cq {
6638 ++ struct io_cq icq; /* must be the first member */
6639 ++ struct bfq_queue *bfqq[2];
6640 ++ struct bfq_ttime ttime;
6641 ++ int ioprio;
6642 ++};
6643 ++
6644 ++enum bfq_device_speed {
6645 ++ BFQ_BFQD_FAST,
6646 ++ BFQ_BFQD_SLOW,
6647 ++};
6648 ++
6649 ++/**
6650 ++ * struct bfq_data - per device data structure.
6651 ++ * @queue: request queue for the managed device.
6652 ++ * @root_group: root bfq_group for the device.
6653 ++ * @rq_pos_tree: rbtree sorted by next_request position, used when
6654 ++ * determining if two or more queues have interleaving
6655 ++ * requests (see bfq_close_cooperator()).
6656 ++ * @active_numerous_groups: number of bfq_groups containing more than one
6657 ++ * active @bfq_entity.
6658 ++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
6659 ++ * weight. Used to keep track of whether all @bfq_queues
6660 ++ * have the same weight. The tree contains one counter
6661 ++ * for each distinct weight associated to some active
6662 ++ * and not weight-raised @bfq_queue (see the comments to
6663 ++ * the functions bfq_weights_tree_[add|remove] for
6664 ++ * further details).
6665 ++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted
6666 ++ * by weight. Used to keep track of whether all
6667 ++ * @bfq_groups have the same weight. The tree contains
6668 ++ * one counter for each distinct weight associated to
6669 ++ * some active @bfq_group (see the comments to the
6670 ++ * functions bfq_weights_tree_[add|remove] for further
6671 ++ * details).
6672 ++ * @busy_queues: number of bfq_queues containing requests (including the
6673 ++ * queue in service, even if it is idling).
6674 ++ * @busy_in_flight_queues: number of @bfq_queues containing pending or
6675 ++ * in-flight requests, plus the @bfq_queue in
6676 ++ * service, even if idle but waiting for the
6677 ++ * possible arrival of its next sync request. This
6678 ++ * field is updated only if the device is rotational,
6679 ++ * but used only if the device is also NCQ-capable.
6680 ++ * The reason why the field is updated also for non-
6681 ++ * NCQ-capable rotational devices is related to the
6682 ++ * fact that the value of @hw_tag may be set also
6683 ++ * later than when busy_in_flight_queues may need to
6684 ++ * be incremented for the first time(s). Taking also
6685 ++ * this possibility into account, to avoid unbalanced
6686 ++ * increments/decrements, would imply more overhead
6687 ++ * than just updating busy_in_flight_queues
6688 ++ * regardless of the value of @hw_tag.
6689 ++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
6690 ++ * (that is, seeky queues that expired
6691 ++ * for budget timeout at least once)
6692 ++ * containing pending or in-flight
6693 ++ * requests, including the in-service
6694 ++ * @bfq_queue if constantly seeky. This
6695 ++ * field is updated only if the device
6696 ++ * is rotational, but used only if the
6697 ++ * device is also NCQ-capable (see the
6698 ++ * comments to @busy_in_flight_queues).
6699 ++ * @wr_busy_queues: number of weight-raised busy @bfq_queues.
6700 ++ * @queued: number of queued requests.
6701 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
6702 ++ * @sync_flight: number of sync requests in the driver.
6703 ++ * @max_rq_in_driver: max number of reqs in driver in the last
6704 ++ * @hw_tag_samples completed requests.
6705 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
6706 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
6707 ++ * @budgets_assigned: number of budgets assigned.
6708 ++ * @idle_slice_timer: timer set when idling for the next sequential request
6709 ++ * from the queue in service.
6710 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
6711 ++ * @in_service_queue: bfq_queue in service.
6712 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
6713 ++ * @last_position: on-disk position of the last served request.
6714 ++ * @last_budget_start: beginning of the last budget.
6715 ++ * @last_idling_start: beginning of the last idle slice.
6716 ++ * @peak_rate: peak transfer rate observed for a budget.
6717 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
6718 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before
6719 ++ * rescheduling.
6720 ++ * @group_list: list of all the bfq_groups active on the device.
6721 ++ * @active_list: list of all the bfq_queues active on the device.
6722 ++ * @idle_list: list of all the bfq_queues idle on the device.
6723 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
6724 ++ * requests are served in fifo order.
6725 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
6726 ++ * @bfq_back_max: maximum allowed backward seek.
6727 ++ * @bfq_slice_idle: maximum idling time.
6728 ++ * @bfq_user_max_budget: user-configured max budget value
6729 ++ * (0 for auto-tuning).
6730 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
6731 ++ * async queues.
6732 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
6733 ++ * prevent seeky queues from imposing long latencies on
6734 ++ * well-behaved ones (this also implies that seeky queues
6735 ++ * cannot receive guarantees in the service domain; after a
6736 ++ * timeout they are charged for the whole allocated budget, to
6737 ++ * try to preserve a reasonably fair behavior among them, but
6738 ++ * without service-domain guarantees).
6739 ++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
6740 ++ * no longer granted any weight-raising.
6741 ++ * @bfq_failed_cooperations: number of consecutive failed cooperation
6742 ++ * chances after which weight-raising is restored
6743 ++ * to a queue subject to more than bfq_coop_thresh
6744 ++ * queue merges.
6745 ++ * @bfq_requests_within_timer: number of consecutive requests that must be
6746 ++ * issued within the idle time slice to set
6747 ++ * again idling to a queue which was marked as
6748 ++ * non-I/O-bound (see the definition of the
6749 ++ * IO_bound flag for further details).
6750 ++ * @last_ins_in_burst: last time at which a queue entered the current
6751 ++ * burst of queues being activated shortly after
6752 ++ * each other; for more details about this and the
6753 ++ * following parameters related to a burst of
6754 ++ * activations, see the comments to the function
6755 ++ * @bfq_handle_burst.
6756 ++ * @bfq_burst_interval: reference time interval used to decide whether a
6757 ++ * queue has been activated shortly after
6758 ++ * @last_ins_in_burst.
6759 ++ * @burst_size: number of queues in the current burst of queue activations.
6760 ++ * @bfq_large_burst_thresh: maximum burst size above which the current
6761 ++ * queue-activation burst is deemed as 'large'.
6762 ++ * @large_burst: true if a large queue-activation burst is in progress.
6763 ++ * @burst_list: head of the burst list (as for the above fields, more details
6764 ++ * in the comments to the function bfq_handle_burst).
6765 ++ * @low_latency: if set to true, low-latency heuristics are enabled.
6766 ++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
6767 ++ * queue is multiplied.
6768 ++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).
6769 ++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.
6770 ++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
6771 ++ * may be reactivated for a queue (in jiffies).
6772 ++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
6773 ++ * after which weight-raising may be
6774 ++ * reactivated for an already busy queue
6775 ++ * (in jiffies).
6776 ++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,
6777 ++ * in sectors per second.
6778 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
6779 ++ * duration of the weight raising automatically.
6780 ++ * @device_speed: device-speed class for the low-latency heuristic.
6781 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.
6782 ++ *
6783 ++ * All the fields are protected by the @queue lock.
6784 ++ */
6785 ++struct bfq_data {
6786 ++ struct request_queue *queue;
6787 ++
6788 ++ struct bfq_group *root_group;
6789 ++ struct rb_root rq_pos_tree;
6790 ++
6791 ++#ifdef CONFIG_CGROUP_BFQIO
6792 ++ int active_numerous_groups;
6793 ++#endif
6794 ++
6795 ++ struct rb_root queue_weights_tree;
6796 ++ struct rb_root group_weights_tree;
6797 ++
6798 ++ int busy_queues;
6799 ++ int busy_in_flight_queues;
6800 ++ int const_seeky_busy_in_flight_queues;
6801 ++ int wr_busy_queues;
6802 ++ int queued;
6803 ++ int rq_in_driver;
6804 ++ int sync_flight;
6805 ++
6806 ++ int max_rq_in_driver;
6807 ++ int hw_tag_samples;
6808 ++ int hw_tag;
6809 ++
6810 ++ int budgets_assigned;
6811 ++
6812 ++ struct timer_list idle_slice_timer;
6813 ++ struct work_struct unplug_work;
6814 ++
6815 ++ struct bfq_queue *in_service_queue;
6816 ++ struct bfq_io_cq *in_service_bic;
6817 ++
6818 ++ sector_t last_position;
6819 ++
6820 ++ ktime_t last_budget_start;
6821 ++ ktime_t last_idling_start;
6822 ++ int peak_rate_samples;
6823 ++ u64 peak_rate;
6824 ++ unsigned long bfq_max_budget;
6825 ++
6826 ++ struct hlist_head group_list;
6827 ++ struct list_head active_list;
6828 ++ struct list_head idle_list;
6829 ++
6830 ++ unsigned int bfq_fifo_expire[2];
6831 ++ unsigned int bfq_back_penalty;
6832 ++ unsigned int bfq_back_max;
6833 ++ unsigned int bfq_slice_idle;
6834 ++ u64 bfq_class_idle_last_service;
6835 ++
6836 ++ unsigned int bfq_user_max_budget;
6837 ++ unsigned int bfq_max_budget_async_rq;
6838 ++ unsigned int bfq_timeout[2];
6839 ++
6840 ++ unsigned int bfq_coop_thresh;
6841 ++ unsigned int bfq_failed_cooperations;
6842 ++ unsigned int bfq_requests_within_timer;
6843 ++
6844 ++ unsigned long last_ins_in_burst;
6845 ++ unsigned long bfq_burst_interval;
6846 ++ int burst_size;
6847 ++ unsigned long bfq_large_burst_thresh;
6848 ++ bool large_burst;
6849 ++ struct hlist_head burst_list;
6850 ++
6851 ++ bool low_latency;
6852 ++
6853 ++ /* parameters of the low_latency heuristics */
6854 ++ unsigned int bfq_wr_coeff;
6855 ++ unsigned int bfq_wr_max_time;
6856 ++ unsigned int bfq_wr_rt_max_time;
6857 ++ unsigned int bfq_wr_min_idle_time;
6858 ++ unsigned long bfq_wr_min_inter_arr_async;
6859 ++ unsigned int bfq_wr_max_softrt_rate;
6860 ++ u64 RT_prod;
6861 ++ enum bfq_device_speed device_speed;
6862 ++
6863 ++ struct bfq_queue oom_bfqq;
6864 ++};
6865 ++
6866 ++enum bfqq_state_flags {
6867 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */
6868 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
6869 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
6870 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
6871 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
6872 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
6873 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
6874 ++ BFQ_BFQQ_FLAG_IO_bound, /*
6875 ++ * bfqq has timed-out at least once
6876 ++ * having consumed at most 2/10 of
6877 ++ * its budget
6878 ++ */
6879 ++ BFQ_BFQQ_FLAG_in_large_burst, /*
6880 ++ * bfqq activated in a large burst,
6881 ++ * see comments to bfq_handle_burst.
6882 ++ */
6883 ++ BFQ_BFQQ_FLAG_constantly_seeky, /*
6884 ++ * bfqq has proved to be slow and
6885 ++ * seeky until budget timeout
6886 ++ */
6887 ++ BFQ_BFQQ_FLAG_softrt_update, /*
6888 ++ * may need softrt-next-start
6889 ++ * update
6890 ++ */
6891 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6892 ++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
6893 ++};
6894 ++
6895 ++#define BFQ_BFQQ_FNS(name) \
6896 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
6897 ++{ \
6898 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
6899 ++} \
6900 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
6901 ++{ \
6902 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
6903 ++} \
6904 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
6905 ++{ \
6906 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
6907 ++}
6908 ++
6909 ++BFQ_BFQQ_FNS(busy);
6910 ++BFQ_BFQQ_FNS(wait_request);
6911 ++BFQ_BFQQ_FNS(must_alloc);
6912 ++BFQ_BFQQ_FNS(fifo_expire);
6913 ++BFQ_BFQQ_FNS(idle_window);
6914 ++BFQ_BFQQ_FNS(sync);
6915 ++BFQ_BFQQ_FNS(budget_new);
6916 ++BFQ_BFQQ_FNS(IO_bound);
6917 ++BFQ_BFQQ_FNS(in_large_burst);
6918 ++BFQ_BFQQ_FNS(constantly_seeky);
6919 ++BFQ_BFQQ_FNS(coop);
6920 ++BFQ_BFQQ_FNS(split_coop);
6921 ++BFQ_BFQQ_FNS(softrt_update);
6922 ++#undef BFQ_BFQQ_FNS
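For one concrete flag, the macro above stamps out exactly the following three helpers (shown expanded here purely for illustration; nothing below is added to the file):

	static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
	{
		(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
	}
	static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
	{
		(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
	}
	static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
	{
		return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
	}

These are the accessors already used in bfq-sched.c above, e.g. bfq_mark_bfqq_busy() in bfq_add_bfqq_busy() and bfq_clear_bfqq_busy() in bfq_del_bfqq_busy().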
6923 ++
6924 ++/* Logging facilities. */
6925 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
6926 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
6927 ++
6928 ++#define bfq_log(bfqd, fmt, args...) \
6929 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
6930 ++
6931 ++/* Expiration reasons. */
6932 ++enum bfqq_expiration {
6933 ++ BFQ_BFQQ_TOO_IDLE = 0, /*
6934 ++ * queue has been idling for
6935 ++ * too long
6936 ++ */
6937 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
6938 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
6939 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
6940 ++};
6941 ++
6942 ++#ifdef CONFIG_CGROUP_BFQIO
6943 ++/**
6944 ++ * struct bfq_group - per (device, cgroup) data structure.
6945 ++ * @entity: schedulable entity to insert into the parent group sched_data.
6946 ++ * @sched_data: own sched_data, to contain child entities (they may be
6947 ++ * both bfq_queues and bfq_groups).
6948 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
6949 ++ * list of the containing cgroup's bfqio_cgroup.
6950 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
6951 ++ * of the groups active on the same device; used for cleanup.
6952 ++ * @bfqd: the bfq_data for the device this group acts upon.
6953 ++ * @async_bfqq: array of async queues for all the tasks belonging to
6954 ++ * the group, one queue per ioprio value per ioprio_class,
6955 ++ * except for the idle class that has only one queue.
6956 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
6957 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
6958 ++ * to avoid too many special cases during group creation/
6959 ++ * migration.
6960 ++ * @active_entities: number of active entities belonging to the group;
6961 ++ * unused for the root group. Used to know whether there
6962 ++ * are groups with more than one active @bfq_entity
6963 ++ * (see the comments to the function
6964 ++ * bfq_bfqq_must_not_expire()).
6965 ++ *
6966 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
6967 ++ * there is a set of bfq_groups, each one collecting the lower-level
6968 ++ * entities belonging to the group that are acting on the same device.
6969 ++ *
6970 ++ * Locking works as follows:
6971 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
6972 ++ * via RCU from its readers.
6973 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
6974 ++ * from the readers.
6975 ++ * o All the other fields are protected by the @bfqd queue lock.
6976 ++ */
6977 ++struct bfq_group {
6978 ++ struct bfq_entity entity;
6979 ++ struct bfq_sched_data sched_data;
6980 ++
6981 ++ struct hlist_node group_node;
6982 ++ struct hlist_node bfqd_node;
6983 ++
6984 ++ void *bfqd;
6985 ++
6986 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6987 ++ struct bfq_queue *async_idle_bfqq;
6988 ++
6989 ++ struct bfq_entity *my_entity;
6990 ++
6991 ++ int active_entities;
6992 ++};
6993 ++
6994 ++/**
6995 ++ * struct bfqio_cgroup - bfq cgroup data structure.
6996 ++ * @css: subsystem state for bfq in the containing cgroup.
6997 ++ * @online: flag marked when the subsystem is inserted.
6998 ++ * @weight: cgroup weight.
6999 ++ * @ioprio: cgroup ioprio.
7000 ++ * @ioprio_class: cgroup ioprio_class.
7001 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
7002 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
7003 ++ *
7004 ++ * @group_data is accessed using RCU, with @lock protecting the updates;
7005 ++ * @ioprio and @ioprio_class are protected by @lock.
7006 ++ */
7007 ++struct bfqio_cgroup {
7008 ++ struct cgroup_subsys_state css;
7009 ++ bool online;
7010 ++
7011 ++ unsigned short weight, ioprio, ioprio_class;
7012 ++
7013 ++ spinlock_t lock;
7014 ++ struct hlist_head group_data;
7015 ++};
7016 ++#else
7017 ++struct bfq_group {
7018 ++ struct bfq_sched_data sched_data;
7019 ++
7020 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
7021 ++ struct bfq_queue *async_idle_bfqq;
7022 ++};
7023 ++#endif
7024 ++
7025 ++static inline struct bfq_service_tree *
7026 ++bfq_entity_service_tree(struct bfq_entity *entity)
7027 ++{
7028 ++ struct bfq_sched_data *sched_data = entity->sched_data;
7029 ++ unsigned int idx = entity->ioprio_class - 1;
7030 ++
7031 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
7032 ++ BUG_ON(sched_data == NULL);
7033 ++
7034 ++ return sched_data->service_tree + idx;
7035 ++}
7036 ++
7037 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
7038 ++ bool is_sync)
7039 ++{
7040 ++ return bic->bfqq[is_sync];
7041 ++}
7042 ++
7043 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
7044 ++ struct bfq_queue *bfqq, bool is_sync)
7045 ++{
7046 ++ bic->bfqq[is_sync] = bfqq;
7047 ++}
7048 ++
7049 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
7050 ++{
7051 ++ return bic->icq.q->elevator->elevator_data;
7052 ++}
7053 ++
7054 ++/**
7055 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
7056 ++ * @ptr: a pointer to a bfqd.
7057 ++ * @flags: storage for the flags to be saved.
7058 ++ *
7059 ++ * This function allows bfqg->bfqd to be protected by the
7060 ++ * queue lock of the bfqd it references; the pointer is dereferenced
7061 ++ * under RCU, so the storage for bfqd is assured to be safe as long
7062 ++ * as the RCU read side critical section does not end. After the
7063 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
7064 ++ * sure that no other writer accessed it. If we raced with a writer,
7065 ++ * the function returns NULL, with the queue unlocked, otherwise it
7066 ++ * returns the dereferenced pointer, with the queue locked.
7067 ++ */
7068 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
7069 ++ unsigned long *flags)
7070 ++{
7071 ++ struct bfq_data *bfqd;
7072 ++
7073 ++ rcu_read_lock();
7074 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
7075 ++
7076 ++ if (bfqd != NULL) {
7077 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
7078 ++ if (*ptr == bfqd)
7079 ++ goto out;
7080 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
7081 ++ }
7082 ++
7083 ++ bfqd = NULL;
7084 ++out:
7085 ++ rcu_read_unlock();
7086 ++ return bfqd;
7087 ++}
7088 ++
7089 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
7090 ++ unsigned long *flags)
7091 ++{
7092 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
7093 ++}
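A usage sketch for the pair above, assuming a caller that starts from an RCU-protected bfq_group pointer bfqg (editorial illustration, not part of the patch):

	unsigned long flags;
	struct bfq_data *bfqd;

	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
	if (bfqd != NULL) {
		/* queue lock held here and bfqd guaranteed valid */
		/* ... operate on bfqd ... */
		bfq_put_bfqd_unlock(bfqd, &flags);
	}
	/* bfqd == NULL: we raced with a writer and nothing is locked */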
7094 ++
7095 ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic);
7096 ++static void bfq_put_queue(struct bfq_queue *bfqq);
7097 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
7098 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
7099 ++ struct bfq_group *bfqg, int is_sync,
7100 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
7101 ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
7102 ++ struct bfq_group *bfqg);
7103 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
7104 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
7105 ++
7106 ++#endif /* _BFQ_H */
7107 +--
7108 +1.9.1
7109 +
7110
7111 diff --git a/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch
7112 new file mode 100644
7113 index 0000000..305a5b0
7114 --- /dev/null
7115 +++ b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.3.patch
7116 @@ -0,0 +1,1220 @@
7117 +From 44efc3f611c09e049fe840e640c2bd2ccfde2148 Mon Sep 17 00:00:00 2001
7118 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
7119 +Date: Fri, 5 Jun 2015 17:45:40 +0200
7120 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r8 for
7121 + 4.3.0
7122 +
7123 +A set of processes may happen to perform interleaved reads, i.e., requests
7124 +whose union would give rise to a sequential read pattern. There are two
7125 +typical cases: in the first case, processes read fixed-size chunks of
7126 +data at a fixed distance from each other, while in the second case processes
7127 +may read variable-size chunks at variable distances. The latter case occurs
7128 +for example with QEMU, which splits the I/O generated by the guest into
7129 +multiple chunks, and lets these chunks be served by a pool of cooperating
7130 +processes, iteratively assigning the next chunk of I/O to the first
7131 +available process. CFQ uses actual queue merging for the first type of
7132 +processes, whereas it uses preemption to get a sequential read pattern out
7133 +of the read requests performed by the second type of processes. In the end
7134 +it uses two different mechanisms to achieve the same goal: boosting the
7135 +throughput with interleaved I/O.
7136 +
7137 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
7138 +sequential read pattern with both types of processes. The main idea is
7139 +checking newly arrived requests against the next request of the active queue
7140 +both in case of actual request insert and in case of request merge. By doing
7141 +so, both the types of processes can be handled by just merging their queues.
7142 +EQM is then simpler and more compact than the pair of mechanisms used in
7143 +CFQ.
7144 +
7145 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
7146 +properly restoring the weight-raising state of a queue when it gets back to
7147 +a non-merged state.
7148 +
7149 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
7150 +Signed-off-by: Arianna Avanzini <avanzini@××××××.com>
7151 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
7152 +---
7153 + block/bfq-iosched.c | 750 +++++++++++++++++++++++++++++++++++++---------------
7154 + block/bfq-sched.c | 28 --
7155 + block/bfq.h | 54 +++-
7156 + 3 files changed, 580 insertions(+), 252 deletions(-)
7157 +
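In outline, the unified check described in the message above reduces to a proximity test between the incoming I/O and either the in-service queue or a queue found in the position tree; a simplified editorial sketch follows (the real logic is in bfq_setup_cooperator() and its helpers in the hunks below):

	/* on request insert or bio merge for bfqq (io_struct = rq or bio) */
	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position))
		/* close to the in-service queue: try merging with it */
		new_bfqq = bfq_setup_merge(bfqq, bfqd->in_service_queue);
	else
		/* otherwise look for a close cooperator among scheduled queues */
		new_bfqq = bfq_close_cooperator(bfqd, bfqq,
				bfq_io_struct_pos(io_struct, request));

If new_bfqq is non-NULL, the requests of bfqq are then redirected to it (see bfq_merge_bfqqs() below).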
7158 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7159 +index 773b2ee..71b51c1 100644
7160 +--- a/block/bfq-iosched.c
7161 ++++ b/block/bfq-iosched.c
7162 +@@ -573,6 +573,57 @@ static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
7163 + return dur;
7164 + }
7165 +
7166 ++static inline unsigned
7167 ++bfq_bfqq_cooperations(struct bfq_queue *bfqq)
7168 ++{
7169 ++ return bfqq->bic ? bfqq->bic->cooperations : 0;
7170 ++}
7171 ++
7172 ++static inline void
7173 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
7174 ++{
7175 ++ if (bic->saved_idle_window)
7176 ++ bfq_mark_bfqq_idle_window(bfqq);
7177 ++ else
7178 ++ bfq_clear_bfqq_idle_window(bfqq);
7179 ++ if (bic->saved_IO_bound)
7180 ++ bfq_mark_bfqq_IO_bound(bfqq);
7181 ++ else
7182 ++ bfq_clear_bfqq_IO_bound(bfqq);
7183 ++ /* Assuming that the flag in_large_burst is already correctly set */
7184 ++ if (bic->wr_time_left && bfqq->bfqd->low_latency &&
7185 ++ !bfq_bfqq_in_large_burst(bfqq) &&
7186 ++ bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
7187 ++ /*
7188 ++ * Start a weight raising period with the duration given by
7189 ++		 * the wr_time_left snapshot.
7190 ++ */
7191 ++ if (bfq_bfqq_busy(bfqq))
7192 ++ bfqq->bfqd->wr_busy_queues++;
7193 ++ bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
7194 ++ bfqq->wr_cur_max_time = bic->wr_time_left;
7195 ++ bfqq->last_wr_start_finish = jiffies;
7196 ++ bfqq->entity.ioprio_changed = 1;
7197 ++ }
7198 ++ /*
7199 ++ * Clear wr_time_left to prevent bfq_bfqq_save_state() from
7200 ++ * getting confused about the queue's need of a weight-raising
7201 ++ * period.
7202 ++ */
7203 ++ bic->wr_time_left = 0;
7204 ++}
7205 ++
7206 ++/* Must be called with the queue_lock held. */
7207 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
7208 ++{
7209 ++ int process_refs, io_refs;
7210 ++
7211 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
7212 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
7213 ++ BUG_ON(process_refs < 0);
7214 ++ return process_refs;
7215 ++}
7216 ++
7217 + /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
7218 + static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
7219 + struct bfq_queue *bfqq)
7220 +@@ -817,7 +868,7 @@ static void bfq_add_request(struct request *rq)
7221 + bfq_rq_pos_tree_add(bfqd, bfqq);
7222 +
7223 + if (!bfq_bfqq_busy(bfqq)) {
7224 +- bool soft_rt,
7225 ++ bool soft_rt, coop_or_in_burst,
7226 + idle_for_long_time = time_is_before_jiffies(
7227 + bfqq->budget_timeout +
7228 + bfqd->bfq_wr_min_idle_time);
7229 +@@ -841,11 +892,12 @@ static void bfq_add_request(struct request *rq)
7230 + bfqd->last_ins_in_burst = jiffies;
7231 + }
7232 +
7233 ++ coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||
7234 ++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;
7235 + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
7236 +- !bfq_bfqq_in_large_burst(bfqq) &&
7237 ++ !coop_or_in_burst &&
7238 + time_is_before_jiffies(bfqq->soft_rt_next_start);
7239 +- interactive = !bfq_bfqq_in_large_burst(bfqq) &&
7240 +- idle_for_long_time;
7241 ++ interactive = !coop_or_in_burst && idle_for_long_time;
7242 + entity->budget = max_t(unsigned long, bfqq->max_budget,
7243 + bfq_serv_to_charge(next_rq, bfqq));
7244 +
7245 +@@ -864,11 +916,20 @@ static void bfq_add_request(struct request *rq)
7246 + if (!bfqd->low_latency)
7247 + goto add_bfqq_busy;
7248 +
7249 ++ if (bfq_bfqq_just_split(bfqq))
7250 ++ goto set_ioprio_changed;
7251 ++
7252 + /*
7253 +- * If the queue is not being boosted and has been idle
7254 +- * for enough time, start a weight-raising period
7255 ++ * If the queue:
7256 ++ * - is not being boosted,
7257 ++ * - has been idle for enough time,
7258 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
7259 ++ * shared "for its nature" or it is not shared and its
7260 ++ * requests have not been redirected to a shared queue)
7261 ++ * start a weight-raising period.
7262 + */
7263 +- if (old_wr_coeff == 1 && (interactive || soft_rt)) {
7264 ++ if (old_wr_coeff == 1 && (interactive || soft_rt) &&
7265 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
7266 + bfqq->wr_coeff = bfqd->bfq_wr_coeff;
7267 + if (interactive)
7268 + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
7269 +@@ -882,7 +943,7 @@ static void bfq_add_request(struct request *rq)
7270 + } else if (old_wr_coeff > 1) {
7271 + if (interactive)
7272 + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
7273 +- else if (bfq_bfqq_in_large_burst(bfqq) ||
7274 ++ else if (coop_or_in_burst ||
7275 + (bfqq->wr_cur_max_time ==
7276 + bfqd->bfq_wr_rt_max_time &&
7277 + !soft_rt)) {
7278 +@@ -901,18 +962,18 @@ static void bfq_add_request(struct request *rq)
7279 + /*
7280 + *
7281 + * The remaining weight-raising time is lower
7282 +- * than bfqd->bfq_wr_rt_max_time, which
7283 +- * means that the application is enjoying
7284 +- * weight raising either because deemed soft-
7285 +- * rt in the near past, or because deemed
7286 +- * interactive a long ago. In both cases,
7287 +- * resetting now the current remaining weight-
7288 +- * raising time for the application to the
7289 +- * weight-raising duration for soft rt
7290 +- * applications would not cause any latency
7291 +- * increase for the application (as the new
7292 +- * duration would be higher than the remaining
7293 +- * time).
7294 ++ * than bfqd->bfq_wr_rt_max_time, which means
7295 ++ * that the application is enjoying weight
7296 ++ * raising either because deemed soft-rt in
7297 ++ * the near past, or because deemed interactive
7298 ++ * a long ago.
7299 ++ * In both cases, resetting now the current
7300 ++ * remaining weight-raising time for the
7301 ++ * application to the weight-raising duration
7302 ++ * for soft rt applications would not cause any
7303 ++ * latency increase for the application (as the
7304 ++ * new duration would be higher than the
7305 ++ * remaining time).
7306 + *
7307 + * In addition, the application is now meeting
7308 + * the requirements for being deemed soft rt.
7309 +@@ -947,6 +1008,7 @@ static void bfq_add_request(struct request *rq)
7310 + bfqd->bfq_wr_rt_max_time;
7311 + }
7312 + }
7313 ++set_ioprio_changed:
7314 + if (old_wr_coeff != bfqq->wr_coeff)
7315 + entity->ioprio_changed = 1;
7316 + add_bfqq_busy:
7317 +@@ -1167,90 +1229,35 @@ static void bfq_end_wr(struct bfq_data *bfqd)
7318 + spin_unlock_irq(bfqd->queue->queue_lock);
7319 + }
7320 +
7321 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
7322 +- struct bio *bio)
7323 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
7324 + {
7325 +- struct bfq_data *bfqd = q->elevator->elevator_data;
7326 +- struct bfq_io_cq *bic;
7327 +- struct bfq_queue *bfqq;
7328 +-
7329 +- /*
7330 +- * Disallow merge of a sync bio into an async request.
7331 +- */
7332 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
7333 +- return 0;
7334 +-
7335 +- /*
7336 +- * Lookup the bfqq that this bio will be queued with. Allow
7337 +- * merge only if rq is queued there.
7338 +- * Queue lock is held here.
7339 +- */
7340 +- bic = bfq_bic_lookup(bfqd, current->io_context);
7341 +- if (bic == NULL)
7342 +- return 0;
7343 +-
7344 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7345 +- return bfqq == RQ_BFQQ(rq);
7346 +-}
7347 +-
7348 +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
7349 +- struct bfq_queue *bfqq)
7350 +-{
7351 +- if (bfqq != NULL) {
7352 +- bfq_mark_bfqq_must_alloc(bfqq);
7353 +- bfq_mark_bfqq_budget_new(bfqq);
7354 +- bfq_clear_bfqq_fifo_expire(bfqq);
7355 +-
7356 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
7357 +-
7358 +- bfq_log_bfqq(bfqd, bfqq,
7359 +- "set_in_service_queue, cur-budget = %lu",
7360 +- bfqq->entity.budget);
7361 +- }
7362 +-
7363 +- bfqd->in_service_queue = bfqq;
7364 +-}
7365 +-
7366 +-/*
7367 +- * Get and set a new queue for service.
7368 +- */
7369 +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
7370 +- struct bfq_queue *bfqq)
7371 +-{
7372 +- if (!bfqq)
7373 +- bfqq = bfq_get_next_queue(bfqd);
7374 ++ if (request)
7375 ++ return blk_rq_pos(io_struct);
7376 + else
7377 +- bfq_get_next_queue_forced(bfqd, bfqq);
7378 +-
7379 +- __bfq_set_in_service_queue(bfqd, bfqq);
7380 +- return bfqq;
7381 ++ return ((struct bio *)io_struct)->bi_iter.bi_sector;
7382 + }
7383 +
7384 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
7385 +- struct request *rq)
7386 ++static inline sector_t bfq_dist_from(sector_t pos1,
7387 ++ sector_t pos2)
7388 + {
7389 +- if (blk_rq_pos(rq) >= bfqd->last_position)
7390 +- return blk_rq_pos(rq) - bfqd->last_position;
7391 ++ if (pos1 >= pos2)
7392 ++ return pos1 - pos2;
7393 + else
7394 +- return bfqd->last_position - blk_rq_pos(rq);
7395 ++ return pos2 - pos1;
7396 + }
7397 +
7398 +-/*
7399 +- * Return true if bfqq has no request pending and rq is close enough to
7400 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
7401 +- * bfqq->next_rq
7402 +- */
7403 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
7404 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
7405 ++ sector_t sector)
7406 + {
7407 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
7408 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
7409 ++ BFQQ_SEEK_THR;
7410 + }
7411 +
7412 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7413 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
7414 + {
7415 + struct rb_root *root = &bfqd->rq_pos_tree;
7416 + struct rb_node *parent, *node;
7417 + struct bfq_queue *__bfqq;
7418 +- sector_t sector = bfqd->last_position;
7419 +
7420 + if (RB_EMPTY_ROOT(root))
7421 + return NULL;
7422 +@@ -1269,7 +1276,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7423 + * next_request position).
7424 + */
7425 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7426 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
7427 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
7428 + return __bfqq;
7429 +
7430 + if (blk_rq_pos(__bfqq->next_rq) < sector)
7431 +@@ -1280,7 +1287,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7432 + return NULL;
7433 +
7434 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
7435 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
7436 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
7437 + return __bfqq;
7438 +
7439 + return NULL;
7440 +@@ -1289,14 +1296,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7441 + /*
7442 + * bfqd - obvious
7443 + * cur_bfqq - passed in so that we don't decide that the current queue
7444 +- * is closely cooperating with itself.
7445 +- *
7446 +- * We are assuming that cur_bfqq has dispatched at least one request,
7447 +- * and that bfqd->last_position reflects a position on the disk associated
7448 +- * with the I/O issued by cur_bfqq.
7449 ++ * is closely cooperating with itself
7450 ++ * sector - used as a reference point to search for a close queue
7451 + */
7452 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
7453 +- struct bfq_queue *cur_bfqq)
7454 ++ struct bfq_queue *cur_bfqq,
7455 ++ sector_t sector)
7456 + {
7457 + struct bfq_queue *bfqq;
7458 +
7459 +@@ -1316,7 +1321,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
7460 + * working closely on the same area of the disk. In that case,
7461 + * we can group them together and don't waste time idling.
7462 + */
7463 +- bfqq = bfqq_close(bfqd);
7464 ++ bfqq = bfqq_close(bfqd, sector);
7465 + if (bfqq == NULL || bfqq == cur_bfqq)
7466 + return NULL;
7467 +
7468 +@@ -1343,6 +1348,315 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
7469 + return bfqq;
7470 + }
7471 +
7472 ++static struct bfq_queue *
7473 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
7474 ++{
7475 ++ int process_refs, new_process_refs;
7476 ++ struct bfq_queue *__bfqq;
7477 ++
7478 ++ /*
7479 ++ * If there are no process references on the new_bfqq, then it is
7480 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
7481 ++ * may have dropped their last reference (not just their last process
7482 ++ * reference).
7483 ++ */
7484 ++ if (!bfqq_process_refs(new_bfqq))
7485 ++ return NULL;
7486 ++
7487 ++ /* Avoid a circular list and skip interim queue merges. */
7488 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
7489 ++ if (__bfqq == bfqq)
7490 ++ return NULL;
7491 ++ new_bfqq = __bfqq;
7492 ++ }
7493 ++
7494 ++ process_refs = bfqq_process_refs(bfqq);
7495 ++ new_process_refs = bfqq_process_refs(new_bfqq);
7496 ++ /*
7497 ++ * If the process for the bfqq has gone away, there is no
7498 ++ * sense in merging the queues.
7499 ++ */
7500 ++ if (process_refs == 0 || new_process_refs == 0)
7501 ++ return NULL;
7502 ++
7503 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
7504 ++ new_bfqq->pid);
7505 ++
7506 ++ /*
7507 ++ * Merging is just a redirection: the requests of the process
7508 ++ * owning one of the two queues are redirected to the other queue.
7509 ++ * The latter queue, in its turn, is set as shared if this is the
7510 ++ * first time that the requests of some process are redirected to
7511 ++ * it.
7512 ++ *
7513 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
7514 ++ * are in the context of the process owning bfqq, hence we have
7515 ++ * the io_cq of this process. So we can immediately configure this
7516 ++ * io_cq to redirect the requests of the process to new_bfqq.
7517 ++ *
7518 ++ * NOTE, even if new_bfqq coincides with the in-service queue, the
7519 ++ * io_cq of new_bfqq is not available, because, if the in-service
7520 ++ * queue is shared, bfqd->in_service_bic may not point to the
7521 ++ * io_cq of the in-service queue.
7522 ++ * Redirecting the requests of the process owning bfqq to the
7523 ++ * currently in-service queue is in any case the best option, as
7524 ++ * we feed the in-service queue with new requests close to the
7525 ++ * last request served and, by doing so, hopefully increase the
7526 ++ * throughput.
7527 ++ */
7528 ++ bfqq->new_bfqq = new_bfqq;
7529 ++ atomic_add(process_refs, &new_bfqq->ref);
7530 ++ return new_bfqq;
7531 ++}
7532 ++
7533 ++/*
7534 ++ * Attempt to schedule a merge of bfqq with the currently in-service queue
7535 ++ * or with a close queue among the scheduled queues.
7536 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
7537 ++ * structure otherwise.
7538 ++ *
7539 ++ * The OOM queue is not allowed to participate in cooperation: in fact, since
7540 ++ * the requests temporarily redirected to the OOM queue could be redirected
7541 ++ * again to dedicated queues at any time, the state needed to correctly
7542 ++ * handle merging with the OOM queue would be quite complex and expensive
7543 ++ * to maintain. Besides, in such a critical condition as an out of memory,
7544 ++ * the benefits of queue merging may be of little relevance, or even negligible.
7545 ++ */
7546 ++static struct bfq_queue *
7547 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
7548 ++ void *io_struct, bool request)
7549 ++{
7550 ++ struct bfq_queue *in_service_bfqq, *new_bfqq;
7551 ++
7552 ++ if (bfqq->new_bfqq)
7553 ++ return bfqq->new_bfqq;
7554 ++
7555 ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
7556 ++ return NULL;
7557 ++
7558 ++ in_service_bfqq = bfqd->in_service_queue;
7559 ++
7560 ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
7561 ++ !bfqd->in_service_bic ||
7562 ++ unlikely(in_service_bfqq == &bfqd->oom_bfqq))
7563 ++ goto check_scheduled;
7564 ++
7565 ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
7566 ++ goto check_scheduled;
7567 ++
7568 ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
7569 ++ goto check_scheduled;
7570 ++
7571 ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
7572 ++ goto check_scheduled;
7573 ++
7574 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
7575 ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
7576 ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
7577 ++ if (new_bfqq != NULL)
7578 ++ return new_bfqq; /* Merge with in-service queue */
7579 ++ }
7580 ++
7581 ++ /*
7582 ++ * Check whether there is a cooperator among currently scheduled
7583 ++ * queues. The only thing we need is that the bio/request is not
7584 ++ * NULL, as we need it to establish whether a cooperator exists.
7585 ++ */
7586 ++check_scheduled:
7587 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
7588 ++ bfq_io_struct_pos(io_struct, request));
7589 ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))
7590 ++ return bfq_setup_merge(bfqq, new_bfqq);
7591 ++
7592 ++ return NULL;
7593 ++}
7594 ++
7595 ++static inline void
7596 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
7597 ++{
7598 ++ /*
7599 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
7600 ++ * have already been redirected to a shared queue; both idle window
7601 ++ * and weight raising state have already been saved. Do nothing.
7602 ++ */
7603 ++ if (bfqq->bic == NULL)
7604 ++ return;
7605 ++ if (bfqq->bic->wr_time_left)
7606 ++ /*
7607 ++ * This is the queue of a just-started process, and would
7608 ++ * deserve weight raising: we set wr_time_left to the full
7609 ++ * weight-raising duration to trigger weight-raising when
7610 ++ * and if the queue is split and the first request of the
7611 ++ * queue is enqueued.
7612 ++ */
7613 ++ bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);
7614 ++ else if (bfqq->wr_coeff > 1) {
7615 ++ unsigned long wr_duration =
7616 ++ jiffies - bfqq->last_wr_start_finish;
7617 ++ /*
7618 ++ * It may happen that a queue's weight raising period lasts
7619 ++ * longer than its wr_cur_max_time, as weight raising is
7620 ++ * handled only when a request is enqueued or dispatched (it
7621 ++ * does not use any timer). If the weight raising period is
7622 ++ * about to end, don't save it.
7623 ++ */
7624 ++ if (bfqq->wr_cur_max_time <= wr_duration)
7625 ++ bfqq->bic->wr_time_left = 0;
7626 ++ else
7627 ++ bfqq->bic->wr_time_left =
7628 ++ bfqq->wr_cur_max_time - wr_duration;
7629 ++ /*
7630 ++ * The bfq_queue is becoming shared or the requests of the
7631 ++ * process owning the queue are being redirected to a shared
7632 ++ * queue. Stop the weight raising period of the queue, as in
7633 ++ * both cases it should not be owned by an interactive or
7634 ++ * soft real-time application.
7635 ++ */
7636 ++ bfq_bfqq_end_wr(bfqq);
7637 ++ } else
7638 ++ bfqq->bic->wr_time_left = 0;
7639 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
7640 ++ bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
7641 ++ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
7642 ++ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
7643 ++ bfqq->bic->cooperations++;
7644 ++ bfqq->bic->failed_cooperations = 0;
7645 ++}
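
The saved value computed in bfq_bfqq_save_state() above boils down to simple arithmetic: the remaining weight-raising time is wr_cur_max_time minus the time elapsed since last_wr_start_finish, clamped at zero. A minimal standalone sketch of that computation follows; it is not part of the patch, and jiffies and the bfqq fields are replaced by plain placeholder parameters.

#include <assert.h>

/*
 * Sketch only: remaining weight-raising time, clamped at zero.
 * 'now' and 'wr_start' stand in for jiffies timestamps, 'wr_max'
 * for bfqq->wr_cur_max_time.
 */
static unsigned long remaining_wr_time(unsigned long now,
                                       unsigned long wr_start,
                                       unsigned long wr_max)
{
        unsigned long elapsed = now - wr_start;

        /* Raising period already over (or about to end): nothing left. */
        if (wr_max <= elapsed)
                return 0;
        return wr_max - elapsed;
}

int main(void)
{
        /* 3000 ticks into a 7500-tick period: 4500 ticks left. */
        assert(remaining_wr_time(13000, 10000, 7500) == 4500);
        /* Period exceeded: nothing to save. */
        assert(remaining_wr_time(20000, 10000, 7500) == 0);
        return 0;
}
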
7646 ++
7647 ++static inline void
7648 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
7649 ++{
7650 ++ /*
7651 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
7652 ++ * is about to begin using a shared bfq_queue.
7653 ++ */
7654 ++ if (bfqq->bic)
7655 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
7656 ++}
7657 ++
7658 ++static void
7659 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
7660 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
7661 ++{
7662 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
7663 ++ (long unsigned)new_bfqq->pid);
7664 ++ /* Save weight raising and idle window of the merged queues */
7665 ++ bfq_bfqq_save_state(bfqq);
7666 ++ bfq_bfqq_save_state(new_bfqq);
7667 ++ if (bfq_bfqq_IO_bound(bfqq))
7668 ++ bfq_mark_bfqq_IO_bound(new_bfqq);
7669 ++ bfq_clear_bfqq_IO_bound(bfqq);
7670 ++ /*
7671 ++ * Grab a reference to the bic, to prevent it from being destroyed
7672 ++ * before being possibly touched by a bfq_split_bfqq().
7673 ++ */
7674 ++ bfq_get_bic_reference(bfqq);
7675 ++ bfq_get_bic_reference(new_bfqq);
7676 ++ /*
7677 ++ * Merge queues (that is, let bic redirect its requests to new_bfqq)
7678 ++ */
7679 ++ bic_set_bfqq(bic, new_bfqq, 1);
7680 ++ bfq_mark_bfqq_coop(new_bfqq);
7681 ++ /*
7682 ++ * new_bfqq now belongs to at least two bics (it is a shared queue):
7683 ++ * set new_bfqq->bic to NULL. bfqq either:
7684 ++ * - does not belong to any bic any more, and hence bfqq->bic must
7685 ++ * be set to NULL, or
7686 ++ * - is a queue whose owning bics have already been redirected to a
7687 ++ * different queue, hence the queue is destined to not belong to
7688 ++ * any bic soon and bfqq->bic is already NULL (therefore the next
7689 ++ * assignment causes no harm).
7690 ++ */
7691 ++ new_bfqq->bic = NULL;
7692 ++ bfqq->bic = NULL;
7693 ++ bfq_put_queue(bfqq);
7694 ++}
7695 ++
7696 ++static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
7697 ++{
7698 ++ struct bfq_io_cq *bic = bfqq->bic;
7699 ++ struct bfq_data *bfqd = bfqq->bfqd;
7700 ++
7701 ++ if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {
7702 ++ bic->failed_cooperations++;
7703 ++ if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)
7704 ++ bic->cooperations = 0;
7705 ++ }
7706 ++}
7707 ++
7708 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
7709 ++ struct bio *bio)
7710 ++{
7711 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7712 ++ struct bfq_io_cq *bic;
7713 ++ struct bfq_queue *bfqq, *new_bfqq;
7714 ++
7715 ++ /*
7716 ++ * Disallow merge of a sync bio into an async request.
7717 ++ */
7718 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
7719 ++ return 0;
7720 ++
7721 ++ /*
7722 ++ * Look up the bfqq that this bio will be queued with. Allow
7723 ++ * merge only if rq is queued there.
7724 ++ * Queue lock is held here.
7725 ++ */
7726 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
7727 ++ if (bic == NULL)
7728 ++ return 0;
7729 ++
7730 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7731 ++ /*
7732 ++ * We take advantage of this function to perform an early merge
7733 ++ * of the queues of possibly cooperating processes.
7734 ++ */
7735 ++ if (bfqq != NULL) {
7736 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
7737 ++ if (new_bfqq != NULL) {
7738 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
7739 ++ /*
7740 ++ * If we get here, the bio will be queued in the
7741 ++ * shared queue, i.e., new_bfqq, so use new_bfqq
7742 ++ * to decide whether bio and rq can be merged.
7743 ++ */
7744 ++ bfqq = new_bfqq;
7745 ++ } else
7746 ++ bfq_bfqq_increase_failed_cooperations(bfqq);
7747 ++ }
7748 ++
7749 ++ return bfqq == RQ_BFQQ(rq);
7750 ++}
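
In short, bfq_allow_merge() above permits the merge only when the bio would end up in the same bfq_queue that the request already belongs to; with EQM, the bio's queue may first be redirected to a shared queue, and the comparison is then made against that shared queue. A tiny sketch of that decision, with illustrative types and names that are not from the patch:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for a bfq_queue pointer. */
struct queue { int id; };

/*
 * Sketch: if an early merge redirected the bio's queue to a shared
 * queue, compare against the shared queue; otherwise compare against
 * the bio's original queue.
 */
static bool allow_merge(struct queue *bio_q, struct queue *shared_q,
                        struct queue *rq_q)
{
        struct queue *effective = shared_q ? shared_q : bio_q;

        return effective == rq_q;
}

int main(void)
{
        struct queue a = { 1 }, b = { 2 }, shared = { 3 };

        printf("%d\n", allow_merge(&a, NULL, &b));         /* 0: different queues    */
        printf("%d\n", allow_merge(&a, &shared, &shared)); /* 1: both end up shared  */
        return 0;
}
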
7751 ++
7752 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
7753 ++ struct bfq_queue *bfqq)
7754 ++{
7755 ++ if (bfqq != NULL) {
7756 ++ bfq_mark_bfqq_must_alloc(bfqq);
7757 ++ bfq_mark_bfqq_budget_new(bfqq);
7758 ++ bfq_clear_bfqq_fifo_expire(bfqq);
7759 ++
7760 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
7761 ++
7762 ++ bfq_log_bfqq(bfqd, bfqq,
7763 ++ "set_in_service_queue, cur-budget = %lu",
7764 ++ bfqq->entity.budget);
7765 ++ }
7766 ++
7767 ++ bfqd->in_service_queue = bfqq;
7768 ++}
7769 ++
7770 ++/*
7771 ++ * Get and set a new queue for service.
7772 ++ */
7773 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
7774 ++{
7775 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
7776 ++
7777 ++ __bfq_set_in_service_queue(bfqd, bfqq);
7778 ++ return bfqq;
7779 ++}
7780 ++
7781 + /*
7782 + * If enough samples have been computed, return the current max budget
7783 + * stored in bfqd, which is dynamically updated according to the
7784 +@@ -1488,61 +1802,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
7785 + return rq;
7786 + }
7787 +
7788 +-/* Must be called with the queue_lock held. */
7789 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
7790 +-{
7791 +- int process_refs, io_refs;
7792 +-
7793 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
7794 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
7795 +- BUG_ON(process_refs < 0);
7796 +- return process_refs;
7797 +-}
7798 +-
7799 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
7800 +-{
7801 +- int process_refs, new_process_refs;
7802 +- struct bfq_queue *__bfqq;
7803 +-
7804 +- /*
7805 +- * If there are no process references on the new_bfqq, then it is
7806 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
7807 +- * may have dropped their last reference (not just their last process
7808 +- * reference).
7809 +- */
7810 +- if (!bfqq_process_refs(new_bfqq))
7811 +- return;
7812 +-
7813 +- /* Avoid a circular list and skip interim queue merges. */
7814 +- while ((__bfqq = new_bfqq->new_bfqq)) {
7815 +- if (__bfqq == bfqq)
7816 +- return;
7817 +- new_bfqq = __bfqq;
7818 +- }
7819 +-
7820 +- process_refs = bfqq_process_refs(bfqq);
7821 +- new_process_refs = bfqq_process_refs(new_bfqq);
7822 +- /*
7823 +- * If the process for the bfqq has gone away, there is no
7824 +- * sense in merging the queues.
7825 +- */
7826 +- if (process_refs == 0 || new_process_refs == 0)
7827 +- return;
7828 +-
7829 +- /*
7830 +- * Merge in the direction of the lesser amount of work.
7831 +- */
7832 +- if (new_process_refs >= process_refs) {
7833 +- bfqq->new_bfqq = new_bfqq;
7834 +- atomic_add(process_refs, &new_bfqq->ref);
7835 +- } else {
7836 +- new_bfqq->new_bfqq = bfqq;
7837 +- atomic_add(new_process_refs, &bfqq->ref);
7838 +- }
7839 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
7840 +- new_bfqq->pid);
7841 +-}
7842 +-
7843 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
7844 + {
7845 + struct bfq_entity *entity = &bfqq->entity;
7846 +@@ -2269,7 +2528,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
7847 + */
7848 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
7849 + {
7850 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
7851 ++ struct bfq_queue *bfqq;
7852 + struct request *next_rq;
7853 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
7854 +
7855 +@@ -2279,17 +2538,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
7856 +
7857 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
7858 +
7859 +- /*
7860 +- * If another queue has a request waiting within our mean seek
7861 +- * distance, let it run. The expire code will check for close
7862 +- * cooperators and put the close queue at the front of the
7863 +- * service tree. If possible, merge the expiring queue with the
7864 +- * new bfqq.
7865 +- */
7866 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
7867 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
7868 +- bfq_setup_merge(bfqq, new_bfqq);
7869 +-
7870 + if (bfq_may_expire_for_budg_timeout(bfqq) &&
7871 + !timer_pending(&bfqd->idle_slice_timer) &&
7872 + !bfq_bfqq_must_idle(bfqq))
7873 +@@ -2328,10 +2576,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
7874 + bfq_clear_bfqq_wait_request(bfqq);
7875 + del_timer(&bfqd->idle_slice_timer);
7876 + }
7877 +- if (new_bfqq == NULL)
7878 +- goto keep_queue;
7879 +- else
7880 +- goto expire;
7881 ++ goto keep_queue;
7882 + }
7883 + }
7884 +
7885 +@@ -2340,40 +2585,30 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
7886 + * for a new request, or has requests waiting for a completion and
7887 + * may idle after their completion, then keep it anyway.
7888 + */
7889 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
7890 +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
7891 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
7892 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
7893 + bfqq = NULL;
7894 + goto keep_queue;
7895 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
7896 +- /*
7897 +- * Expiring the queue because there is a close cooperator,
7898 +- * cancel timer.
7899 +- */
7900 +- bfq_clear_bfqq_wait_request(bfqq);
7901 +- del_timer(&bfqd->idle_slice_timer);
7902 + }
7903 +
7904 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
7905 + expire:
7906 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
7907 + new_queue:
7908 +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
7909 ++ bfqq = bfq_set_in_service_queue(bfqd);
7910 + bfq_log(bfqd, "select_queue: new queue %d returned",
7911 + bfqq != NULL ? bfqq->pid : 0);
7912 + keep_queue:
7913 + return bfqq;
7914 + }
7915 +
7916 +-static void bfq_update_wr_data(struct bfq_data *bfqd,
7917 +- struct bfq_queue *bfqq)
7918 ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7919 + {
7920 +- if (bfqq->wr_coeff > 1) { /* queue is being boosted */
7921 +- struct bfq_entity *entity = &bfqq->entity;
7922 +-
7923 ++ struct bfq_entity *entity = &bfqq->entity;
7924 ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
7925 + bfq_log_bfqq(bfqd, bfqq,
7926 + "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
7927 +- jiffies_to_msecs(jiffies -
7928 +- bfqq->last_wr_start_finish),
7929 ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
7930 + jiffies_to_msecs(bfqq->wr_cur_max_time),
7931 + bfqq->wr_coeff,
7932 + bfqq->entity.weight, bfqq->entity.orig_weight);
7933 +@@ -2382,12 +2617,16 @@ static void bfq_update_wr_data(struct bfq_data *bfqd,
7934 + entity->orig_weight * bfqq->wr_coeff);
7935 + if (entity->ioprio_changed)
7936 + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
7937 ++
7938 + /*
7939 + * If the queue was activated in a burst, or
7940 + * too much time has elapsed from the beginning
7941 +- * of this weight-raising, then end weight raising.
7942 ++ * of this weight-raising period, or the queue has
7943 ++ * exceeded the acceptable number of cooperations,
7944 ++ * then end weight raising.
7945 + */
7946 + if (bfq_bfqq_in_large_burst(bfqq) ||
7947 ++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
7948 + time_is_before_jiffies(bfqq->last_wr_start_finish +
7949 + bfqq->wr_cur_max_time)) {
7950 + bfqq->last_wr_start_finish = jiffies;
7951 +@@ -2396,11 +2635,13 @@ static void bfq_update_wr_data(struct bfq_data *bfqd,
7952 + bfqq->last_wr_start_finish,
7953 + jiffies_to_msecs(bfqq->wr_cur_max_time));
7954 + bfq_bfqq_end_wr(bfqq);
7955 +- __bfq_entity_update_weight_prio(
7956 +- bfq_entity_service_tree(entity),
7957 +- entity);
7958 + }
7959 + }
7960 ++ /* Update weight both if it must be raised and if it must be lowered */
7961 ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
7962 ++ __bfq_entity_update_weight_prio(
7963 ++ bfq_entity_service_tree(entity),
7964 ++ entity);
7965 + }
7966 +
7967 + /*
7968 +@@ -2647,6 +2888,25 @@ static inline void bfq_init_icq(struct io_cq *icq)
7969 + struct bfq_io_cq *bic = icq_to_bic(icq);
7970 +
7971 + bic->ttime.last_end_request = jiffies;
7972 ++ /*
7973 ++ * A newly created bic indicates that the process has just
7974 ++ * started doing I/O, and is probably mapping its executable and
7975 ++ * libraries into memory: it definitely needs weight raising.
7976 ++ * There is however the possibility that the process performs,
7977 ++ * for a while, I/O close to some other process. EQM intercepts
7978 ++ * this behavior and may merge the queue corresponding to the
7979 ++ * process with some other queue, BEFORE the weight of the queue
7980 ++ * is raised. Merged queues are not weight-raised (they are assumed
7981 ++ * to belong to processes that benefit only from high throughput).
7982 ++ * If the merge is basically the consequence of an accident, then
7983 ++ * the queue will be split soon and will get back its old weight.
7984 ++ * It is then important to record somewhere that this queue does
7985 ++ * need weight raising, even if it did not get its weight raised
7986 ++ * before being merged. To this end, we overload the field
7987 ++ * wr_time_left and assign 1 to it, to mark the queue as needing
7988 ++ * weight raising.
7989 ++ */
7990 ++ bic->wr_time_left = 1;
7991 + }
7992 +
7993 + static void bfq_exit_icq(struct io_cq *icq)
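
The value 1 assigned in bfq_init_icq() above is only a sentinel: any non-zero wr_time_left means "this queue deserves weight raising", and bfq_bfqq_save_state() later promotes it to a full weight-raising duration if the queue gets merged before raising actually starts. A minimal sketch of that sentinel pattern; the constant below is a placeholder for bfq_wr_duration(), not a value from the patch.

#include <stdio.h>

/* Placeholder standing in for bfq_wr_duration(bfqd). */
#define FULL_WR_DURATION 7500UL

/*
 * Sketch: a non-zero wr_time_left marks a queue as deserving weight
 * raising; on save, the mark is turned into a full raising duration.
 */
static unsigned long saved_wr_time(unsigned long wr_time_left)
{
        if (wr_time_left)       /* 1 from bic creation, or leftover time */
                return FULL_WR_DURATION;
        return 0;               /* raising already started and consumed  */
}

int main(void)
{
        printf("%lu\n", saved_wr_time(1));  /* 7500: newly created bic   */
        printf("%lu\n", saved_wr_time(0));  /* 0: no raising to restore  */
        return 0;
}
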
7994 +@@ -2660,6 +2920,13 @@ static void bfq_exit_icq(struct io_cq *icq)
7995 + }
7996 +
7997 + if (bic->bfqq[BLK_RW_SYNC]) {
7998 ++ /*
7999 ++ * If the bic is using a shared queue, put the reference
8000 ++ * taken on the io_context when the bic started using a
8001 ++ * shared bfq_queue.
8002 ++ */
8003 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
8004 ++ put_io_context(icq->ioc);
8005 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
8006 + bic->bfqq[BLK_RW_SYNC] = NULL;
8007 + }
8008 +@@ -2952,6 +3219,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
8009 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
8010 + return;
8011 +
8012 ++ /* Idle window just restored, statistics are meaningless. */
8013 ++ if (bfq_bfqq_just_split(bfqq))
8014 ++ return;
8015 ++
8016 + enable_idle = bfq_bfqq_idle_window(bfqq);
8017 +
8018 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
8019 +@@ -2999,6 +3270,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8020 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
8021 + !BFQQ_SEEKY(bfqq))
8022 + bfq_update_idle_window(bfqd, bfqq, bic);
8023 ++ bfq_clear_bfqq_just_split(bfqq);
8024 +
8025 + bfq_log_bfqq(bfqd, bfqq,
8026 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
8027 +@@ -3059,12 +3331,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8028 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
8029 + {
8030 + struct bfq_data *bfqd = q->elevator->elevator_data;
8031 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
8032 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
8033 +
8034 + assert_spin_locked(bfqd->queue->queue_lock);
8035 +
8036 ++ /*
8037 ++ * An unplug may trigger a requeue of a request from the device
8038 ++ * driver: make sure we are in process context while trying to
8039 ++ * merge two bfq_queues.
8040 ++ */
8041 ++ if (!in_interrupt()) {
8042 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
8043 ++ if (new_bfqq != NULL) {
8044 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
8045 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
8046 ++ /*
8047 ++ * Release the request's reference to the old bfqq
8048 ++ * and make sure one is taken on the shared queue.
8049 ++ */
8050 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
8051 ++ bfqq->allocated[rq_data_dir(rq)]--;
8052 ++ atomic_inc(&new_bfqq->ref);
8053 ++ bfq_put_queue(bfqq);
8054 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
8055 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
8056 ++ bfqq, new_bfqq);
8057 ++ rq->elv.priv[1] = new_bfqq;
8058 ++ bfqq = new_bfqq;
8059 ++ } else
8060 ++ bfq_bfqq_increase_failed_cooperations(bfqq);
8061 ++ }
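
The bookkeeping above simply moves the request's accounting from the old queue to the shared one: the allocation counter for the request's direction is transferred, a reference is taken on the new queue and the one held on the old queue is dropped. A self-contained sketch of that transfer, with a plain struct standing in for bfq_queue (names are illustrative only):

#include <assert.h>

/* Minimal stand-in for the per-queue bookkeeping touched above. */
struct q {
        int allocated[2];       /* [READ], [WRITE] */
        int ref;
};

/*
 * Sketch: move one request of direction 'dir' from 'oldq' to 'newq',
 * keeping both the allocation counters and the queue references
 * balanced.
 */
static void move_request(struct q *oldq, struct q *newq, int dir)
{
        newq->allocated[dir]++;
        oldq->allocated[dir]--;
        newq->ref++;            /* reference now held through the request */
        oldq->ref--;            /* reference previously held is dropped   */
}

int main(void)
{
        struct q a = { { 1, 0 }, 2 }, b = { { 0, 0 }, 1 };

        move_request(&a, &b, 0);
        assert(a.allocated[0] == 0 && b.allocated[0] == 1);
        assert(a.ref == 1 && b.ref == 2);
        return 0;
}
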
8062 ++
8063 + bfq_add_request(rq);
8064 +
8065 ++ /*
8066 ++ * Here a newly-created bfq_queue has already started a weight-raising
8067 ++ * period: clear wr_time_left to prevent bfq_bfqq_save_state()
8068 ++ * from assigning it a full weight-raising period. See the detailed
8069 ++ * comments about this field in bfq_init_icq().
8070 ++ */
8071 ++ if (bfqq->bic != NULL)
8072 ++ bfqq->bic->wr_time_left = 0;
8073 + rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
8074 + list_add_tail(&rq->queuelist, &bfqq->fifo);
8075 +
8076 +@@ -3226,18 +3533,6 @@ static void bfq_put_request(struct request *rq)
8077 + }
8078 + }
8079 +
8080 +-static struct bfq_queue *
8081 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
8082 +- struct bfq_queue *bfqq)
8083 +-{
8084 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
8085 +- (long unsigned)bfqq->new_bfqq->pid);
8086 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
8087 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
8088 +- bfq_put_queue(bfqq);
8089 +- return bic_to_bfqq(bic, 1);
8090 +-}
8091 +-
8092 + /*
8093 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
8094 + * was the last process referring to said bfqq.
8095 +@@ -3246,6 +3541,9 @@ static struct bfq_queue *
8096 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
8097 + {
8098 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
8099 ++
8100 ++ put_io_context(bic->icq.ioc);
8101 ++
8102 + if (bfqq_process_refs(bfqq) == 1) {
8103 + bfqq->pid = current->pid;
8104 + bfq_clear_bfqq_coop(bfqq);
8105 +@@ -3274,6 +3572,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
8106 + struct bfq_queue *bfqq;
8107 + struct bfq_group *bfqg;
8108 + unsigned long flags;
8109 ++ bool split = false;
8110 +
8111 + might_sleep_if(gfp_mask & __GFP_WAIT);
8112 +
8113 +@@ -3291,25 +3590,26 @@ new_queue:
8114 + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
8115 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
8116 + bic_set_bfqq(bic, bfqq, is_sync);
8117 ++ if (split && is_sync) {
8118 ++ if ((bic->was_in_burst_list && bfqd->large_burst) ||
8119 ++ bic->saved_in_large_burst)
8120 ++ bfq_mark_bfqq_in_large_burst(bfqq);
8121 ++ else {
8122 ++ bfq_clear_bfqq_in_large_burst(bfqq);
8123 ++ if (bic->was_in_burst_list)
8124 ++ hlist_add_head(&bfqq->burst_list_node,
8125 ++ &bfqd->burst_list);
8126 ++ }
8127 ++ }
8128 + } else {
8129 +- /*
8130 +- * If the queue was seeky for too long, break it apart.
8131 +- */
8132 ++ /* If the queue was seeky for too long, break it apart. */
8133 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
8134 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
8135 + bfqq = bfq_split_bfqq(bic, bfqq);
8136 ++ split = true;
8137 + if (!bfqq)
8138 + goto new_queue;
8139 + }
8140 +-
8141 +- /*
8142 +- * Check to see if this queue is scheduled to merge with
8143 +- * another closely cooperating queue. The merging of queues
8144 +- * happens here as it must be done in process context.
8145 +- * The reference on new_bfqq was taken in merge_bfqqs.
8146 +- */
8147 +- if (bfqq->new_bfqq != NULL)
8148 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
8149 + }
8150 +
8151 + bfqq->allocated[rw]++;
8152 +@@ -3320,6 +3620,26 @@ new_queue:
8153 + rq->elv.priv[0] = bic;
8154 + rq->elv.priv[1] = bfqq;
8155 +
8156 ++ /*
8157 ++ * If a bfq_queue has only one process reference, it is owned
8158 ++ * by only one bfq_io_cq: we can set the bic field of the
8159 ++ * bfq_queue to the address of that structure. Also, if the
8160 ++ * queue has just been split, mark a flag so that the
8161 ++ * information is available to the other scheduler hooks.
8162 ++ */
8163 ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
8164 ++ bfqq->bic = bic;
8165 ++ if (split) {
8166 ++ bfq_mark_bfqq_just_split(bfqq);
8167 ++ /*
8168 ++ * If the queue has just been split from a shared
8169 ++ * queue, restore the idle window and the possible
8170 ++ * weight raising period.
8171 ++ */
8172 ++ bfq_bfqq_resume_state(bfqq, bic);
8173 ++ }
8174 ++ }
8175 ++
8176 + spin_unlock_irqrestore(q->queue_lock, flags);
8177 +
8178 + return 0;
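
When a queue that had been merged is split again and ends up with a single process reference, the code above reattaches the bic and restores the per-process state saved at merge time: idle window, I/O-bound flag, burst membership, and the remaining weight-raising time, which (as the bfq.h comments below describe) is used as the duration of a new raising period. A compact sketch of what such a restore might look like; the structs and the coefficient value are placeholders, not the patch's own types.

#include <assert.h>
#include <stdbool.h>

/* Illustrative saved per-process state, loosely mirroring bfq_io_cq. */
struct snapshot {
        bool idle_window;
        bool io_bound;
        unsigned long wr_time_left;
};

/* Illustrative queue state touched by a merge/split cycle. */
struct queue {
        bool idle_window;
        bool io_bound;
        unsigned int wr_coeff;
        unsigned long wr_cur_max_time;
};

/*
 * Sketch: on split, flags are restored and, if some weight-raising
 * time was left at merge time, a new raising period of that duration
 * is started (the wr_coeff value is a placeholder).
 */
static void resume_state(struct queue *q, struct snapshot *s)
{
        q->idle_window = s->idle_window;
        q->io_bound = s->io_bound;
        if (s->wr_time_left) {
                q->wr_coeff = 20;               /* placeholder coefficient */
                q->wr_cur_max_time = s->wr_time_left;
                s->wr_time_left = 0;            /* snapshot is consumed    */
        }
}

int main(void)
{
        struct snapshot s = { true, true, 4500 };
        struct queue q = { false, false, 1, 0 };

        resume_state(&q, &s);
        assert(q.idle_window && q.wr_cur_max_time == 4500 && s.wr_time_left == 0);
        return 0;
}
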
8179 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
8180 +index c343099..d0890c6 100644
8181 +--- a/block/bfq-sched.c
8182 ++++ b/block/bfq-sched.c
8183 +@@ -1085,34 +1085,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
8184 + return bfqq;
8185 + }
8186 +
8187 +-/*
8188 +- * Forced extraction of the given queue.
8189 +- */
8190 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
8191 +- struct bfq_queue *bfqq)
8192 +-{
8193 +- struct bfq_entity *entity;
8194 +- struct bfq_sched_data *sd;
8195 +-
8196 +- BUG_ON(bfqd->in_service_queue != NULL);
8197 +-
8198 +- entity = &bfqq->entity;
8199 +- /*
8200 +- * Bubble up extraction/update from the leaf to the root.
8201 +- */
8202 +- for_each_entity(entity) {
8203 +- sd = entity->sched_data;
8204 +- bfq_update_budget(entity);
8205 +- bfq_update_vtime(bfq_entity_service_tree(entity));
8206 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
8207 +- sd->in_service_entity = entity;
8208 +- sd->next_in_service = NULL;
8209 +- entity->service = 0;
8210 +- }
8211 +-
8212 +- return;
8213 +-}
8214 +-
8215 + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
8216 + {
8217 + if (bfqd->in_service_bic != NULL) {
8218 +diff --git a/block/bfq.h b/block/bfq.h
8219 +index e350b5f..93d3f6e 100644
8220 +--- a/block/bfq.h
8221 ++++ b/block/bfq.h
8222 +@@ -218,18 +218,21 @@ struct bfq_group;
8223 + * idle @bfq_queue with no outstanding requests, then
8224 + * the task associated with the queue it is deemed as
8225 + * soft real-time (see the comments to the function
8226 +- * bfq_bfqq_softrt_next_start()).
8227 ++ * bfq_bfqq_softrt_next_start())
8228 + * @last_idle_bklogged: time of the last transition of the @bfq_queue from
8229 + * idle to backlogged
8230 + * @service_from_backlogged: cumulative service received from the @bfq_queue
8231 + * since the last transition from idle to
8232 + * backlogged
8233 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
8234 ++ * queue is shared
8235 + *
8236 +- * A bfq_queue is a leaf request queue; it can be associated with an io_context
8237 +- * or more, if it is async or shared between cooperating processes. @cgroup
8238 +- * holds a reference to the cgroup, to be sure that it does not disappear while
8239 +- * a bfqq still references it (mostly to avoid races between request issuing and
8240 +- * task migration followed by cgroup destruction).
8241 ++ * A bfq_queue is a leaf request queue; it can be associated with one
8242 ++ * or more io_contexts, if it is async or shared between cooperating
8243 ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it
8244 ++ * does not disappear while a bfqq still references it (mostly to avoid
8245 ++ * races between request issuing and task migration followed by cgroup
8246 ++ * destruction).
8247 + * All the fields are protected by the queue lock of the containing bfqd.
8248 + */
8249 + struct bfq_queue {
8250 +@@ -269,6 +272,7 @@ struct bfq_queue {
8251 + unsigned int requests_within_timer;
8252 +
8253 + pid_t pid;
8254 ++ struct bfq_io_cq *bic;
8255 +
8256 + /* weight-raising fields */
8257 + unsigned long wr_cur_max_time;
8258 +@@ -298,12 +302,42 @@ struct bfq_ttime {
8259 + * @icq: associated io_cq structure
8260 + * @bfqq: array of two process queues, the sync and the async
8261 + * @ttime: associated @bfq_ttime struct
8262 ++ * @wr_time_left: snapshot of the time left before weight raising ends
8263 ++ * for the sync queue associated with this process; this
8264 ++ * snapshot is taken to remember this value while the weight
8265 ++ * raising is suspended because the queue is merged with a
8266 ++ * shared queue, and is used to set @wr_cur_max_time
8267 ++ * when the queue is split from the shared queue and its
8268 ++ * weight is raised again
8269 ++ * @saved_idle_window: same purpose as the previous field for the idle
8270 ++ * window
8271 ++ * @saved_IO_bound: same purpose as the previous two fields for the I/O
8272 ++ * bound classification of a queue
8273 ++ * @saved_in_large_burst: same purpose as the previous fields for the
8274 ++ * value of the field keeping the queue's belonging
8275 ++ * to a large burst
8276 ++ * @was_in_burst_list: true if the queue belonged to a burst list
8277 ++ * before its merge with another cooperating queue
8278 ++ * @cooperations: counter of consecutive successful queue merges undergone
8279 ++ * by any of the process' @bfq_queues
8280 ++ * @failed_cooperations: counter of consecutive failed queue merges of any
8281 ++ * of the process' @bfq_queues
8282 + */
8283 + struct bfq_io_cq {
8284 + struct io_cq icq; /* must be the first member */
8285 + struct bfq_queue *bfqq[2];
8286 + struct bfq_ttime ttime;
8287 + int ioprio;
8288 ++
8289 ++ unsigned int wr_time_left;
8290 ++ bool saved_idle_window;
8291 ++ bool saved_IO_bound;
8292 ++
8293 ++ bool saved_in_large_burst;
8294 ++ bool was_in_burst_list;
8295 ++
8296 ++ unsigned int cooperations;
8297 ++ unsigned int failed_cooperations;
8298 + };
8299 +
8300 + enum bfq_device_speed {
8301 +@@ -536,7 +570,7 @@ enum bfqq_state_flags {
8302 + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
8303 + BFQ_BFQQ_FLAG_sync, /* synchronous queue */
8304 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
8305 +- BFQ_BFQQ_FLAG_IO_bound, /*
8306 ++ BFQ_BFQQ_FLAG_IO_bound, /*
8307 + * bfqq has timed-out at least once
8308 + * having consumed at most 2/10 of
8309 + * its budget
8310 +@@ -549,12 +583,13 @@ enum bfqq_state_flags {
8311 + * bfqq has proved to be slow and
8312 + * seeky until budget timeout
8313 + */
8314 +- BFQ_BFQQ_FLAG_softrt_update, /*
8315 ++ BFQ_BFQQ_FLAG_softrt_update, /*
8316 + * may need softrt-next-start
8317 + * update
8318 + */
8319 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
8320 +- BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
8321 ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
8322 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
8323 + };
8324 +
8325 + #define BFQ_BFQQ_FNS(name) \
8326 +@@ -583,6 +618,7 @@ BFQ_BFQQ_FNS(in_large_burst);
8327 + BFQ_BFQQ_FNS(constantly_seeky);
8328 + BFQ_BFQQ_FNS(coop);
8329 + BFQ_BFQQ_FNS(split_coop);
8330 ++BFQ_BFQQ_FNS(just_split);
8331 + BFQ_BFQQ_FNS(softrt_update);
8332 + #undef BFQ_BFQQ_FNS
8333 +
8334 +--
8335 +1.9.1
8336 +