Gentoo Archives: gentoo-commits

From: "Mike Pagano (mpagano)" <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2727 - genpatches-2.6/trunk/3.14
Date: Tue, 01 Apr 2014 18:44:29
Message-Id: 20140401184421.1E8F52005E@flycatcher.gentoo.org
1 Author: mpagano
2 Date: 2014-04-01 18:44:20 +0000 (Tue, 01 Apr 2014)
3 New Revision: 2727
4
5 Added:
6 genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch1
7 Removed:
8 genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch
9 Modified:
10 genpatches-2.6/trunk/3.14/0000_README
11 Log:
12 Fix BFQ patch name to correct PATCH_DEPTH
13
14 Modified: genpatches-2.6/trunk/3.14/0000_README
15 ===================================================================
16 --- genpatches-2.6/trunk/3.14/0000_README 2014-04-01 12:40:19 UTC (rev 2726)
17 +++ genpatches-2.6/trunk/3.14/0000_README 2014-04-01 18:44:20 UTC (rev 2727)
18 @@ -86,7 +86,7 @@
19 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
20 Desc: BFQ v7r2 patch 1 for 3.14: Build, cgroups and kconfig bits
21
22 -Patch: 5002_BFQ-2-block-introduce-the-v7r2-I-O-sched-for-3.14.patch
23 +Patch: 5002_BFQ-2-block-introduce-the-v7r2-I-O-sched-for-3.14.patch1
24 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
25 Desc: BFQ v7r2 patch 2 for 3.14: BFQ Scheduler
26
27
28 Deleted: genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch
29 ===================================================================
30 --- genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch 2014-04-01 12:40:19 UTC (rev 2726)
31 +++ genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch 2014-04-01 18:44:20 UTC (rev 2727)
32 @@ -1,6065 +0,0 @@
33 -From 5055277df59d9280da6b60cf90bed8e5e57dc44d Mon Sep 17 00:00:00 2001
34 -From: Paolo Valente <paolo.valente@×××××××.it>
35 -Date: Thu, 9 May 2013 19:10:02 +0200
36 -Subject: [PATCH 2/3] block: introduce the BFQ-v7r2 I/O sched for 3.14
37 -
38 -Add the BFQ-v7r2 I/O scheduler to 3.14.
39 -The general structure is borrowed from CFQ, as is much of the code for
40 -handling I/O contexts. Over time, several useful features have been
41 -ported from CFQ as well (details in the changelog in README.BFQ). A
42 -(bfq_)queue is associated with each task doing I/O on a device, and each
43 -time a scheduling decision has to be made, a queue is selected and served
44 -until it expires.
45 -
46 - - Slices are given in the service domain: tasks are assigned
47 - budgets, measured in number of sectors. Once it has been granted
48 - the disk, a task must still consume its assigned budget within a
49 - configurable maximum time (by default, the maximum possible value
50 - of the budgets is automatically computed to comply with this
51 - timeout). This allows the desired latency vs "throughput boosting"
52 - tradeoff to be set.
53 -
54 - - Budgets are scheduled according to a variant of WF2Q+, implemented
55 - using an augmented rb-tree to take eligibility into account while
56 - preserving an O(log N) overall complexity.
57 -
58 - - A low-latency tunable is provided; if enabled, both interactive
59 - and soft real-time applications are guaranteed a very low latency.
60 -
61 - - Latency guarantees are preserved also in the presence of NCQ.
62 -
63 - - Also with flash-based devices, a high throughput is achieved
64 - while still preserving latency guarantees.
65 -
66 - - BFQ features Early Queue Merge (EQM), a sort of fusion of the
67 - cooperating-queue-merging and the preemption mechanisms present
68 - in CFQ. EQM is in fact a unified mechanism that tries to get a
69 - sequential read pattern, and hence a high throughput, with any
70 - set of processes performing interleaved I/O over a contiguous
71 - sequence of sectors.
72 -
73 - - BFQ supports full hierarchical scheduling, exporting a cgroups
74 - interface. Since each node has a full scheduler, each group can
75 - be assigned its own weight.
76 -
77 - - If the cgroups interface is not used, only I/O priorities can be
78 - assigned to processes, with ioprio values mapped to weights
79 - with the relation weight = IOPRIO_BE_NR - ioprio (see the sketch below).
80 -
81 - - ioprio classes are served in strict priority order, i.e., lower
82 - priority queues are not served as long as there are higher
83 - priority queues. Among queues in the same class the bandwidth is
84 - distributed in proportion to the weight of each queue. A small
85 - amount of extra bandwidth is nevertheless guaranteed to the Idle
86 - class, to prevent it from starving.
87 -
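A minimal sketch of the ioprio-to-weight relation mentioned in the list above (an editor's illustration, not part of the patch). IOPRIO_BE_NR is the kernel constant from include/linux/ioprio.h (currently 8); the patch itself uses the equivalent helper bfq_ioprio_to_weight() in bfq-cgroup.c below.

#include <linux/ioprio.h>

static inline int ioprio_to_weight_sketch(int ioprio)
{
	/* ioprio 0 (highest priority) -> weight 8, ioprio 7 (lowest) -> weight 1 */
	return IOPRIO_BE_NR - ioprio;
}
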
88 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
89 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
90 ----
91 - block/bfq-cgroup.c | 926 +++++++++++++++
92 - block/bfq-ioc.c | 36 +
93 - block/bfq-iosched.c | 3300 +++++++++++++++++++++++++++++++++++++++++++++++++++
94 - block/bfq-sched.c | 1078 +++++++++++++++++
95 - block/bfq.h | 622 ++++++++++
96 - 5 files changed, 5962 insertions(+)
97 - create mode 100644 block/bfq-cgroup.c
98 - create mode 100644 block/bfq-ioc.c
99 - create mode 100644 block/bfq-iosched.c
100 - create mode 100644 block/bfq-sched.c
101 - create mode 100644 block/bfq.h
102 -
103 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
104 -new file mode 100644
105 -index 0000000..bcecdb4
106 ---- /dev/null
107 -+++ b/block/bfq-cgroup.c
108 -@@ -0,0 +1,926 @@
109 -+/*
110 -+ * BFQ: CGROUPS support.
111 -+ *
112 -+ * Based on ideas and code from CFQ:
113 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
114 -+ *
115 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
116 -+ * Paolo Valente <paolo.valente@×××××××.it>
117 -+ *
118 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
119 -+ *
120 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
121 -+ */
122 -+
123 -+#ifdef CONFIG_CGROUP_BFQIO
124 -+
125 -+static DEFINE_MUTEX(bfqio_mutex);
126 -+
127 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
128 -+{
129 -+ return bgrp ? !bgrp->online : false;
130 -+}
131 -+
132 -+static struct bfqio_cgroup bfqio_root_cgroup = {
133 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
134 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
135 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
136 -+};
137 -+
138 -+static inline void bfq_init_entity(struct bfq_entity *entity,
139 -+ struct bfq_group *bfqg)
140 -+{
141 -+ entity->weight = entity->new_weight;
142 -+ entity->orig_weight = entity->new_weight;
143 -+ entity->ioprio = entity->new_ioprio;
144 -+ entity->ioprio_class = entity->new_ioprio_class;
145 -+ entity->parent = bfqg->my_entity;
146 -+ entity->sched_data = &bfqg->sched_data;
147 -+}
148 -+
149 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
150 -+{
151 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
152 -+}
153 -+
154 -+/*
155 -+ * Search bgrp's hash table (currently just a list) for the bfq_group
156 -+ * associated with bfqd. Must be called under rcu_read_lock().
157 -+ */
158 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
159 -+ struct bfq_data *bfqd)
160 -+{
161 -+ struct bfq_group *bfqg;
162 -+ void *key;
163 -+
164 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
165 -+ key = rcu_dereference(bfqg->bfqd);
166 -+ if (key == bfqd)
167 -+ return bfqg;
168 -+ }
169 -+
170 -+ return NULL;
171 -+}
172 -+
173 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
174 -+ struct bfq_group *bfqg)
175 -+{
176 -+ struct bfq_entity *entity = &bfqg->entity;
177 -+
178 -+ /*
179 -+ * If the weight of the entity has never been set via the sysfs
180 -+ * interface, then bgrp->weight == 0. In this case we initialize
181 -+ * the weight from the current ioprio value. Otherwise, the group
182 -+ * weight, if set, has priority over the ioprio value.
183 -+ */
184 -+ if (bgrp->weight == 0) {
185 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
186 -+ entity->new_ioprio = bgrp->ioprio;
187 -+ } else {
188 -+ entity->new_weight = bgrp->weight;
189 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
190 -+ }
191 -+ entity->orig_weight = entity->weight = entity->new_weight;
192 -+ entity->ioprio = entity->new_ioprio;
193 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
194 -+ entity->my_sched_data = &bfqg->sched_data;
195 -+}
196 -+
197 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
198 -+ struct bfq_group *parent)
199 -+{
200 -+ struct bfq_entity *entity;
201 -+
202 -+ BUG_ON(parent == NULL);
203 -+ BUG_ON(bfqg == NULL);
204 -+
205 -+ entity = &bfqg->entity;
206 -+ entity->parent = parent->my_entity;
207 -+ entity->sched_data = &parent->sched_data;
208 -+}
209 -+
210 -+/**
211 -+ * bfq_group_chain_alloc - allocate a chain of groups.
212 -+ * @bfqd: queue descriptor.
213 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
214 -+ *
215 -+ * Allocate a chain of groups starting from the one belonging to
216 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
217 -+ * to the root already has an allocated group on @bfqd.
218 -+ */
219 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
220 -+ struct cgroup_subsys_state *css)
221 -+{
222 -+ struct bfqio_cgroup *bgrp;
223 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
224 -+
225 -+ for (; css != NULL; css = css->parent) {
226 -+ bgrp = css_to_bfqio(css);
227 -+
228 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
229 -+ if (bfqg != NULL) {
230 -+ /*
231 -+ * All the cgroups in the path from there to the
232 -+ * root must have a bfq_group for bfqd, so we don't
233 -+ * need any more allocations.
234 -+ */
235 -+ break;
236 -+ }
237 -+
238 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
239 -+ if (bfqg == NULL)
240 -+ goto cleanup;
241 -+
242 -+ bfq_group_init_entity(bgrp, bfqg);
243 -+ bfqg->my_entity = &bfqg->entity;
244 -+
245 -+ if (leaf == NULL) {
246 -+ leaf = bfqg;
247 -+ prev = leaf;
248 -+ } else {
249 -+ bfq_group_set_parent(prev, bfqg);
250 -+ /*
251 -+ * Build a list of allocated nodes using the bfqd
252 -+ * field, which is still unused and will be initialized
253 -+ * only after the node is connected.
254 -+ */
255 -+ prev->bfqd = bfqg;
256 -+ prev = bfqg;
257 -+ }
258 -+ }
259 -+
260 -+ return leaf;
261 -+
262 -+cleanup:
263 -+ while (leaf != NULL) {
264 -+ prev = leaf;
265 -+ leaf = leaf->bfqd;
266 -+ kfree(prev);
267 -+ }
268 -+
269 -+ return NULL;
270 -+}
271 -+
272 -+/**
273 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
274 -+ * @bfqd: the queue descriptor.
275 -+ * @css: the leaf cgroup_subsys_state to start from.
276 -+ * @leaf: the leaf group (to be associated to @cgroup).
277 -+ *
278 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
279 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
280 -+ * hierarchy that already has a group associated with @bfqd, all the nodes
281 -+ * in the path to the root cgroup have one too.
282 -+ *
283 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
284 -+ * per device) while the bfqio_cgroup lock protects the list of groups
285 -+ * belonging to the same cgroup.
286 -+ */
287 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
288 -+ struct cgroup_subsys_state *css,
289 -+ struct bfq_group *leaf)
290 -+{
291 -+ struct bfqio_cgroup *bgrp;
292 -+ struct bfq_group *bfqg, *next, *prev = NULL;
293 -+ unsigned long flags;
294 -+
295 -+ assert_spin_locked(bfqd->queue->queue_lock);
296 -+
297 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
298 -+ bgrp = css_to_bfqio(css);
299 -+ next = leaf->bfqd;
300 -+
301 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
302 -+ BUG_ON(bfqg != NULL);
303 -+
304 -+ spin_lock_irqsave(&bgrp->lock, flags);
305 -+
306 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
307 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
308 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
309 -+
310 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
311 -+
312 -+ prev = leaf;
313 -+ leaf = next;
314 -+ }
315 -+
316 -+ BUG_ON(css == NULL && leaf != NULL);
317 -+ if (css != NULL && prev != NULL) {
318 -+ bgrp = css_to_bfqio(css);
319 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
320 -+ bfq_group_set_parent(prev, bfqg);
321 -+ }
322 -+}
323 -+
324 -+/**
325 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
326 -+ * @bfqd: queue descriptor.
327 -+ * @cgroup: cgroup being searched for.
328 -+ *
329 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
330 -+ * necessary. When a group is returned all the cgroups in the path
331 -+ * to the root have a group associated to @bfqd.
332 -+ *
333 -+ * If the allocation fails, return the root group: this breaks guarantees
334 -+ * but is a safe fallback. If this loss becomes a problem it can be
335 -+ * mitigated using the equivalent weight (given by the product of the
336 -+ * weights of the groups in the path from @group to the root) in the
337 -+ * root scheduler.
338 -+ *
339 -+ * We allocate all the missing nodes in the path from the leaf cgroup
340 -+ * to the root and we connect the nodes only after all the allocations
341 -+ * have been successful.
342 -+ */
343 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
344 -+ struct cgroup_subsys_state *css)
345 -+{
346 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
347 -+ struct bfq_group *bfqg;
348 -+
349 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
350 -+ if (bfqg != NULL)
351 -+ return bfqg;
352 -+
353 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
354 -+ if (bfqg != NULL)
355 -+ bfq_group_chain_link(bfqd, css, bfqg);
356 -+ else
357 -+ bfqg = bfqd->root_group;
358 -+
359 -+ return bfqg;
360 -+}
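A short editor's sketch, not part of the patch, of the "equivalent weight" mentioned in the comment to bfq_find_alloc_group() above, i.e. the product of the weights of the groups on the path from a given group to the root:

static unsigned long equivalent_weight_sketch(struct bfq_group *bfqg)
{
	struct bfq_entity *entity;
	unsigned long w = 1;

	/* walk from the group's entity up to the root, multiplying weights */
	for (entity = &bfqg->entity; entity != NULL; entity = entity->parent)
		w *= entity->weight;

	return w;
}
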
361 -+
362 -+/**
363 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
364 -+ * @bfqd: queue descriptor.
365 -+ * @bfqq: the queue to move.
366 -+ * @entity: @bfqq's entity.
367 -+ * @bfqg: the group to move to.
368 -+ *
369 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
370 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
371 -+ *
372 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
373 -+ * not disappear (by now this just means that we are called under
374 -+ * rcu_read_lock()).
375 -+ */
376 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
377 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
378 -+{
379 -+ int busy, resume;
380 -+
381 -+ busy = bfq_bfqq_busy(bfqq);
382 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
383 -+
384 -+ BUG_ON(resume && !entity->on_st);
385 -+ BUG_ON(busy && !resume && entity->on_st &&
386 -+ bfqq != bfqd->in_service_queue);
387 -+
388 -+ if (busy) {
389 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
390 -+
391 -+ if (!resume)
392 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
393 -+ else
394 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
395 -+ } else if (entity->on_st)
396 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
397 -+
398 -+ /*
399 -+ * Here we use a reference to bfqg. We don't need a refcounter
400 -+ * as the cgroup reference will not be dropped, so that its
401 -+ * destroy() callback will not be invoked.
402 -+ */
403 -+ entity->parent = bfqg->my_entity;
404 -+ entity->sched_data = &bfqg->sched_data;
405 -+
406 -+ if (busy && resume)
407 -+ bfq_activate_bfqq(bfqd, bfqq);
408 -+
409 -+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
410 -+ bfq_schedule_dispatch(bfqd);
411 -+}
412 -+
413 -+/**
414 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
415 -+ * @bfqd: the queue descriptor.
416 -+ * @bic: the bic to move.
417 -+ * @cgroup: the cgroup to move to.
418 -+ *
419 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
420 -+ * has to make sure that the reference to cgroup is valid across the call.
421 -+ *
422 -+ * NOTE: an alternative approach might have been to store the current
423 -+ * cgroup in bfqq and get a reference to it, reducing the lookup
424 -+ * time here, at the price of slightly more complex code.
425 -+ */
426 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
427 -+ struct bfq_io_cq *bic,
428 -+ struct cgroup_subsys_state *css)
429 -+{
430 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
431 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
432 -+ struct bfq_entity *entity;
433 -+ struct bfq_group *bfqg;
434 -+ struct bfqio_cgroup *bgrp;
435 -+
436 -+ bgrp = css_to_bfqio(css);
437 -+
438 -+ bfqg = bfq_find_alloc_group(bfqd, css);
439 -+ if (async_bfqq != NULL) {
440 -+ entity = &async_bfqq->entity;
441 -+
442 -+ if (entity->sched_data != &bfqg->sched_data) {
443 -+ bic_set_bfqq(bic, NULL, 0);
444 -+ bfq_log_bfqq(bfqd, async_bfqq,
445 -+ "bic_change_group: %p %d",
446 -+ async_bfqq, atomic_read(&async_bfqq->ref));
447 -+ bfq_put_queue(async_bfqq);
448 -+ }
449 -+ }
450 -+
451 -+ if (sync_bfqq != NULL) {
452 -+ entity = &sync_bfqq->entity;
453 -+ if (entity->sched_data != &bfqg->sched_data)
454 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
455 -+ }
456 -+
457 -+ return bfqg;
458 -+}
459 -+
460 -+/**
461 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
462 -+ * @bic: the bic being migrated.
463 -+ * @cgroup: the destination cgroup.
464 -+ *
465 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
466 -+ * moved into its new parent group.
467 -+ */
468 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
469 -+ struct cgroup_subsys_state *css)
470 -+{
471 -+ struct bfq_data *bfqd;
472 -+ unsigned long uninitialized_var(flags);
473 -+
474 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
475 -+ &flags);
476 -+ if (bfqd != NULL) {
477 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
478 -+ bfq_put_bfqd_unlock(bfqd, &flags);
479 -+ }
480 -+}
481 -+
482 -+/**
483 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
484 -+ * @bic: the @bic to update.
485 -+ *
486 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
487 -+ * We need this in addition to moving bics during the cgroup attach
488 -+ * phase because the task owning @bic could be at its first disk
489 -+ * access or we may end up in the root cgroup as the result of a
490 -+ * memory allocation failure and here we try to move to the right
491 -+ * group.
492 -+ *
493 -+ * Must be called under the queue lock. It is safe to use the returned
494 -+ * value even after the rcu_read_unlock() as the migration/destruction
495 -+ * paths act under the queue lock too. IOW it is impossible to race with
496 -+ * group migration/destruction and end up with an invalid group as:
497 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
498 -+ * has started execution, as current holds a reference to it,
499 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
500 -+ * migrated to a different cgroup] its attach() callback will have
501 -+ * taken care of removing all the references to the old cgroup data.
502 -+ */
503 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
504 -+{
505 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
506 -+ struct bfq_group *bfqg;
507 -+ struct cgroup_subsys_state *css;
508 -+
509 -+ BUG_ON(bfqd == NULL);
510 -+
511 -+ rcu_read_lock();
512 -+ css = task_css(current, bfqio_subsys_id);
513 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
514 -+ rcu_read_unlock();
515 -+
516 -+ return bfqg;
517 -+}
518 -+
519 -+/**
520 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
521 -+ * @st: the service tree being flushed.
522 -+ */
523 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
524 -+{
525 -+ struct bfq_entity *entity = st->first_idle;
526 -+
527 -+ for (; entity != NULL; entity = st->first_idle)
528 -+ __bfq_deactivate_entity(entity, 0);
529 -+}
530 -+
531 -+/**
532 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
533 -+ * @bfqd: the device data structure with the root group.
534 -+ * @entity: the entity to move.
535 -+ */
536 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
537 -+ struct bfq_entity *entity)
538 -+{
539 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
540 -+
541 -+ BUG_ON(bfqq == NULL);
542 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
543 -+ return;
544 -+}
545 -+
546 -+/**
547 -+ * bfq_reparent_active_entities - move to the root group all active entities.
548 -+ * @bfqd: the device data structure with the root group.
549 -+ * @bfqg: the group to move from.
550 -+ * @st: the service tree with the entities.
551 -+ *
552 -+ * Needs queue_lock to be taken and reference to be valid over the call.
553 -+ */
554 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
555 -+ struct bfq_group *bfqg,
556 -+ struct bfq_service_tree *st)
557 -+{
558 -+ struct rb_root *active = &st->active;
559 -+ struct bfq_entity *entity = NULL;
560 -+
561 -+ if (!RB_EMPTY_ROOT(&st->active))
562 -+ entity = bfq_entity_of(rb_first(active));
563 -+
564 -+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
565 -+ bfq_reparent_leaf_entity(bfqd, entity);
566 -+
567 -+ if (bfqg->sched_data.in_service_entity != NULL)
568 -+ bfq_reparent_leaf_entity(bfqd,
569 -+ bfqg->sched_data.in_service_entity);
570 -+
571 -+ return;
572 -+}
573 -+
574 -+/**
575 -+ * bfq_destroy_group - destroy @bfqg.
576 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
577 -+ * @bfqg: the group being destroyed.
578 -+ *
579 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
580 -+ */
581 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
582 -+{
583 -+ struct bfq_data *bfqd;
584 -+ struct bfq_service_tree *st;
585 -+ struct bfq_entity *entity = bfqg->my_entity;
586 -+ unsigned long uninitialized_var(flags);
587 -+ int i;
588 -+
589 -+ hlist_del(&bfqg->group_node);
590 -+
591 -+ /*
592 -+ * Empty all service_trees belonging to this group before deactivating
593 -+ * the group itself.
594 -+ */
595 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
596 -+ st = bfqg->sched_data.service_tree + i;
597 -+
598 -+ /*
599 -+ * The idle tree may still contain bfq_queues belonging
600 -+ * to exited tasks because they never migrated to a different
601 -+ * cgroup from the one being destroyed now. No one else
602 -+ * can access them so it's safe to act without any lock.
603 -+ */
604 -+ bfq_flush_idle_tree(st);
605 -+
606 -+ /*
607 -+ * It may happen that some queues are still active
608 -+ * (busy) upon group destruction (if the corresponding
609 -+ * processes have been forced to terminate). We move
610 -+ * all the leaf entities corresponding to these queues
611 -+ * to the root_group.
612 -+ * Also, it may happen that the group has an entity
613 -+ * under service, which is disconnected from the active
614 -+ * tree: it must be moved, too.
615 -+ * There is no need to put the sync queues, as the
616 -+ * scheduler has taken no reference.
617 -+ */
618 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
619 -+ if (bfqd != NULL) {
620 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
621 -+ bfq_put_bfqd_unlock(bfqd, &flags);
622 -+ }
623 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
624 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
625 -+ }
626 -+ BUG_ON(bfqg->sched_data.next_in_service != NULL);
627 -+ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
628 -+
629 -+ /*
630 -+ * We may race with device destruction, take extra care when
631 -+ * dereferencing bfqg->bfqd.
632 -+ */
633 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
634 -+ if (bfqd != NULL) {
635 -+ hlist_del(&bfqg->bfqd_node);
636 -+ __bfq_deactivate_entity(entity, 0);
637 -+ bfq_put_async_queues(bfqd, bfqg);
638 -+ bfq_put_bfqd_unlock(bfqd, &flags);
639 -+ }
640 -+ BUG_ON(entity->tree != NULL);
641 -+
642 -+ /*
643 -+ * No need to defer the kfree() to the end of the RCU grace
644 -+ * period: we are called from the destroy() callback of our
645 -+ * cgroup, so we can be sure that no one is a) still using
646 -+ * this cgroup or b) doing lookups in it.
647 -+ */
648 -+ kfree(bfqg);
649 -+}
650 -+
651 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
652 -+{
653 -+ struct hlist_node *tmp;
654 -+ struct bfq_group *bfqg;
655 -+
656 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
657 -+ bfq_end_raising_async_queues(bfqd, bfqg);
658 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
659 -+}
660 -+
661 -+/**
662 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
663 -+ * @bfqd: the device descriptor being exited.
664 -+ *
665 -+ * When the device exits we just make sure that no lookup can return
666 -+ * the now unused group structures. They will be deallocated on cgroup
667 -+ * destruction.
668 -+ */
669 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
670 -+{
671 -+ struct hlist_node *tmp;
672 -+ struct bfq_group *bfqg;
673 -+
674 -+ bfq_log(bfqd, "disconnect_groups beginning");
675 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
676 -+ hlist_del(&bfqg->bfqd_node);
677 -+
678 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
679 -+
680 -+ /*
681 -+ * Don't remove from the group hash, just set an
682 -+ * invalid key. No lookups can race with the
683 -+ * assignment as bfqd is being destroyed; this
684 -+ * implies also that new elements cannot be added
685 -+ * to the list.
686 -+ */
687 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
688 -+
689 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
690 -+ bfqg);
691 -+ bfq_put_async_queues(bfqd, bfqg);
692 -+ }
693 -+}
694 -+
695 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
696 -+{
697 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
698 -+ struct bfq_group *bfqg = bfqd->root_group;
699 -+
700 -+ bfq_put_async_queues(bfqd, bfqg);
701 -+
702 -+ spin_lock_irq(&bgrp->lock);
703 -+ hlist_del_rcu(&bfqg->group_node);
704 -+ spin_unlock_irq(&bgrp->lock);
705 -+
706 -+ /*
707 -+ * No need to synchronize_rcu() here: since the device is gone
708 -+ * there cannot be any read-side access to its root_group.
709 -+ */
710 -+ kfree(bfqg);
711 -+}
712 -+
713 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
714 -+{
715 -+ struct bfq_group *bfqg;
716 -+ struct bfqio_cgroup *bgrp;
717 -+ int i;
718 -+
719 -+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
720 -+ if (bfqg == NULL)
721 -+ return NULL;
722 -+
723 -+ bfqg->entity.parent = NULL;
724 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
725 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
726 -+
727 -+ bgrp = &bfqio_root_cgroup;
728 -+ spin_lock_irq(&bgrp->lock);
729 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
730 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
731 -+ spin_unlock_irq(&bgrp->lock);
732 -+
733 -+ return bfqg;
734 -+}
735 -+
736 -+#define SHOW_FUNCTION(__VAR) \
737 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
738 -+ struct cftype *cftype) \
739 -+{ \
740 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
741 -+ u64 ret = -ENODEV; \
742 -+ \
743 -+ mutex_lock(&bfqio_mutex); \
744 -+ if (bfqio_is_removed(bgrp)) \
745 -+ goto out_unlock; \
746 -+ \
747 -+ spin_lock_irq(&bgrp->lock); \
748 -+ ret = bgrp->__VAR; \
749 -+ spin_unlock_irq(&bgrp->lock); \
750 -+ \
751 -+out_unlock: \
752 -+ mutex_unlock(&bfqio_mutex); \
753 -+ return ret; \
754 -+}
755 -+
756 -+SHOW_FUNCTION(weight);
757 -+SHOW_FUNCTION(ioprio);
758 -+SHOW_FUNCTION(ioprio_class);
759 -+#undef SHOW_FUNCTION
760 -+
761 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
762 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
763 -+ struct cftype *cftype, \
764 -+ u64 val) \
765 -+{ \
766 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
767 -+ struct bfq_group *bfqg; \
768 -+ int ret = -EINVAL; \
769 -+ \
770 -+ if (val < (__MIN) || val > (__MAX)) \
771 -+ return ret; \
772 -+ \
773 -+ ret = -ENODEV; \
774 -+ mutex_lock(&bfqio_mutex); \
775 -+ if (bfqio_is_removed(bgrp)) \
776 -+ goto out_unlock; \
777 -+ ret = 0; \
778 -+ \
779 -+ spin_lock_irq(&bgrp->lock); \
780 -+ bgrp->__VAR = (unsigned short)val; \
781 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
782 -+ /* \
783 -+ * Setting the ioprio_changed flag of the entity \
784 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
785 -+ * the value of the weight to its ioprio mapping. \
786 -+ * Set the flag only if necessary. \
787 -+ */ \
788 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
789 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
790 -+ /* \
791 -+ * Make sure that the above new value has been \
792 -+ * stored in bfqg->entity.new_##__VAR before \
793 -+ * setting the ioprio_changed flag. In fact, \
794 -+ * this flag may be read asynchronously (in \
795 -+ * critical sections protected by a different \
796 -+ * lock than that held here), and finding this \
797 -+ * flag set may cause the execution of the code \
798 -+ * for updating parameters whose value may \
799 -+ * depend also on bfqg->entity.new_##__VAR (in \
800 -+ * __bfq_entity_update_weight_prio). \
801 -+ * This barrier makes sure that the new value \
802 -+ * of bfqg->entity.new_##__VAR is correctly \
803 -+ * seen in that code. \
804 -+ */ \
805 -+ smp_wmb(); \
806 -+ bfqg->entity.ioprio_changed = 1; \
807 -+ } \
808 -+ } \
809 -+ spin_unlock_irq(&bgrp->lock); \
810 -+ \
811 -+out_unlock: \
812 -+ mutex_unlock(&bfqio_mutex); \
813 -+ return ret; \
814 -+}
815 -+
816 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
817 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
818 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
819 -+#undef STORE_FUNCTION
820 -+
821 -+static struct cftype bfqio_files[] = {
822 -+ {
823 -+ .name = "weight",
824 -+ .read_u64 = bfqio_cgroup_weight_read,
825 -+ .write_u64 = bfqio_cgroup_weight_write,
826 -+ },
827 -+ {
828 -+ .name = "ioprio",
829 -+ .read_u64 = bfqio_cgroup_ioprio_read,
830 -+ .write_u64 = bfqio_cgroup_ioprio_write,
831 -+ },
832 -+ {
833 -+ .name = "ioprio_class",
834 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
835 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
836 -+ },
837 -+ { }, /* terminate */
838 -+};
839 -+
840 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
841 -+ *parent_css)
842 -+{
843 -+ struct bfqio_cgroup *bgrp;
844 -+
845 -+ if (parent_css != NULL) {
846 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
847 -+ if (bgrp == NULL)
848 -+ return ERR_PTR(-ENOMEM);
849 -+ } else
850 -+ bgrp = &bfqio_root_cgroup;
851 -+
852 -+ spin_lock_init(&bgrp->lock);
853 -+ INIT_HLIST_HEAD(&bgrp->group_data);
854 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
855 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
856 -+
857 -+ return &bgrp->css;
858 -+}
859 -+
860 -+/*
861 -+ * We cannot support shared io contexts, as we have no means to support
862 -+ * two tasks with the same ioc in two different groups without major rework
863 -+ * of the main bic/bfqq data structures. For now we allow a task to change
864 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
865 -+ * behavior is that a group containing a task that forked using CLONE_IO
866 -+ * will not be destroyed until the tasks sharing the ioc die.
867 -+ */
868 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
869 -+ struct cgroup_taskset *tset)
870 -+{
871 -+ struct task_struct *task;
872 -+ struct io_context *ioc;
873 -+ int ret = 0;
874 -+
875 -+ cgroup_taskset_for_each(task, css, tset) {
876 -+ /*
877 -+ * task_lock() is needed to avoid races with
878 -+ * exit_io_context()
879 -+ */
880 -+ task_lock(task);
881 -+ ioc = task->io_context;
882 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
883 -+ /*
884 -+ * ioc == NULL means that the task is either too young
885 -+ * or exiting: if it still has no ioc, the ioc can't be
886 -+ * shared, if the task is exiting the attach will fail
887 -+ * anyway, no matter what we return here.
888 -+ */
889 -+ ret = -EINVAL;
890 -+ task_unlock(task);
891 -+ if (ret)
892 -+ break;
893 -+ }
894 -+
895 -+ return ret;
896 -+}
897 -+
898 -+static void bfqio_attach(struct cgroup_subsys_state *css,
899 -+ struct cgroup_taskset *tset)
900 -+{
901 -+ struct task_struct *task;
902 -+ struct io_context *ioc;
903 -+ struct io_cq *icq;
904 -+
905 -+ /*
906 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
907 -+ * new group has not yet been tested.
908 -+ */
909 -+ cgroup_taskset_for_each(task, css, tset) {
910 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
911 -+ if (ioc) {
912 -+ /*
913 -+ * Handle cgroup change here.
914 -+ */
915 -+ rcu_read_lock();
916 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
917 -+ if (!strncmp(
918 -+ icq->q->elevator->type->elevator_name,
919 -+ "bfq", ELV_NAME_MAX))
920 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
921 -+ css);
922 -+ rcu_read_unlock();
923 -+ put_io_context(ioc);
924 -+ }
925 -+ }
926 -+}
927 -+
928 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
929 -+{
930 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
931 -+ struct hlist_node *tmp;
932 -+ struct bfq_group *bfqg;
933 -+
934 -+ /*
935 -+ * Since we are destroying the cgroup, there are no more tasks
936 -+ * referencing it, and all the RCU grace periods that may have
937 -+ * referenced it are ended (as the destruction of the parent
938 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
939 -+ * anything else and we don't need any synchronization.
940 -+ */
941 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
942 -+ bfq_destroy_group(bgrp, bfqg);
943 -+
944 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
945 -+
946 -+ kfree(bgrp);
947 -+}
948 -+
949 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
950 -+{
951 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
952 -+
953 -+ mutex_lock(&bfqio_mutex);
954 -+ bgrp->online = true;
955 -+ mutex_unlock(&bfqio_mutex);
956 -+
957 -+ return 0;
958 -+}
959 -+
960 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
961 -+{
962 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
963 -+
964 -+ mutex_lock(&bfqio_mutex);
965 -+ bgrp->online = false;
966 -+ mutex_unlock(&bfqio_mutex);
967 -+}
968 -+
969 -+struct cgroup_subsys bfqio_subsys = {
970 -+ .name = "bfqio",
971 -+ .css_alloc = bfqio_create,
972 -+ .css_online = bfqio_css_online,
973 -+ .css_offline = bfqio_css_offline,
974 -+ .can_attach = bfqio_can_attach,
975 -+ .attach = bfqio_attach,
976 -+ .css_free = bfqio_destroy,
977 -+ .subsys_id = bfqio_subsys_id,
978 -+ .base_cftypes = bfqio_files,
979 -+};
980 -+#else
981 -+static inline void bfq_init_entity(struct bfq_entity *entity,
982 -+ struct bfq_group *bfqg)
983 -+{
984 -+ entity->weight = entity->new_weight;
985 -+ entity->orig_weight = entity->new_weight;
986 -+ entity->ioprio = entity->new_ioprio;
987 -+ entity->ioprio_class = entity->new_ioprio_class;
988 -+ entity->sched_data = &bfqg->sched_data;
989 -+}
990 -+
991 -+static inline struct bfq_group *
992 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
993 -+{
994 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
995 -+ return bfqd->root_group;
996 -+}
997 -+
998 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
999 -+ struct bfq_queue *bfqq,
1000 -+ struct bfq_entity *entity,
1001 -+ struct bfq_group *bfqg)
1002 -+{
1003 -+}
1004 -+
1005 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
1006 -+{
1007 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1008 -+}
1009 -+
1010 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1011 -+{
1012 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
1013 -+}
1014 -+
1015 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
1016 -+{
1017 -+ kfree(bfqd->root_group);
1018 -+}
1019 -+
1020 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1021 -+{
1022 -+ struct bfq_group *bfqg;
1023 -+ int i;
1024 -+
1025 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1026 -+ if (bfqg == NULL)
1027 -+ return NULL;
1028 -+
1029 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1030 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1031 -+
1032 -+ return bfqg;
1033 -+}
1034 -+#endif
1035 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1036 -new file mode 100644
1037 -index 0000000..7f6b000
1038 ---- /dev/null
1039 -+++ b/block/bfq-ioc.c
1040 -@@ -0,0 +1,36 @@
1041 -+/*
1042 -+ * BFQ: I/O context handling.
1043 -+ *
1044 -+ * Based on ideas and code from CFQ:
1045 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1046 -+ *
1047 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1048 -+ * Paolo Valente <paolo.valente@×××××××.it>
1049 -+ *
1050 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1051 -+ */
1052 -+
1053 -+/**
1054 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1055 -+ * @icq: the iocontext queue.
1056 -+ */
1057 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1058 -+{
1059 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
1060 -+ return container_of(icq, struct bfq_io_cq, icq);
1061 -+}
1062 -+
1063 -+/**
1064 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1065 -+ * @bfqd: the lookup key.
1066 -+ * @ioc: the io_context of the process doing I/O.
1067 -+ *
1068 -+ * Queue lock must be held.
1069 -+ */
1070 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1071 -+ struct io_context *ioc)
1072 -+{
1073 -+ if (ioc)
1074 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1075 -+ return NULL;
1076 -+}
1077 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1078 -new file mode 100644
1079 -index 0000000..f5f71e4
1080 ---- /dev/null
1081 -+++ b/block/bfq-iosched.c
1082 -@@ -0,0 +1,3300 @@
1083 -+/*
1084 -+ * Budget Fair Queueing (BFQ) disk scheduler.
1085 -+ *
1086 -+ * Based on ideas and code from CFQ:
1087 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1088 -+ *
1089 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1090 -+ * Paolo Valente <paolo.valente@×××××××.it>
1091 -+ *
1092 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1093 -+ *
1094 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1095 -+ *
1096 -+ * BFQ is a proportional share disk scheduling algorithm based on the
1097 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
1098 -+ * number of sectors, to tasks instead of time slices. The disk is not granted
1099 -+ * to the in-service task for a given time slice, but until it has exhausted
1100 -+ * its assigned budget. This change from the time to the service domain allows
1101 -+ * BFQ to distribute the disk bandwidth among tasks as desired, without any
1102 -+ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
1103 -+ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
1104 -+ * their budgets (more precisely BFQ schedules queues associated to tasks).
1105 -+ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
1106 -+ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
1107 -+ * latencies to interactive and soft real-time applications.
1108 -+ *
1109 -+ * BFQ is described in [1], which also contains a reference to the initial,
1110 -+ * more theoretical paper on BFQ. In the latter paper the interested reader
1111 -+ * can find full details on the main algorithm, as well as formulas for the
1112 -+ * guarantees and formal proofs of all the properties. With respect to
1113 -+ * the version of BFQ presented in these papers, this implementation adds a
1114 -+ * few more heuristics, such as the one that guarantees a low latency to soft
1115 -+ * real-time applications, and a hierarchical extension based on H-WF2Q+.
1116 -+ *
1117 -+ * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
1118 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1119 -+ * complexity derives from the one introduced with EEVDF in [3].
1120 -+ *
1121 -+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1122 -+ * with the BFQ Disk I/O Scheduler'',
1123 -+ * Proceedings of the 5th Annual International Systems and Storage
1124 -+ * Conference (SYSTOR '12), June 2012.
1125 -+ *
1126 -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1127 -+ *
1128 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1129 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1130 -+ * Oct 1997.
1131 -+ *
1132 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1133 -+ *
1134 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1135 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
1136 -+ * Resource Allocation,'' technical report.
1137 -+ *
1138 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1139 -+ */
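To make the proportional-share behaviour described above concrete (an editor's illustration, not taken from the patch): with continuously backlogged queues, each queue's long-run share of the disk bandwidth is its weight divided by the total weight of the backlogged queues, so two queues with weights 100 and 200 receive roughly 1/3 and 2/3 of the throughput.

/* share_i = w_i / sum_j w_j, expressed here in per-mille */
static inline unsigned int share_permille_sketch(unsigned int weight,
						 unsigned int total_weight)
{
	return 1000u * weight / total_weight;	/* 100/300 -> 333, 200/300 -> 666 */
}
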
1140 -+#include <linux/module.h>
1141 -+#include <linux/slab.h>
1142 -+#include <linux/blkdev.h>
1143 -+#include <linux/cgroup.h>
1144 -+#include <linux/elevator.h>
1145 -+#include <linux/jiffies.h>
1146 -+#include <linux/rbtree.h>
1147 -+#include <linux/ioprio.h>
1148 -+#include "bfq.h"
1149 -+#include "blk.h"
1150 -+
1151 -+/* Max number of dispatches in one round of service. */
1152 -+static const int bfq_quantum = 4;
1153 -+
1154 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1155 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1156 -+
1157 -+/* Maximum backwards seek, in KiB. */
1158 -+static const int bfq_back_max = 16 * 1024;
1159 -+
1160 -+/* Penalty of a backwards seek, in number of sectors. */
1161 -+static const int bfq_back_penalty = 2;
1162 -+
1163 -+/* Idling period duration, in jiffies. */
1164 -+static int bfq_slice_idle = HZ / 125;
1165 -+
1166 -+/* Default maximum budget values, in sectors and number of requests. */
1167 -+static const int bfq_default_max_budget = 16 * 1024;
1168 -+static const int bfq_max_budget_async_rq = 4;
1169 -+
1170 -+/*
1171 -+ * Async to sync throughput distribution is controlled as follows:
1172 -+ * when an async request is served, the entity is charged the number
1173 -+ * of sectors of the request, multiplied by the factor below
1174 -+ */
1175 -+static const int bfq_async_charge_factor = 10;
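An editor's sketch, not part of the patch, of the charging rule just described; the actual implementation is bfq_serv_to_charge() further down, which also waives the surcharge for weight-raised queues:

static inline unsigned long charge_sketch(unsigned long sectors, bool sync)
{
	/* async requests are charged an extra bfq_async_charge_factor times
	 * their size, so sync I/O ends up with a proportionally larger share */
	return sectors * (1 + (sync ? 0 : bfq_async_charge_factor));
}
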
1176 -+
1177 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
1178 -+static const int bfq_timeout_sync = HZ / 8;
1179 -+static int bfq_timeout_async = HZ / 25;
1180 -+
1181 -+struct kmem_cache *bfq_pool;
1182 -+
1183 -+/* Below this threshold (in ms), we consider thinktime immediate. */
1184 -+#define BFQ_MIN_TT 2
1185 -+
1186 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
1187 -+#define BFQ_HW_QUEUE_THRESHOLD 4
1188 -+#define BFQ_HW_QUEUE_SAMPLES 32
1189 -+
1190 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1191 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1192 -+
1193 -+/* Min samples used for peak rate estimation (for autotuning). */
1194 -+#define BFQ_PEAK_RATE_SAMPLES 32
1195 -+
1196 -+/* Shift used for peak rate fixed precision calculations. */
1197 -+#define BFQ_RATE_SHIFT 16
1198 -+
1199 -+/*
1200 -+ * The duration of the weight raising for interactive applications is
1201 -+ * computed automatically (as default behaviour), using the following
1202 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
1203 -+ * disk, and R and T are two reference parameters. In particular, R is
1204 -+ * the peak rate of a reference disk, and T is about the maximum time
1205 -+ * for starting popular large applications on that disk, under BFQ and
1206 -+ * while reading two files in parallel. Finally, BFQ uses two
1207 -+ * different pairs (R, T) depending on whether the disk is rotational
1208 -+ * or non-rotational.
1209 -+ */
1210 -+#define T_rot (msecs_to_jiffies(5500))
1211 -+#define T_nonrot (msecs_to_jiffies(2000))
1212 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1213 -+#define R_rot 17415
1214 -+#define R_nonrot 34791
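An editor's sketch, not from the patch, of the automatic duration formula described above, duration = (R / r) * T. In the patch the product R * T is precomputed into bfqd->RT_prod and the division by the measured peak rate is performed in bfq_wrais_duration() below; R and r are kept in the same fixed-point units (see BFQ_RATE_SHIFT), so the shift cancels in the ratio.

static inline unsigned long wrais_duration_sketch(u64 ref_rate_R,
						  unsigned long ref_time_T,
						  u32 peak_rate_r)
{
	u64 dur = ref_rate_R * ref_time_T;	/* R * T, i.e. RT_prod */

	do_div(dur, peak_rate_r);		/* duration = (R * T) / r */
	return dur;				/* in jiffies */
}
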
1215 -+
1216 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1217 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1218 -+
1219 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1220 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1221 -+
1222 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1223 -+
1224 -+#include "bfq-ioc.c"
1225 -+#include "bfq-sched.c"
1226 -+#include "bfq-cgroup.c"
1227 -+
1228 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1229 -+ IOPRIO_CLASS_IDLE)
1230 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1231 -+ IOPRIO_CLASS_RT)
1232 -+
1233 -+#define bfq_sample_valid(samples) ((samples) > 80)
1234 -+
1235 -+/*
1236 -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1237 -+ * set (in which case it could also be a direct WRITE).
1238 -+ */
1239 -+static inline int bfq_bio_sync(struct bio *bio)
1240 -+{
1241 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1242 -+ return 1;
1243 -+
1244 -+ return 0;
1245 -+}
1246 -+
1247 -+/*
1248 -+ * Scheduler run of queue, if there are requests pending and no one in the
1249 -+ * driver that will restart queueing.
1250 -+ */
1251 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1252 -+{
1253 -+ if (bfqd->queued != 0) {
1254 -+ bfq_log(bfqd, "schedule dispatch");
1255 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1256 -+ }
1257 -+}
1258 -+
1259 -+/*
1260 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1261 -+ * We choose the request that is closest to the head right now. Distance
1262 -+ * behind the head is penalized and only allowed to a certain extent.
1263 -+ */
1264 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
1265 -+ struct request *rq1,
1266 -+ struct request *rq2,
1267 -+ sector_t last)
1268 -+{
1269 -+ sector_t s1, s2, d1 = 0, d2 = 0;
1270 -+ unsigned long back_max;
1271 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1272 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1273 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1274 -+
1275 -+ if (rq1 == NULL || rq1 == rq2)
1276 -+ return rq2;
1277 -+ if (rq2 == NULL)
1278 -+ return rq1;
1279 -+
1280 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1281 -+ return rq1;
1282 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1283 -+ return rq2;
1284 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1285 -+ return rq1;
1286 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1287 -+ return rq2;
1288 -+
1289 -+ s1 = blk_rq_pos(rq1);
1290 -+ s2 = blk_rq_pos(rq2);
1291 -+
1292 -+ /*
1293 -+ * By definition, 1KiB is 2 sectors.
1294 -+ */
1295 -+ back_max = bfqd->bfq_back_max * 2;
1296 -+
1297 -+ /*
1298 -+ * Strict one way elevator _except_ in the case where we allow
1299 -+ * short backward seeks which are biased as twice the cost of a
1300 -+ * similar forward seek.
1301 -+ */
1302 -+ if (s1 >= last)
1303 -+ d1 = s1 - last;
1304 -+ else if (s1 + back_max >= last)
1305 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
1306 -+ else
1307 -+ wrap |= BFQ_RQ1_WRAP;
1308 -+
1309 -+ if (s2 >= last)
1310 -+ d2 = s2 - last;
1311 -+ else if (s2 + back_max >= last)
1312 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
1313 -+ else
1314 -+ wrap |= BFQ_RQ2_WRAP;
1315 -+
1316 -+ /* Found required data */
1317 -+
1318 -+ /*
1319 -+ * By doing switch() on the bit mask "wrap" we avoid having to
1320 -+ * check two variables for all permutations: --> faster!
1321 -+ */
1322 -+ switch (wrap) {
1323 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1324 -+ if (d1 < d2)
1325 -+ return rq1;
1326 -+ else if (d2 < d1)
1327 -+ return rq2;
1328 -+ else {
1329 -+ if (s1 >= s2)
1330 -+ return rq1;
1331 -+ else
1332 -+ return rq2;
1333 -+ }
1334 -+
1335 -+ case BFQ_RQ2_WRAP:
1336 -+ return rq1;
1337 -+ case BFQ_RQ1_WRAP:
1338 -+ return rq2;
1339 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1340 -+ default:
1341 -+ /*
1342 -+ * Since both rqs are wrapped,
1343 -+ * start with the one that's further behind head
1344 -+ * (--> only *one* back seek required),
1345 -+ * since back seek takes more time than forward.
1346 -+ */
1347 -+ if (s1 <= s2)
1348 -+ return rq1;
1349 -+ else
1350 -+ return rq2;
1351 -+ }
1352 -+}
1353 -+
1354 -+static struct bfq_queue *
1355 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1356 -+ sector_t sector, struct rb_node **ret_parent,
1357 -+ struct rb_node ***rb_link)
1358 -+{
1359 -+ struct rb_node **p, *parent;
1360 -+ struct bfq_queue *bfqq = NULL;
1361 -+
1362 -+ parent = NULL;
1363 -+ p = &root->rb_node;
1364 -+ while (*p) {
1365 -+ struct rb_node **n;
1366 -+
1367 -+ parent = *p;
1368 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1369 -+
1370 -+ /*
1371 -+ * Sort strictly based on sector. Smallest to the left,
1372 -+ * largest to the right.
1373 -+ */
1374 -+ if (sector > blk_rq_pos(bfqq->next_rq))
1375 -+ n = &(*p)->rb_right;
1376 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
1377 -+ n = &(*p)->rb_left;
1378 -+ else
1379 -+ break;
1380 -+ p = n;
1381 -+ bfqq = NULL;
1382 -+ }
1383 -+
1384 -+ *ret_parent = parent;
1385 -+ if (rb_link)
1386 -+ *rb_link = p;
1387 -+
1388 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1389 -+ (long long unsigned)sector,
1390 -+ bfqq != NULL ? bfqq->pid : 0);
1391 -+
1392 -+ return bfqq;
1393 -+}
1394 -+
1395 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1396 -+{
1397 -+ struct rb_node **p, *parent;
1398 -+ struct bfq_queue *__bfqq;
1399 -+
1400 -+ if (bfqq->pos_root != NULL) {
1401 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1402 -+ bfqq->pos_root = NULL;
1403 -+ }
1404 -+
1405 -+ if (bfq_class_idle(bfqq))
1406 -+ return;
1407 -+ if (!bfqq->next_rq)
1408 -+ return;
1409 -+
1410 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
1411 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1412 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
1413 -+ if (__bfqq == NULL) {
1414 -+ rb_link_node(&bfqq->pos_node, parent, p);
1415 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1416 -+ } else
1417 -+ bfqq->pos_root = NULL;
1418 -+}
1419 -+
1420 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1421 -+ struct bfq_queue *bfqq,
1422 -+ struct request *last)
1423 -+{
1424 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
1425 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
1426 -+ struct request *next = NULL, *prev = NULL;
1427 -+
1428 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1429 -+
1430 -+ if (rbprev != NULL)
1431 -+ prev = rb_entry_rq(rbprev);
1432 -+
1433 -+ if (rbnext != NULL)
1434 -+ next = rb_entry_rq(rbnext);
1435 -+ else {
1436 -+ rbnext = rb_first(&bfqq->sort_list);
1437 -+ if (rbnext && rbnext != &last->rb_node)
1438 -+ next = rb_entry_rq(rbnext);
1439 -+ }
1440 -+
1441 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1442 -+}
1443 -+
1444 -+static void bfq_del_rq_rb(struct request *rq)
1445 -+{
1446 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1447 -+ struct bfq_data *bfqd = bfqq->bfqd;
1448 -+ const int sync = rq_is_sync(rq);
1449 -+
1450 -+ BUG_ON(bfqq->queued[sync] == 0);
1451 -+ bfqq->queued[sync]--;
1452 -+ bfqd->queued--;
1453 -+
1454 -+ elv_rb_del(&bfqq->sort_list, rq);
1455 -+
1456 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1457 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1458 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1459 -+ /*
1460 -+ * Remove queue from request-position tree as it is empty.
1461 -+ */
1462 -+ if (bfqq->pos_root != NULL) {
1463 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1464 -+ bfqq->pos_root = NULL;
1465 -+ }
1466 -+ }
1467 -+}
1468 -+
1469 -+/* see the definition of bfq_async_charge_factor for details */
1470 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
1471 -+ struct bfq_queue *bfqq)
1472 -+{
1473 -+ return blk_rq_sectors(rq) *
1474 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1475 -+ bfq_async_charge_factor));
1476 -+}
1477 -+
1478 -+/**
1479 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
1480 -+ * @bfqd: the device data the queue belongs to.
1481 -+ * @bfqq: the queue to update.
1482 -+ *
1483 -+ * If the first request of a queue changes we make sure that the queue
1484 -+ * has enough budget to serve at least its first request (if the
1485 -+ * request has grown). We do this because if the queue does not have enough
1486 -+ * budget for its first request, it has to go through two dispatch
1487 -+ * rounds to actually get it dispatched.
1488 -+ */
1489 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
1490 -+ struct bfq_queue *bfqq)
1491 -+{
1492 -+ struct bfq_entity *entity = &bfqq->entity;
1493 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1494 -+ struct request *next_rq = bfqq->next_rq;
1495 -+ unsigned long new_budget;
1496 -+
1497 -+ if (next_rq == NULL)
1498 -+ return;
1499 -+
1500 -+ if (bfqq == bfqd->in_service_queue)
1501 -+ /*
1502 -+ * In order not to break guarantees, budgets cannot be
1503 -+ * changed after an entity has been selected.
1504 -+ */
1505 -+ return;
1506 -+
1507 -+ BUG_ON(entity->tree != &st->active);
1508 -+ BUG_ON(entity == entity->sched_data->in_service_entity);
1509 -+
1510 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
1511 -+ bfq_serv_to_charge(next_rq, bfqq));
1512 -+ entity->budget = new_budget;
1513 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1514 -+ bfq_activate_bfqq(bfqd, bfqq);
1515 -+}
1516 -+
1517 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1518 -+{
1519 -+ u64 dur;
1520 -+
1521 -+ if (bfqd->bfq_raising_max_time > 0)
1522 -+ return bfqd->bfq_raising_max_time;
1523 -+
1524 -+ dur = bfqd->RT_prod;
1525 -+ do_div(dur, bfqd->peak_rate);
1526 -+
1527 -+ return dur;
1528 -+}
1529 -+
1530 -+static void bfq_add_rq_rb(struct request *rq)
1531 -+{
1532 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1533 -+ struct bfq_entity *entity = &bfqq->entity;
1534 -+ struct bfq_data *bfqd = bfqq->bfqd;
1535 -+ struct request *next_rq, *prev;
1536 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
1537 -+ int idle_for_long_time = 0;
1538 -+
1539 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1540 -+ bfqq->queued[rq_is_sync(rq)]++;
1541 -+ bfqd->queued++;
1542 -+
1543 -+ elv_rb_add(&bfqq->sort_list, rq);
1544 -+
1545 -+ /*
1546 -+ * Check if this request is a better next-serve candidate.
1547 -+ */
1548 -+ prev = bfqq->next_rq;
1549 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1550 -+ BUG_ON(next_rq == NULL);
1551 -+ bfqq->next_rq = next_rq;
1552 -+
1553 -+ /*
1554 -+ * Adjust priority tree position, if next_rq changes.
1555 -+ */
1556 -+ if (prev != bfqq->next_rq)
1557 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
1558 -+
1559 -+ if (!bfq_bfqq_busy(bfqq)) {
1560 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1561 -+ time_is_before_jiffies(bfqq->soft_rt_next_start);
1562 -+ idle_for_long_time = time_is_before_jiffies(
1563 -+ bfqq->budget_timeout +
1564 -+ bfqd->bfq_raising_min_idle_time);
1565 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
1566 -+ bfq_serv_to_charge(next_rq, bfqq));
1567 -+
1568 -+ if (!bfqd->low_latency)
1569 -+ goto add_bfqq_busy;
1570 -+
1571 -+ /*
1572 -+ * If the queue is not being boosted and has been idle
1573 -+ * for enough time, start a weight-raising period
1574 -+ */
1575 -+ if (old_raising_coeff == 1 &&
1576 -+ (idle_for_long_time || soft_rt)) {
1577 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1578 -+ if (idle_for_long_time)
1579 -+ bfqq->raising_cur_max_time =
1580 -+ bfq_wrais_duration(bfqd);
1581 -+ else
1582 -+ bfqq->raising_cur_max_time =
1583 -+ bfqd->bfq_raising_rt_max_time;
1584 -+ bfq_log_bfqq(bfqd, bfqq,
1585 -+ "wrais starting at %lu, "
1586 -+ "rais_max_time %u",
1587 -+ jiffies,
1588 -+ jiffies_to_msecs(bfqq->
1589 -+ raising_cur_max_time));
1590 -+ } else if (old_raising_coeff > 1) {
1591 -+ if (idle_for_long_time)
1592 -+ bfqq->raising_cur_max_time =
1593 -+ bfq_wrais_duration(bfqd);
1594 -+ else if (bfqq->raising_cur_max_time ==
1595 -+ bfqd->bfq_raising_rt_max_time &&
1596 -+ !soft_rt) {
1597 -+ bfqq->raising_coeff = 1;
1598 -+ bfq_log_bfqq(bfqd, bfqq,
1599 -+ "wrais ending at %lu, "
1600 -+ "rais_max_time %u",
1601 -+ jiffies,
1602 -+ jiffies_to_msecs(bfqq->
1603 -+ raising_cur_max_time));
1604 -+ } else if (time_before(
1605 -+ bfqq->last_rais_start_finish +
1606 -+ bfqq->raising_cur_max_time,
1607 -+ jiffies +
1608 -+ bfqd->bfq_raising_rt_max_time) &&
1609 -+ soft_rt) {
1610 -+ /*
1611 -+ *
1612 -+ * The remaining weight-raising time is lower
1613 -+ * than bfqd->bfq_raising_rt_max_time, which
1614 -+ * means that the application is enjoying
1615 -+ * weight raising either because deemed soft-
1616 -+ * rt in the near past, or because deemed
1617 -+ * interactive long ago. In both cases,
1618 -+ * resetting now the current remaining weight-
1619 -+ * raising time for the application to the
1620 -+ * weight-raising duration for soft rt
1621 -+ * applications would not cause any latency
1622 -+ * increase for the application (as the new
1623 -+ * duration would be higher than the remaining
1624 -+ * time).
1625 -+ *
1626 -+ * In addition, the application is now meeting
1627 -+ * the requirements for being deemed soft rt.
1628 -+ * In the end we can correctly and safely
1629 -+ * (re)charge the weight-raising duration for
1630 -+ * the application with the weight-raising
1631 -+ * duration for soft rt applications.
1632 -+ *
1633 -+ * In particular, doing this recharge now, i.e.,
1634 -+ * before the weight-raising period for the
1635 -+ * application finishes, reduces the probability
1636 -+ * of the following negative scenario:
1637 -+ * 1) the weight of a soft rt application is
1638 -+ * raised at startup (as for any newly
1639 -+ * created application),
1640 -+ * 2) since the application is not interactive,
1641 -+ * at a certain time weight-raising is
1642 -+ * stopped for the application,
1643 -+ * 3) at that time the application happens to
1644 -+ * still have pending requests, and hence
1645 -+ * is destined to not have a chance to be
1646 -+ * deemed soft rt before these requests are
1647 -+ * completed (see the comments to the
1648 -+ * function bfq_bfqq_softrt_next_start()
1649 -+ * for details on soft rt detection),
1650 -+ * 4) these pending requests experience a high
1651 -+ * latency because the application is not
1652 -+ * weight-raised while they are pending.
1653 -+ */
1654 -+ bfqq->last_rais_start_finish = jiffies;
1655 -+ bfqq->raising_cur_max_time =
1656 -+ bfqd->bfq_raising_rt_max_time;
1657 -+ }
1658 -+ }
1659 -+ if (old_raising_coeff != bfqq->raising_coeff)
1660 -+ entity->ioprio_changed = 1;
1661 -+add_bfqq_busy:
1662 -+ bfqq->last_idle_bklogged = jiffies;
1663 -+ bfqq->service_from_backlogged = 0;
1664 -+ bfq_clear_bfqq_softrt_update(bfqq);
1665 -+ bfq_add_bfqq_busy(bfqd, bfqq);
1666 -+ } else {
1667 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
1668 -+ !rq_is_sync(rq) &&
1669 -+ time_is_before_jiffies(
1670 -+ bfqq->last_rais_start_finish +
1671 -+ bfqd->bfq_raising_min_inter_arr_async)) {
1672 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1673 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1674 -+
1675 -+ bfqd->raised_busy_queues++;
1676 -+ entity->ioprio_changed = 1;
1677 -+ bfq_log_bfqq(bfqd, bfqq,
1678 -+ "non-idle wrais starting at %lu, "
1679 -+ "rais_max_time %u",
1680 -+ jiffies,
1681 -+ jiffies_to_msecs(bfqq->
1682 -+ raising_cur_max_time));
1683 -+ }
1684 -+ bfq_updated_next_req(bfqd, bfqq);
1685 -+ }
1686 -+
1687 -+ if (bfqd->low_latency &&
1688 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1689 -+ idle_for_long_time))
1690 -+ bfqq->last_rais_start_finish = jiffies;
1691 -+}
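/*
 * Recap of the weight-raising decisions taken in bfq_add_rq_rb() when
 * low_latency is enabled (this only restates the branches above):
 * - a queue that becomes busy and is not raised yet starts a raising
 *   period if it has been idle long enough (interactive case, duration
 *   bfq_wrais_duration()) or currently qualifies as soft real-time
 *   (duration bfq_raising_rt_max_time);
 * - a queue that is already raised may have its raising period switched
 *   back to the interactive duration, recharged with the soft real-time
 *   duration, or terminated, depending on the same two conditions;
 * - a queue that is already busy and receives an async request may be
 *   raised if enough time has elapsed since its last raising period.
 */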
1692 -+
1693 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1694 -+{
1695 -+ elv_rb_del(&bfqq->sort_list, rq);
1696 -+ bfqq->queued[rq_is_sync(rq)]--;
1697 -+ bfqq->bfqd->queued--;
1698 -+ bfq_add_rq_rb(rq);
1699 -+}
1700 -+
1701 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1702 -+ struct bio *bio)
1703 -+{
1704 -+ struct task_struct *tsk = current;
1705 -+ struct bfq_io_cq *bic;
1706 -+ struct bfq_queue *bfqq;
1707 -+
1708 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1709 -+ if (bic == NULL)
1710 -+ return NULL;
1711 -+
1712 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1713 -+ if (bfqq != NULL)
1714 -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
1715 -+
1716 -+ return NULL;
1717 -+}
1718 -+
1719 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
1720 -+{
1721 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1722 -+
1723 -+ bfqd->rq_in_driver++;
1724 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1725 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1726 -+ (long long unsigned)bfqd->last_position);
1727 -+}
1728 -+
1729 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1730 -+{
1731 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1732 -+
1733 -+ WARN_ON(bfqd->rq_in_driver == 0);
1734 -+ bfqd->rq_in_driver--;
1735 -+}
1736 -+
1737 -+static void bfq_remove_request(struct request *rq)
1738 -+{
1739 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1740 -+ struct bfq_data *bfqd = bfqq->bfqd;
1741 -+
1742 -+ if (bfqq->next_rq == rq) {
1743 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1744 -+ bfq_updated_next_req(bfqd, bfqq);
1745 -+ }
1746 -+
1747 -+ list_del_init(&rq->queuelist);
1748 -+ bfq_del_rq_rb(rq);
1749 -+
1750 -+ if (rq->cmd_flags & REQ_META) {
1751 -+ WARN_ON(bfqq->meta_pending == 0);
1752 -+ bfqq->meta_pending--;
1753 -+ }
1754 -+}
1755 -+
1756 -+static int bfq_merge(struct request_queue *q, struct request **req,
1757 -+ struct bio *bio)
1758 -+{
1759 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1760 -+ struct request *__rq;
1761 -+
1762 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
1763 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1764 -+ *req = __rq;
1765 -+ return ELEVATOR_FRONT_MERGE;
1766 -+ }
1767 -+
1768 -+ return ELEVATOR_NO_MERGE;
1769 -+}
1770 -+
1771 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
1772 -+ int type)
1773 -+{
1774 -+ if (type == ELEVATOR_FRONT_MERGE) {
1775 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
1776 -+
1777 -+ bfq_reposition_rq_rb(bfqq, req);
1778 -+ }
1779 -+}
1780 -+
1781 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1782 -+ struct request *next)
1783 -+{
1784 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1785 -+
1786 -+ /*
1787 -+ * Reposition in fifo if next is older than rq.
1788 -+ */
1789 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1790 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1791 -+ list_move(&rq->queuelist, &next->queuelist);
1792 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
1793 -+ }
1794 -+
1795 -+ if (bfqq->next_rq == next)
1796 -+ bfqq->next_rq = rq;
1797 -+
1798 -+ bfq_remove_request(next);
1799 -+}
1800 -+
1801 -+/* Must be called with bfqq != NULL */
1802 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
1803 -+{
1804 -+ BUG_ON(bfqq == NULL);
1805 -+ if (bfq_bfqq_busy(bfqq))
1806 -+ bfqq->bfqd->raised_busy_queues--;
1807 -+ bfqq->raising_coeff = 1;
1808 -+ bfqq->raising_cur_max_time = 0;
1809 -+ /* Trigger a weight change on the next activation of the queue */
1810 -+ bfqq->entity.ioprio_changed = 1;
1811 -+}
1812 -+
1813 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
1814 -+ struct bfq_group *bfqg)
1815 -+{
1816 -+ int i, j;
1817 -+
1818 -+ for (i = 0; i < 2; i++)
1819 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
1820 -+ if (bfqg->async_bfqq[i][j] != NULL)
1821 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
1822 -+ if (bfqg->async_idle_bfqq != NULL)
1823 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
1824 -+}
1825 -+
1826 -+static void bfq_end_raising(struct bfq_data *bfqd)
1827 -+{
1828 -+ struct bfq_queue *bfqq;
1829 -+
1830 -+ spin_lock_irq(bfqd->queue->queue_lock);
1831 -+
1832 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
1833 -+ bfq_bfqq_end_raising(bfqq);
1834 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
1835 -+ bfq_bfqq_end_raising(bfqq);
1836 -+ bfq_end_raising_async(bfqd);
1837 -+
1838 -+ spin_unlock_irq(bfqd->queue->queue_lock);
1839 -+}
1840 -+
1841 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1842 -+ struct bio *bio)
1843 -+{
1844 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1845 -+ struct bfq_io_cq *bic;
1846 -+ struct bfq_queue *bfqq;
1847 -+
1848 -+ /*
1849 -+ * Disallow merge of a sync bio into an async request.
1850 -+ */
1851 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1852 -+ return 0;
1853 -+
1854 -+ /*
1855 -+ * Lookup the bfqq that this bio will be queued with. Allow
1856 -+ * merge only if rq is queued there.
1857 -+ * Queue lock is held here.
1858 -+ */
1859 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
1860 -+ if (bic == NULL)
1861 -+ return 0;
1862 -+
1863 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1864 -+ return bfqq == RQ_BFQQ(rq);
1865 -+}
1866 -+
1867 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
1868 -+ struct bfq_queue *bfqq)
1869 -+{
1870 -+ if (bfqq != NULL) {
1871 -+ bfq_mark_bfqq_must_alloc(bfqq);
1872 -+ bfq_mark_bfqq_budget_new(bfqq);
1873 -+ bfq_clear_bfqq_fifo_expire(bfqq);
1874 -+
1875 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
1876 -+
1877 -+ bfq_log_bfqq(bfqd, bfqq,
1878 -+ "set_in_service_queue, cur-budget = %lu",
1879 -+ bfqq->entity.budget);
1880 -+ }
1881 -+
1882 -+ bfqd->in_service_queue = bfqq;
1883 -+}
1884 -+
1885 -+/*
1886 -+ * Get and set a new queue for service.
1887 -+ */
1888 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
1889 -+ struct bfq_queue *bfqq)
1890 -+{
1891 -+ if (!bfqq)
1892 -+ bfqq = bfq_get_next_queue(bfqd);
1893 -+ else
1894 -+ bfq_get_next_queue_forced(bfqd, bfqq);
1895 -+
1896 -+ __bfq_set_in_service_queue(bfqd, bfqq);
1897 -+ return bfqq;
1898 -+}
1899 -+
1900 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
1901 -+ struct request *rq)
1902 -+{
1903 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
1904 -+ return blk_rq_pos(rq) - bfqd->last_position;
1905 -+ else
1906 -+ return bfqd->last_position - blk_rq_pos(rq);
1907 -+}
1908 -+
1909 -+/*
1910 -+ * Return true if bfqq has no request pending and rq is close enough to
1911 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
1912 -+ * bfqq->next_rq
1913 -+ */
1914 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
1915 -+{
1916 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
1917 -+}
1918 -+
1919 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
1920 -+{
1921 -+ struct rb_root *root = &bfqd->rq_pos_tree;
1922 -+ struct rb_node *parent, *node;
1923 -+ struct bfq_queue *__bfqq;
1924 -+ sector_t sector = bfqd->last_position;
1925 -+
1926 -+ if (RB_EMPTY_ROOT(root))
1927 -+ return NULL;
1928 -+
1929 -+ /*
1930 -+ * First, if we find a request starting at the end of the last
1931 -+ * request, choose it.
1932 -+ */
1933 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
1934 -+ if (__bfqq != NULL)
1935 -+ return __bfqq;
1936 -+
1937 -+ /*
1938 -+ * If the exact sector wasn't found, the parent of the NULL leaf
1939 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
1940 -+ * position).
1941 -+ */
1942 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1943 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
1944 -+ return __bfqq;
1945 -+
1946 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
1947 -+ node = rb_next(&__bfqq->pos_node);
1948 -+ else
1949 -+ node = rb_prev(&__bfqq->pos_node);
1950 -+ if (node == NULL)
1951 -+ return NULL;
1952 -+
1953 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
1954 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
1955 -+ return __bfqq;
1956 -+
1957 -+ return NULL;
1958 -+}
1959 -+
1960 -+/*
1961 -+ * bfqd - obvious
1962 -+ * cur_bfqq - passed in so that we don't decide that the current queue
1963 -+ * is closely cooperating with itself.
1964 -+ *
1965 -+ * We are assuming that cur_bfqq has dispatched at least one request,
1966 -+ * and that bfqd->last_position reflects a position on the disk associated
1967 -+ * with the I/O issued by cur_bfqq.
1968 -+ */
1969 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
1970 -+ struct bfq_queue *cur_bfqq)
1971 -+{
1972 -+ struct bfq_queue *bfqq;
1973 -+
1974 -+ if (bfq_class_idle(cur_bfqq))
1975 -+ return NULL;
1976 -+ if (!bfq_bfqq_sync(cur_bfqq))
1977 -+ return NULL;
1978 -+ if (BFQQ_SEEKY(cur_bfqq))
1979 -+ return NULL;
1980 -+
1981 -+ /* If device has only one backlogged bfq_queue, don't search. */
1982 -+ if (bfqd->busy_queues == 1)
1983 -+ return NULL;
1984 -+
1985 -+ /*
1986 -+ * We should notice if some of the queues are cooperating, e.g.
1987 -+ * working closely on the same area of the disk. In that case,
1988 -+ * we can group them together and don't waste time idling.
1989 -+ */
1990 -+ bfqq = bfqq_close(bfqd);
1991 -+ if (bfqq == NULL || bfqq == cur_bfqq)
1992 -+ return NULL;
1993 -+
1994 -+ /*
1995 -+ * Do not merge queues from different bfq_groups.
1996 -+ */
1997 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
1998 -+ return NULL;
1999 -+
2000 -+ /*
2001 -+ * It only makes sense to merge sync queues.
2002 -+ */
2003 -+ if (!bfq_bfqq_sync(bfqq))
2004 -+ return NULL;
2005 -+ if (BFQQ_SEEKY(bfqq))
2006 -+ return NULL;
2007 -+
2008 -+ /*
2009 -+ * Do not merge queues of different priority classes.
2010 -+ */
2011 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2012 -+ return NULL;
2013 -+
2014 -+ return bfqq;
2015 -+}
2016 -+
2017 -+/*
2018 -+ * If enough samples have been computed, return the current max budget
2019 -+ * stored in bfqd, which is dynamically updated according to the
2020 -+ * estimated disk peak rate; otherwise return the default max budget
2021 -+ */
2022 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2023 -+{
2024 -+ if (bfqd->budgets_assigned < 194)
2025 -+ return bfq_default_max_budget;
2026 -+ else
2027 -+ return bfqd->bfq_max_budget;
2028 -+}
2029 -+
2030 -+/*
2031 -+ * Return min budget, which is a fraction of the current or default
2032 -+ * max budget (trying with 1/32)
2033 -+ */
2034 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2035 -+{
2036 -+ if (bfqd->budgets_assigned < 194)
2037 -+ return bfq_default_max_budget / 32;
2038 -+ else
2039 -+ return bfqd->bfq_max_budget / 32;
2040 -+}
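/*
 * Note on the 194 threshold used in the two helpers above:
 * budgets_assigned is updated in __bfq_set_in_service_queue() as
 * (budgets_assigned * 7 + 256) / 8, so it converges geometrically
 * towards 256.  Starting from zero, the sequence with integer division
 * is 32, 60, 84, 105, 123, 139, 153, 165, 176, 186, 194, ...; hence the
 * auto-tuned bfq_max_budget starts being trusted after roughly a dozen
 * queue activations.
 */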
2041 -+
2042 -+/*
2043 -+ * Decides whether idling should be done for the given device and
2044 -+ * the given in-service queue.
2045 -+ */
2046 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2047 -+ struct bfq_queue *in_service_bfqq)
2048 -+{
2049 -+ if (in_service_bfqq == NULL)
2050 -+ return false;
2051 -+ /*
2052 -+ * If the device is non-rotational, and hence has no seek penalty,
2053 -+ * disable idling; but do so only if:
2054 -+ * - device does not support queuing, otherwise we still have
2055 -+ * a problem with sync vs async workloads;
2056 -+ * - the queue is not weight-raised, to preserve guarantees.
2057 -+ */
2058 -+ return blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2059 -+ (in_service_bfqq->raising_coeff == 1);
2060 -+}
2061 -+
2062 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2063 -+{
2064 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2065 -+ struct bfq_io_cq *bic;
2066 -+ unsigned long sl;
2067 -+
2068 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2069 -+
2070 -+ /* Tasks have exited, don't wait. */
2071 -+ bic = bfqd->in_service_bic;
2072 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2073 -+ return;
2074 -+
2075 -+ bfq_mark_bfqq_wait_request(bfqq);
2076 -+
2077 -+ /*
2078 -+ * We don't want to idle for seeks, but we do want to allow
2079 -+ * fair distribution of slice time for a process doing back-to-back
2080 -+ * seeks. So allow a little bit of time for it to submit a new rq.
2081 -+ *
2082 -+ * To prevent processes with (partly) seeky workloads from
2083 -+ * being too ill-treated, grant them a small fraction of the
2084 -+ * assigned budget before reducing the waiting time to
2085 -+ * BFQ_MIN_TT. This has been observed to help reduce latency.
2086 -+ */
2087 -+ sl = bfqd->bfq_slice_idle;
2088 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2089 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2090 -+ bfqq->raising_coeff == 1)
2091 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2092 -+ else if (bfqq->raising_coeff > 1)
2093 -+ sl = sl * 3;
2094 -+ bfqd->last_idling_start = ktime_get();
2095 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2096 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
2097 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2098 -+}
2099 -+
2100 -+/*
2101 -+ * Set the maximum time for the in-service queue to consume its
2102 -+ * budget. This prevents seeky processes from lowering the disk
2103 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
2104 -+ */
2105 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2106 -+{
2107 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2108 -+ unsigned int timeout_coeff;
2109 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2110 -+ timeout_coeff = 1;
2111 -+ else
2112 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2113 -+
2114 -+ bfqd->last_budget_start = ktime_get();
2115 -+
2116 -+ bfq_clear_bfqq_budget_new(bfqq);
2117 -+ bfqq->budget_timeout = jiffies +
2118 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2119 -+
2120 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2121 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2122 -+ timeout_coeff));
2123 -+}
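/*
 * Illustrative note (example values assumed): outside the soft real-time
 * case, timeout_coeff equals the weight-raising factor currently applied
 * to the queue, because for a raised queue entity->weight is (once
 * updated) entity->orig_weight times that factor.  So a queue raised by
 * a factor of 10, with a 125 ms sync budget timeout, would get its
 * budget_timeout set 1250 ms in the future, i.e. proportionally more
 * wall-clock time to consume its budget.
 */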
2124 -+
2125 -+/*
2126 -+ * Move request from internal lists to the request queue dispatch list.
2127 -+ */
2128 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2129 -+{
2130 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2131 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2132 -+
2133 -+ bfq_remove_request(rq);
2134 -+ bfqq->dispatched++;
2135 -+ elv_dispatch_sort(q, rq);
2136 -+
2137 -+ if (bfq_bfqq_sync(bfqq))
2138 -+ bfqd->sync_flight++;
2139 -+}
2140 -+
2141 -+/*
2142 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
2143 -+ */
2144 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2145 -+{
2146 -+ struct request *rq = NULL;
2147 -+
2148 -+ if (bfq_bfqq_fifo_expire(bfqq))
2149 -+ return NULL;
2150 -+
2151 -+ bfq_mark_bfqq_fifo_expire(bfqq);
2152 -+
2153 -+ if (list_empty(&bfqq->fifo))
2154 -+ return NULL;
2155 -+
2156 -+ rq = rq_entry_fifo(bfqq->fifo.next);
2157 -+
2158 -+ if (time_before(jiffies, rq_fifo_time(rq)))
2159 -+ return NULL;
2160 -+
2161 -+ return rq;
2162 -+}
2163 -+
2164 -+/*
2165 -+ * Must be called with the queue_lock held.
2166 -+ */
2167 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
2168 -+{
2169 -+ int process_refs, io_refs;
2170 -+
2171 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2172 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2173 -+ BUG_ON(process_refs < 0);
2174 -+ return process_refs;
2175 -+}
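/*
 * Note on the computation above: each allocated request (counted in
 * ->allocated[]) and the on-service-tree state (entity.on_st) are taken
 * to pin one reference each, so what is left after subtracting them
 * approximates the number of process/io-context references.  This is
 * the quantity bfq_setup_merge() below compares to pick the direction
 * of a queue merge.
 */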
2176 -+
2177 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2178 -+{
2179 -+ int process_refs, new_process_refs;
2180 -+ struct bfq_queue *__bfqq;
2181 -+
2182 -+ /*
2183 -+ * If there are no process references on the new_bfqq, then it is
2184 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2185 -+ * may have dropped their last reference (not just their last process
2186 -+ * reference).
2187 -+ */
2188 -+ if (!bfqq_process_refs(new_bfqq))
2189 -+ return;
2190 -+
2191 -+ /* Avoid a circular list and skip interim queue merges. */
2192 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
2193 -+ if (__bfqq == bfqq)
2194 -+ return;
2195 -+ new_bfqq = __bfqq;
2196 -+ }
2197 -+
2198 -+ process_refs = bfqq_process_refs(bfqq);
2199 -+ new_process_refs = bfqq_process_refs(new_bfqq);
2200 -+ /*
2201 -+ * If the process for the bfqq has gone away, there is no
2202 -+ * sense in merging the queues.
2203 -+ */
2204 -+ if (process_refs == 0 || new_process_refs == 0)
2205 -+ return;
2206 -+
2207 -+ /*
2208 -+ * Merge in the direction of the lesser amount of work.
2209 -+ */
2210 -+ if (new_process_refs >= process_refs) {
2211 -+ bfqq->new_bfqq = new_bfqq;
2212 -+ atomic_add(process_refs, &new_bfqq->ref);
2213 -+ } else {
2214 -+ new_bfqq->new_bfqq = bfqq;
2215 -+ atomic_add(new_process_refs, &bfqq->ref);
2216 -+ }
2217 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2218 -+ new_bfqq->pid);
2219 -+}
2220 -+
2221 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2222 -+{
2223 -+ struct bfq_entity *entity = &bfqq->entity;
2224 -+ return entity->budget - entity->service;
2225 -+}
2226 -+
2227 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2228 -+{
2229 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2230 -+
2231 -+ __bfq_bfqd_reset_in_service(bfqd);
2232 -+
2233 -+ /*
2234 -+ * If this bfqq is shared between multiple processes, check
2235 -+ * to make sure that those processes are still issuing I/Os
2236 -+ * within the mean seek distance. If not, it may be time to
2237 -+ * break the queues apart again.
2238 -+ */
2239 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2240 -+ bfq_mark_bfqq_split_coop(bfqq);
2241 -+
2242 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2243 -+ /*
2244 -+ * overloading budget_timeout field to store when
2245 -+ * the queue remains with no backlog, used by
2246 -+ * the weight-raising mechanism
2247 -+ */
2248 -+ bfqq->budget_timeout = jiffies;
2249 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2250 -+ } else {
2251 -+ bfq_activate_bfqq(bfqd, bfqq);
2252 -+ /*
2253 -+ * Resort priority tree of potential close cooperators.
2254 -+ */
2255 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
2256 -+ }
2257 -+}
2258 -+
2259 -+/**
2260 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2261 -+ * @bfqd: device data.
2262 -+ * @bfqq: queue to update.
2263 -+ * @reason: reason for expiration.
2264 -+ *
2265 -+ * Handle the feedback on @bfqq budget. See the body for detailed
2266 -+ * comments.
2267 -+ */
2268 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2269 -+ struct bfq_queue *bfqq,
2270 -+ enum bfqq_expiration reason)
2271 -+{
2272 -+ struct request *next_rq;
2273 -+ unsigned long budget, min_budget;
2274 -+
2275 -+ budget = bfqq->max_budget;
2276 -+ min_budget = bfq_min_budget(bfqd);
2277 -+
2278 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2279 -+
2280 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2281 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2282 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2283 -+ budget, bfq_min_budget(bfqd));
2284 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2285 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2286 -+
2287 -+ if (bfq_bfqq_sync(bfqq)) {
2288 -+ switch (reason) {
2289 -+ /*
2290 -+ * Caveat: in all the following cases we trade latency
2291 -+ * for throughput.
2292 -+ */
2293 -+ case BFQ_BFQQ_TOO_IDLE:
2294 -+ /*
2295 -+ * This is the only case where we may reduce
2296 -+ * the budget: if there is no request of the
2297 -+ * process still waiting for completion, then
2298 -+ * we assume (tentatively) that the timer has
2299 -+ * expired because the batch of requests of
2300 -+ * the process could have been served with a
2301 -+ * smaller budget. Hence, betting that
2302 -+ * process will behave in the same way when it
2303 -+ * becomes backlogged again, we reduce its
2304 -+ * next budget. As long as we guess right,
2305 -+ * this budget cut reduces the latency
2306 -+ * experienced by the process.
2307 -+ *
2308 -+ * However, if there are still outstanding
2309 -+ * requests, then the process may have not yet
2310 -+ * issued its next request just because it is
2311 -+ * still waiting for the completion of some of
2312 -+ * the still outstanding ones. So in this
2313 -+ * subcase we do not reduce its budget, on the
2314 -+ * contrary we increase it to possibly boost
2315 -+ * the throughput, as discussed in the
2316 -+ * comments to the BUDGET_TIMEOUT case.
2317 -+ */
2318 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
2319 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2320 -+ else {
2321 -+ if (budget > 5 * min_budget)
2322 -+ budget -= 4 * min_budget;
2323 -+ else
2324 -+ budget = min_budget;
2325 -+ }
2326 -+ break;
2327 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
2328 -+ /*
2329 -+ * We double the budget here because: 1) it
2330 -+ * gives the chance to boost the throughput if
2331 -+ * this is not a seeky process (which may have
2332 -+ * bumped into this timeout because of, e.g.,
2333 -+ * ZBR), 2) together with charge_full_budget
2334 -+ * it helps give seeky processes higher
2335 -+ * timestamps, and hence be served less
2336 -+ * frequently.
2337 -+ */
2338 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2339 -+ break;
2340 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2341 -+ /*
2342 -+ * The process still has backlog, and did not
2343 -+ * let either the budget timeout or the disk
2344 -+ * idling timeout expire. Hence it is not
2345 -+ * seeky, has a short thinktime and may be
2346 -+ * happy with a higher budget too. So
2347 -+ * definitely increase the budget of this good
2348 -+ * candidate to boost the disk throughput.
2349 -+ */
2350 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
2351 -+ break;
2352 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
2353 -+ /*
2354 -+ * Leave the budget unchanged.
2355 -+ */
2356 -+ default:
2357 -+ return;
2358 -+ }
2359 -+ } else /* async queue */
2360 -+ /* async queues always get the maximum possible budget
2361 -+ * (their ability to dispatch is limited by
2362 -+ * @bfqd->bfq_max_budget_async_rq).
2363 -+ */
2364 -+ budget = bfqd->bfq_max_budget;
2365 -+
2366 -+ bfqq->max_budget = budget;
2367 -+
2368 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2369 -+ bfqq->max_budget > bfqd->bfq_max_budget)
2370 -+ bfqq->max_budget = bfqd->bfq_max_budget;
2371 -+
2372 -+ /*
2373 -+ * Make sure that we have enough budget for the next request.
2374 -+ * Since the finish time of the bfqq must be kept in sync with
2375 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
2376 -+ * update.
2377 -+ */
2378 -+ next_rq = bfqq->next_rq;
2379 -+ if (next_rq != NULL)
2380 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2381 -+ bfq_serv_to_charge(next_rq, bfqq));
2382 -+ else
2383 -+ bfqq->entity.budget = bfqq->max_budget;
2384 -+
2385 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2386 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2387 -+ bfqq->entity.budget);
2388 -+}
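/*
 * Compact recap of the feedback above, for sync queues: on TOO_IDLE with
 * no request still in flight the budget shrinks by 4 * min_budget (or
 * collapses to min_budget); on TOO_IDLE with requests in flight, and on
 * BUDGET_TIMEOUT, it doubles; on BUDGET_EXHAUSTED it quadruples; on
 * NO_MORE_REQUESTS it is left untouched.  Every increase is clamped to
 * bfq_max_budget, and async queues simply get the maximum budget.
 */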
2389 -+
2390 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2391 -+{
2392 -+ unsigned long max_budget;
2393 -+
2394 -+ /*
2395 -+ * The max_budget calculated when autotuning is equal to the
2396 -+ * number of sectors transferred in timeout_sync at the
2397 -+ * estimated peak rate.
2398 -+ */
2399 -+ max_budget = (unsigned long)(peak_rate * 1000 *
2400 -+ timeout >> BFQ_RATE_SHIFT);
2401 -+
2402 -+ return max_budget;
2403 -+}
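/*
 * Worked example (device figures assumed, and assuming BFQ_RATE_SHIFT is
 * 16 as defined elsewhere in this patch): a disk sustaining about
 * 100 MB/s moves roughly 0.2 sectors/usec, stored in fixed point as
 * peak_rate ~= 0.2 * 65536 ~= 13100.  With a 125 ms sync timeout the
 * formula gives max_budget ~= 13100 * 1000 * 125 >> 16 ~= 25000 sectors,
 * i.e. the ~12 MiB such a disk can transfer within one budget timeout.
 */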
2404 -+
2405 -+/*
2406 -+ * In addition to updating the peak rate, checks whether the process
2407 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
2408 -+ * to the budget timeout, to reduce the amount of service provided to
2409 -+ * seeky processes, and hence reduce their chances of lowering the
2410 -+ * throughput. See the code for more details.
2411 -+ */
2412 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2413 -+ int compensate, enum bfqq_expiration reason)
2414 -+{
2415 -+ u64 bw, usecs, expected, timeout;
2416 -+ ktime_t delta;
2417 -+ int update = 0;
2418 -+
2419 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2420 -+ return 0;
2421 -+
2422 -+ if (compensate)
2423 -+ delta = bfqd->last_idling_start;
2424 -+ else
2425 -+ delta = ktime_get();
2426 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
2427 -+ usecs = ktime_to_us(delta);
2428 -+
2429 -+ /* Don't trust short/unrealistic values. */
2430 -+ if (usecs < 100 || usecs >= LONG_MAX)
2431 -+ return 0;
2432 -+
2433 -+ /*
2434 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
2435 -+ * value to store the peak rate, in sectors per usec in fixed
2436 -+ * point math. We do so to have enough precision in the estimate
2437 -+ * and to avoid overflows.
2438 -+ */
2439 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2440 -+ do_div(bw, (unsigned long)usecs);
2441 -+
2442 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2443 -+
2444 -+ /*
2445 -+ * Use only long (> 20ms) intervals to filter out spikes for
2446 -+ * the peak rate estimation.
2447 -+ */
2448 -+ if (usecs > 20000) {
2449 -+ if (bw > bfqd->peak_rate ||
2450 -+ (!BFQQ_SEEKY(bfqq) &&
2451 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2452 -+ bfq_log(bfqd, "measured bw =%llu", bw);
2453 -+ /*
2454 -+ * To smooth oscillations use a low-pass filter with
2455 -+ * alpha=7/8, i.e.,
2456 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
2457 -+ */
2458 -+ do_div(bw, 8);
2459 -+ if (bw == 0)
2460 -+ return 0;
2461 -+ bfqd->peak_rate *= 7;
2462 -+ do_div(bfqd->peak_rate, 8);
2463 -+ bfqd->peak_rate += bw;
2464 -+ update = 1;
2465 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2466 -+ }
2467 -+
2468 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2469 -+
2470 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2471 -+ bfqd->peak_rate_samples++;
2472 -+
2473 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2474 -+ update && bfqd->bfq_user_max_budget == 0) {
2475 -+ bfqd->bfq_max_budget =
2476 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2477 -+ bfq_log(bfqd, "new max_budget=%lu",
2478 -+ bfqd->bfq_max_budget);
2479 -+ }
2480 -+ }
2481 -+
2482 -+ /*
2483 -+ * If the process has been served for too short a time
2484 -+ * interval to let its possible sequential accesses prevail over
2485 -+ * the initial seek time needed to move the disk head to the
2486 -+ * first sector it requested, then give the process a chance
2487 -+ * and for the moment return false.
2488 -+ */
2489 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2490 -+ return 0;
2491 -+
2492 -+ /*
2493 -+ * A process is considered ``slow'' (i.e., seeky, so that we
2494 -+ * cannot treat it fairly in the service domain, as it would
2495 -+ * slow down the other processes too much) if, when a slice
2496 -+ * ends for whatever reason, it has received service at a
2497 -+ * rate that would not be high enough to complete the budget
2498 -+ * before the budget timeout expiration.
2499 -+ */
2500 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2501 -+
2502 -+ /*
2503 -+ * Caveat: processes doing IO in the slower disk zones will
2504 -+ * tend to be slow(er) even if not seeky. And the estimated
2505 -+ * peak rate will actually be an average over the disk
2506 -+ * surface. Hence, to not be too harsh with unlucky processes,
2507 -+ * we keep a budget/3 margin of safety before declaring a
2508 -+ * process slow.
2509 -+ */
2510 -+ return expected > (4 * bfqq->entity.budget) / 3;
2511 -+}
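/*
 * Illustrative note on the low-pass filter above (sample values assumed):
 * with alpha = 7/8 the stored peak_rate moves only one eighth of the way
 * towards each new long-interval measurement.  For instance, a stored
 * peak_rate of 24000 and a new measurement bw = 32000 give
 * 24000 * 7/8 + 32000 / 8 = 25000, so a single outlier sample shifts the
 * estimate by only a few percent.
 */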
2512 -+
2513 -+/*
2514 -+ * To be deemed as soft real-time, an application must meet two requirements.
2515 -+ * First, the application must not require an average bandwidth higher than
2516 -+ * the approximate bandwidth required to play back or record a compressed high-
2517 -+ * definition video.
2518 -+ * The next function is invoked on the completion of the last request of a
2519 -+ * batch, to compute the next-start time instant, soft_rt_next_start, such
2520 -+ * that, if the next request of the application does not arrive before
2521 -+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
2522 -+ *
2523 -+ * The second requirement is that the request pattern of the application is
2524 -+ * isochronous, i.e., that, after issuing a request or a batch of requests,
2525 -+ * the application stops issuing new requests until all its pending requests
2526 -+ * have been completed. After that, the application may issue a new batch,
2527 -+ * and so on.
2528 -+ * For this reason the next function is invoked to compute soft_rt_next_start
2529 -+ * only for applications that meet this requirement, whereas soft_rt_next_start
2530 -+ * is set to infinity for applications that do not.
2531 -+ *
2532 -+ * Unfortunately, even a greedy application may happen to behave in an
2533 -+ * isochronous way if the CPU load is high. In fact, the application may stop
2534 -+ * issuing requests while the CPUs are busy serving other processes, then
2535 -+ * restart, then stop again for a while, and so on. In addition, if the disk
2536 -+ * achieves a low enough throughput with the request pattern issued by the
2537 -+ * application (e.g., because the request pattern is random and/or the device
2538 -+ * is slow), then the application may meet the above bandwidth requirement too.
2539 -+ * To prevent such a greedy application from being deemed soft real-time, a
2540 -+ * further rule is used in the computation of soft_rt_next_start:
2541 -+ * soft_rt_next_start must be higher than the current time plus the maximum
2542 -+ * time for which the arrival of a request is waited for when a sync queue
2543 -+ * becomes idle, namely bfqd->bfq_slice_idle.
2544 -+ * This filters out greedy applications, as the latter instead issue their next
2545 -+ * request as soon as possible after the last one has been completed (in
2546 -+ * contrast, when a batch of requests is completed, a soft real-time application
2547 -+ * spends some time processing data).
2548 -+ *
2549 -+ * Unfortunately, the last filter may easily generate false positives if only
2550 -+ * bfqd->bfq_slice_idle is used as a reference time interval and one or both
2551 -+ * the following cases occur:
2552 -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
2553 -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2554 -+ * HZ=100.
2555 -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2556 -+ * for a while, then suddenly 'jump' by several units to recover the lost
2557 -+ * increments. This seems to happen, e.g., inside virtual machines.
2558 -+ * To address this issue, we do not use as a reference time interval just
2559 -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2560 -+ * particular we add the minimum number of jiffies for which the filter seems
2561 -+ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
2562 -+ */
2563 -+static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2564 -+ struct bfq_queue *bfqq)
2565 -+{
2566 -+ return max(bfqq->last_idle_bklogged +
2567 -+ HZ * bfqq->service_from_backlogged /
2568 -+ bfqd->bfq_raising_max_softrt_rate,
2569 -+ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2570 -+}
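/*
 * Worked example for the bandwidth term above (rate value assumed): if a
 * queue received service_from_backlogged = 2048 sectors during its last
 * backlogged period and bfq_raising_max_softrt_rate were 7000 sectors/s,
 * the first argument of max() would require the next request to arrive
 * no earlier than HZ * 2048 / 7000 ~= 0.29 * HZ jiffies (about 290 ms)
 * after last_idle_bklogged, which caps the average rate of the queue at
 * the configured value; the second argument only adds the
 * bfq_slice_idle + 4 jiffies guard discussed in the comments above.
 */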
2571 -+
2572 -+/*
2573 -+ * Return the largest-possible time instant such that, for as long as possible,
2574 -+ * the current time will be lower than this time instant according to the macro
2575 -+ * time_is_before_jiffies().
2576 -+ */
2577 -+static inline unsigned long bfq_infinity_from_now(unsigned long now)
2578 -+{
2579 -+ return now + ULONG_MAX / 2;
2580 -+}
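/*
 * Note: jiffies comparisons such as time_is_before_jiffies() operate on
 * signed differences, so an instant is seen as "in the future" only
 * while it is less than ULONG_MAX / 2 ticks ahead of jiffies.  Returning
 * now + ULONG_MAX / 2 therefore yields the farthest instant that keeps
 * being reported as not yet reached, which is how "infinity" is emulated
 * for soft_rt_next_start.
 */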
2581 -+
2582 -+/**
2583 -+ * bfq_bfqq_expire - expire a queue.
2584 -+ * @bfqd: device owning the queue.
2585 -+ * @bfqq: the queue to expire.
2586 -+ * @compensate: if true, compensate for the time spent idling.
2587 -+ * @reason: the reason causing the expiration.
2588 -+ *
2589 -+ *
2590 -+ * If the process associated to the queue is slow (i.e., seeky), or in
2591 -+ * case of budget timeout, or, finally, if it is async, we
2592 -+ * artificially charge it an entire budget (independently of the
2593 -+ * actual service it received). As a consequence, the queue will get
2594 -+ * higher timestamps than the correct ones upon reactivation, and
2595 -+ * hence it will be rescheduled as if it had received more service
2596 -+ * than what it actually received. In the end, this class of processes
2597 -+ * will receive less service in proportion to how slowly they consume
2598 -+ * their budgets (and hence how seriously they tend to lower the
2599 -+ * throughput).
2600 -+ *
2601 -+ * In contrast, when a queue expires because it has been idling for
2602 -+ * too long or because it exhausted its budget, we do not touch the
2603 -+ * amount of service it has received. Hence when the queue will be
2604 -+ * reactivated and its timestamps updated, the latter will be in sync
2605 -+ * with the actual service received by the queue until expiration.
2606 -+ *
2607 -+ * Charging a full budget to the first type of queues and the exact
2608 -+ * service to the others has the effect of using the WF2Q+ policy to
2609 -+ * schedule the former on a timeslice basis, without violating the
2610 -+ * service domain guarantees of the latter.
2611 -+ */
2612 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
2613 -+ struct bfq_queue *bfqq,
2614 -+ int compensate,
2615 -+ enum bfqq_expiration reason)
2616 -+{
2617 -+ int slow;
2618 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2619 -+
2620 -+ /* Update disk peak rate for autotuning and check whether the
2621 -+ * process is slow (see bfq_update_peak_rate).
2622 -+ */
2623 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2624 -+
2625 -+ /*
2626 -+ * As explained above, 'punish' slow (i.e., seeky), timed-out
2627 -+ * and async queues, to favor sequential sync workloads.
2628 -+ *
2629 -+ * Processes doing IO in the slower disk zones will tend to be
2630 -+ * slow(er) even if not seeky. Hence, since the estimated peak
2631 -+ * rate is actually an average over the disk surface, these
2632 -+ * processes may time out just due to bad luck. To avoid punishing
2633 -+ * them we do not charge a full budget to a process that
2634 -+ * succeeded in consuming at least 2/3 of its budget.
2635 -+ */
2636 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2637 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2638 -+ bfq_bfqq_charge_full_budget(bfqq);
2639 -+
2640 -+ bfqq->service_from_backlogged += bfqq->entity.service;
2641 -+
2642 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2643 -+ bfqq->last_rais_start_finish = jiffies;
2644 -+
2645 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
2646 -+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
2647 -+ /*
2648 -+ * If we get here, and there are no outstanding requests,
2649 -+ * then the request pattern is isochronous (see the comments
2650 -+ * to the function bfq_bfqq_softrt_next_start()). Hence we can
2651 -+ * compute soft_rt_next_start. If, instead, the queue still
2652 -+ * has outstanding requests, then we have to wait for the
2653 -+ * completion of all the outstanding requests to discover
2654 -+ * whether the request pattern is actually isochronous.
2655 -+ */
2656 -+ if (bfqq->dispatched == 0)
2657 -+ bfqq->soft_rt_next_start =
2658 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
2659 -+ else {
2660 -+ /*
2661 -+ * The application is still waiting for the
2662 -+ * completion of one or more requests:
2663 -+ * prevent it from possibly being incorrectly
2664 -+ * deemed as soft real-time by setting its
2665 -+ * soft_rt_next_start to infinity. In fact,
2666 -+ * without this assignment, the application
2667 -+ * would be incorrectly deemed as soft
2668 -+ * real-time if:
2669 -+ * 1) it issued a new request before the
2670 -+ * completion of all its in-flight
2671 -+ * requests, and
2672 -+ * 2) at that time, its soft_rt_next_start
2673 -+ * happened to be in the past.
2674 -+ */
2675 -+ bfqq->soft_rt_next_start =
2676 -+ bfq_infinity_from_now(jiffies);
2677 -+ /*
2678 -+ * Schedule an update of soft_rt_next_start to when
2679 -+ * the task may be discovered to be isochronous.
2680 -+ */
2681 -+ bfq_mark_bfqq_softrt_update(bfqq);
2682 -+ }
2683 -+ }
2684 -+
2685 -+ bfq_log_bfqq(bfqd, bfqq,
2686 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2687 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2688 -+
2689 -+ /* Increase, decrease or leave budget unchanged according to reason */
2690 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2691 -+ __bfq_bfqq_expire(bfqd, bfqq);
2692 -+}
2693 -+
2694 -+/*
2695 -+ * Budget timeout is not implemented through a dedicated timer, but
2696 -+ * just checked on request arrivals and completions, as well as on
2697 -+ * idle timer expirations.
2698 -+ */
2699 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2700 -+{
2701 -+ if (bfq_bfqq_budget_new(bfqq))
2702 -+ return 0;
2703 -+
2704 -+ if (time_before(jiffies, bfqq->budget_timeout))
2705 -+ return 0;
2706 -+
2707 -+ return 1;
2708 -+}
2709 -+
2710 -+/*
2711 -+ * If we expire a queue that is waiting for the arrival of a new
2712 -+ * request, we may prevent the fictitious timestamp back-shifting that
2713 -+ * allows the guarantees of the queue to be preserved (see [1] for
2714 -+ * this tricky aspect). Hence we return true only if this condition
2715 -+ * does not hold, or if the queue is so slow that it deserves only to be
2716 -+ * kicked off, to preserve a high throughput.
2717 -+*/
2718 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2719 -+{
2720 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
2721 -+ "may_budget_timeout: wr %d left %d timeout %d",
2722 -+ bfq_bfqq_wait_request(bfqq),
2723 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2724 -+ bfq_bfqq_budget_timeout(bfqq));
2725 -+
2726 -+ return (!bfq_bfqq_wait_request(bfqq) ||
2727 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2728 -+ &&
2729 -+ bfq_bfqq_budget_timeout(bfqq);
2730 -+}
2731 -+
2732 -+/*
2733 -+ * For weight-raised queues issuing sync requests, idling is always performed,
2734 -+ * as this is instrumental in guaranteeing a high fraction of the throughput
2735 -+ * to these queues, and hence in guaranteeing a lower latency for their
2736 -+ * requests. See [1] for details.
2737 -+ *
2738 -+ * For non-weight-raised queues, idling is instead disabled if the device is
2739 -+ * NCQ-enabled and non-rotational, as this boosts the throughput on such
2740 -+ * devices.
2741 -+ */
2742 -+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2743 -+{
2744 -+ struct bfq_data *bfqd = bfqq->bfqd;
2745 -+
2746 -+ return bfq_bfqq_sync(bfqq) && (
2747 -+ bfqq->raising_coeff > 1 ||
2748 -+ (bfq_bfqq_idle_window(bfqq) &&
2749 -+ !(bfqd->hw_tag &&
2750 -+ (blk_queue_nonrot(bfqd->queue) ||
2751 -+ /*
2752 -+ * If there are weight-raised busy queues, then do not idle
2753 -+ * the disk for a sync non-weight-raised queue, and hence
2754 -+ * expire the queue immediately if empty. Combined with the
2755 -+ * timestamping rules of BFQ (see [1] for details), this
2756 -+ * causes sync non-weight-raised queues to get a lower
2757 -+ * fraction of the disk throughput, and hence reduces the rate
2758 -+ * at which the processes associated to these queues ask for
2759 -+ * requests from the request pool.
2760 -+ *
2761 -+ * This is beneficial for weight-raised processes, when the
2762 -+ * system operates in request-pool saturation conditions
2763 -+ * (e.g., in the presence of write hogs). In fact, if
2764 -+ * non-weight-raised processes ask for requests at a lower
2765 -+ * rate, then weight-raised processes have a higher
2766 -+ * probability to get a request from the pool immediately
2767 -+ * (or at least soon) when they need one. Hence they have a
2768 -+ * higher probability to actually get a fraction of the disk
2769 -+ * throughput proportional to their high weight. This is
2770 -+ * especially true with NCQ-enabled drives, which enqueue
2771 -+ * several requests in advance and further reorder
2772 -+ * internally-queued requests.
2773 -+ *
2774 -+ * Mistreating non-weight-raised queues in the above-described
2775 -+ * way, when there are busy weight-raised queues, seems to
2776 -+ * mitigate starvation problems in the presence of heavy write
2777 -+ * workloads and NCQ, and hence to guarantee a higher
2778 -+ * application and system responsiveness in these hostile
2779 -+ * scenarios.
2780 -+ */
2781 -+ bfqd->raised_busy_queues > 0)
2782 -+ )
2783 -+ )
2784 -+ );
2785 -+}
2786 -+
2787 -+/*
2788 -+ * If the in-service queue is empty, but it is sync and either of the following
2789 -+ * conditions holds, then: 1) the queue must remain in service and cannot be
2790 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
2791 -+ * of a new request for the queue. The conditions are:
2792 -+ * - the device is rotational and not performing NCQ, and the queue has its
2793 -+ * idle window set (in this case, waiting for a new request for the queue
2794 -+ * is likely to boost the disk throughput);
2795 -+ * - the queue is weight-raised (waiting for the request is necessary to
2796 -+ * provide the queue with fairness and latency guarantees, see [1] for
2797 -+ * details).
2798 -+ */
2799 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
2800 -+{
2801 -+ struct bfq_data *bfqd = bfqq->bfqd;
2802 -+
2803 -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
2804 -+ bfq_bfqq_must_not_expire(bfqq) &&
2805 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq);
2806 -+}
2807 -+
2808 -+/*
2809 -+ * Select a queue for service. If we have a current queue in service,
2810 -+ * check whether to continue servicing it, or retrieve and set a new one.
2811 -+ */
2812 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2813 -+{
2814 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
2815 -+ struct request *next_rq;
2816 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2817 -+
2818 -+ bfqq = bfqd->in_service_queue;
2819 -+ if (bfqq == NULL)
2820 -+ goto new_queue;
2821 -+
2822 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
2823 -+
2824 -+ /*
2825 -+ * If another queue has a request waiting within our mean seek
2826 -+ * distance, let it run. The expire code will check for close
2827 -+ * cooperators and put the close queue at the front of the
2828 -+ * service tree. If possible, merge the expiring queue with the
2829 -+ * new bfqq.
2830 -+ */
2831 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
2832 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
2833 -+ bfq_setup_merge(bfqq, new_bfqq);
2834 -+
2835 -+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
2836 -+ !timer_pending(&bfqd->idle_slice_timer) &&
2837 -+ !bfq_bfqq_must_idle(bfqq))
2838 -+ goto expire;
2839 -+
2840 -+ next_rq = bfqq->next_rq;
2841 -+ /*
2842 -+ * If bfqq has requests queued and it has enough budget left to
2843 -+ * serve them, keep the queue, otherwise expire it.
2844 -+ */
2845 -+ if (next_rq != NULL) {
2846 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
2847 -+ bfq_bfqq_budget_left(bfqq)) {
2848 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2849 -+ goto expire;
2850 -+ } else {
2851 -+ /*
2852 -+ * The idle timer may be pending because we may not
2853 -+ * disable disk idling even when a new request arrives
2854 -+ */
2855 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
2856 -+ /*
2857 -+ * If we get here: 1) at least one new request
2858 -+ * has arrived but we have not disabled the
2859 -+ * timer because the request was too small,
2860 -+ * 2) then the block layer has unplugged the
2861 -+ * device, causing the dispatch to be invoked.
2862 -+ *
2863 -+ * Since the device is unplugged, now the
2864 -+ * requests are probably large enough to
2865 -+ * provide a reasonable throughput.
2866 -+ * So we disable idling.
2867 -+ */
2868 -+ bfq_clear_bfqq_wait_request(bfqq);
2869 -+ del_timer(&bfqd->idle_slice_timer);
2870 -+ }
2871 -+ if (new_bfqq == NULL)
2872 -+ goto keep_queue;
2873 -+ else
2874 -+ goto expire;
2875 -+ }
2876 -+ }
2877 -+
2878 -+ /*
2879 -+ * No requests pending. If the in-service queue has no cooperator and
2880 -+ * still has requests in flight (possibly waiting for a completion)
2881 -+ * or is idling for a new request, then keep it.
2882 -+ */
2883 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
2884 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
2885 -+ bfqq = NULL;
2886 -+ goto keep_queue;
2887 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
2888 -+ /*
2889 -+ * Expiring the queue because there is a close cooperator,
2890 -+ * cancel timer.
2891 -+ */
2892 -+ bfq_clear_bfqq_wait_request(bfqq);
2893 -+ del_timer(&bfqd->idle_slice_timer);
2894 -+ }
2895 -+
2896 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2897 -+expire:
2898 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2899 -+new_queue:
2900 -+ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
2901 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
2902 -+ bfqq != NULL ? bfqq->pid : 0);
2903 -+keep_queue:
2904 -+ return bfqq;
2905 -+}
2906 -+
2907 -+static void bfq_update_raising_data(struct bfq_data *bfqd,
2908 -+ struct bfq_queue *bfqq)
2909 -+{
2910 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
2911 -+ struct bfq_entity *entity = &bfqq->entity;
2912 -+
2913 -+ bfq_log_bfqq(bfqd, bfqq,
2914 -+ "raising period dur %u/%u msec, "
2915 -+ "old raising coeff %u, w %d(%d)",
2916 -+ jiffies_to_msecs(jiffies -
2917 -+ bfqq->last_rais_start_finish),
2918 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
2919 -+ bfqq->raising_coeff,
2920 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
2921 -+
2922 -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
2923 -+ entity->orig_weight * bfqq->raising_coeff);
2924 -+ if (entity->ioprio_changed)
2925 -+ bfq_log_bfqq(bfqd, bfqq,
2926 -+ "WARN: pending prio change");
2927 -+ /*
2928 -+ * If too much time has elapsed from the beginning
2929 -+ * of this weight-raising, stop it.
2930 -+ */
2931 -+ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
2932 -+ bfqq->raising_cur_max_time)) {
2933 -+ bfqq->last_rais_start_finish = jiffies;
2934 -+ bfq_log_bfqq(bfqd, bfqq,
2935 -+ "wrais ending at %lu, "
2936 -+ "rais_max_time %u",
2937 -+ bfqq->last_rais_start_finish,
2938 -+ jiffies_to_msecs(bfqq->
2939 -+ raising_cur_max_time));
2940 -+ bfq_bfqq_end_raising(bfqq);
2941 -+ __bfq_entity_update_weight_prio(
2942 -+ bfq_entity_service_tree(entity),
2943 -+ entity);
2944 -+ }
2945 -+ }
2946 -+}
2947 -+
2948 -+/*
2949 -+ * Dispatch one request from bfqq, moving it to the request queue
2950 -+ * dispatch list.
2951 -+ */
2952 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
2953 -+ struct bfq_queue *bfqq)
2954 -+{
2955 -+ int dispatched = 0;
2956 -+ struct request *rq;
2957 -+ unsigned long service_to_charge;
2958 -+
2959 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2960 -+
2961 -+ /* Follow expired path, else get first next available. */
2962 -+ rq = bfq_check_fifo(bfqq);
2963 -+ if (rq == NULL)
2964 -+ rq = bfqq->next_rq;
2965 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
2966 -+
2967 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
2968 -+ /*
2969 -+ * This may happen if the next rq is chosen
2970 -+ * in fifo order instead of sector order.
2971 -+ * The budget is properly dimensioned
2972 -+ * to be always sufficient to serve the next request
2973 -+ * only if it is chosen in sector order. The reason is
2974 -+ * that it would be quite inefficient and of little use
2975 -+ * to always make sure that the budget is large enough
2976 -+ * to serve even the possible next rq in fifo order.
2977 -+ * In fact, requests are seldom served in fifo order.
2978 -+ *
2979 -+ * Expire the queue for budget exhaustion, and
2980 -+ * make sure that the next act_budget is enough
2981 -+ * to serve the next request, even if it comes
2982 -+ * from the fifo expired path.
2983 -+ */
2984 -+ bfqq->next_rq = rq;
2985 -+ /*
2986 -+ * Since this dispatch failed, make sure that
2987 -+ * a new one will be performed
2988 -+ */
2989 -+ if (!bfqd->rq_in_driver)
2990 -+ bfq_schedule_dispatch(bfqd);
2991 -+ goto expire;
2992 -+ }
2993 -+
2994 -+ /* Finally, insert request into driver dispatch list. */
2995 -+ bfq_bfqq_served(bfqq, service_to_charge);
2996 -+ bfq_dispatch_insert(bfqd->queue, rq);
2997 -+
2998 -+ bfq_update_raising_data(bfqd, bfqq);
2999 -+
3000 -+ bfq_log_bfqq(bfqd, bfqq,
3001 -+ "dispatched %u sec req (%llu), budg left %lu",
3002 -+ blk_rq_sectors(rq),
3003 -+ (long long unsigned)blk_rq_pos(rq),
3004 -+ bfq_bfqq_budget_left(bfqq));
3005 -+
3006 -+ dispatched++;
3007 -+
3008 -+ if (bfqd->in_service_bic == NULL) {
3009 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3010 -+ bfqd->in_service_bic = RQ_BIC(rq);
3011 -+ }
3012 -+
3013 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3014 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3015 -+ bfq_class_idle(bfqq)))
3016 -+ goto expire;
3017 -+
3018 -+ return dispatched;
3019 -+
3020 -+expire:
3021 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3022 -+ return dispatched;
3023 -+}
3024 -+
3025 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3026 -+{
3027 -+ int dispatched = 0;
3028 -+
3029 -+ while (bfqq->next_rq != NULL) {
3030 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3031 -+ dispatched++;
3032 -+ }
3033 -+
3034 -+ BUG_ON(!list_empty(&bfqq->fifo));
3035 -+ return dispatched;
3036 -+}
3037 -+
3038 -+/*
3039 -+ * Drain our current requests. Used for barriers and when switching
3040 -+ * io schedulers on-the-fly.
3041 -+ */
3042 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
3043 -+{
3044 -+ struct bfq_queue *bfqq, *n;
3045 -+ struct bfq_service_tree *st;
3046 -+ int dispatched = 0;
3047 -+
3048 -+ bfqq = bfqd->in_service_queue;
3049 -+ if (bfqq != NULL)
3050 -+ __bfq_bfqq_expire(bfqd, bfqq);
3051 -+
3052 -+ /*
3053 -+ * Loop through classes, and be careful to leave the scheduler
3054 -+ * in a consistent state, as feedback mechanisms and vtime
3055 -+ * updates cannot be disabled during the process.
3056 -+ */
3057 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3058 -+ st = bfq_entity_service_tree(&bfqq->entity);
3059 -+
3060 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3061 -+ bfqq->max_budget = bfq_max_budget(bfqd);
3062 -+
3063 -+ bfq_forget_idle(st);
3064 -+ }
3065 -+
3066 -+ BUG_ON(bfqd->busy_queues != 0);
3067 -+
3068 -+ return dispatched;
3069 -+}
3070 -+
3071 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
3072 -+{
3073 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3074 -+ struct bfq_queue *bfqq;
3075 -+ int max_dispatch;
3076 -+
3077 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3078 -+ if (bfqd->busy_queues == 0)
3079 -+ return 0;
3080 -+
3081 -+ if (unlikely(force))
3082 -+ return bfq_forced_dispatch(bfqd);
3083 -+
3084 -+ bfqq = bfq_select_queue(bfqd);
3085 -+ if (bfqq == NULL)
3086 -+ return 0;
3087 -+
3088 -+ max_dispatch = bfqd->bfq_quantum;
3089 -+ if (bfq_class_idle(bfqq))
3090 -+ max_dispatch = 1;
3091 -+
3092 -+ if (!bfq_bfqq_sync(bfqq))
3093 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
3094 -+
3095 -+ if (bfqq->dispatched >= max_dispatch) {
3096 -+ if (bfqd->busy_queues > 1)
3097 -+ return 0;
3098 -+ if (bfqq->dispatched >= 4 * max_dispatch)
3099 -+ return 0;
3100 -+ }
3101 -+
3102 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3103 -+ return 0;
3104 -+
3105 -+ bfq_clear_bfqq_wait_request(bfqq);
3106 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3107 -+
3108 -+ if (!bfq_dispatch_request(bfqd, bfqq))
3109 -+ return 0;
3110 -+
3111 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
3112 -+ bfqq->pid, max_dispatch);
3113 -+
3114 -+ return 1;
3115 -+}
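/*
 * Recap of the limits applied above: max_dispatch caps how many requests
 * of the selected queue may be under dispatch at once (bfq_quantum by
 * default, 1 for an idle-class queue, bfq_max_budget_async_rq for an
 * async queue); the cap may be exceeded, up to four times its value,
 * only when the queue is the sole busy one; and async queues are not
 * served at all while sync requests dispatched by BFQ are still in
 * flight.
 */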
3116 -+
3117 -+/*
3118 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
3119 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
3120 -+ *
3121 -+ * Queue lock must be held here.
3122 -+ */
3123 -+static void bfq_put_queue(struct bfq_queue *bfqq)
3124 -+{
3125 -+ struct bfq_data *bfqd = bfqq->bfqd;
3126 -+
3127 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3128 -+
3129 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3130 -+ atomic_read(&bfqq->ref));
3131 -+ if (!atomic_dec_and_test(&bfqq->ref))
3132 -+ return;
3133 -+
3134 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3135 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3136 -+ BUG_ON(bfqq->entity.tree != NULL);
3137 -+ BUG_ON(bfq_bfqq_busy(bfqq));
3138 -+ BUG_ON(bfqd->in_service_queue == bfqq);
3139 -+
3140 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3141 -+
3142 -+ kmem_cache_free(bfq_pool, bfqq);
3143 -+}
3144 -+
3145 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
3146 -+{
3147 -+ struct bfq_queue *__bfqq, *next;
3148 -+
3149 -+ /*
3150 -+ * If this queue was scheduled to merge with another queue, be
3151 -+ * sure to drop the reference taken on that queue (and others in
3152 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3153 -+ */
3154 -+ __bfqq = bfqq->new_bfqq;
3155 -+ while (__bfqq) {
3156 -+ if (__bfqq == bfqq) {
3157 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
3158 -+ break;
3159 -+ }
3160 -+ next = __bfqq->new_bfqq;
3161 -+ bfq_put_queue(__bfqq);
3162 -+ __bfqq = next;
3163 -+ }
3164 -+}
3165 -+
3166 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3167 -+{
3168 -+ if (bfqq == bfqd->in_service_queue) {
3169 -+ __bfq_bfqq_expire(bfqd, bfqq);
3170 -+ bfq_schedule_dispatch(bfqd);
3171 -+ }
3172 -+
3173 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3174 -+ atomic_read(&bfqq->ref));
3175 -+
3176 -+ bfq_put_cooperator(bfqq);
3177 -+
3178 -+ bfq_put_queue(bfqq);
3179 -+}
3180 -+
3181 -+static void bfq_init_icq(struct io_cq *icq)
3182 -+{
3183 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3184 -+
3185 -+ bic->ttime.last_end_request = jiffies;
3186 -+}
3187 -+
3188 -+static void bfq_exit_icq(struct io_cq *icq)
3189 -+{
3190 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3191 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
3192 -+
3193 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
3194 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3195 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
3196 -+ }
3197 -+
3198 -+ if (bic->bfqq[BLK_RW_SYNC]) {
3199 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3200 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
3201 -+ }
3202 -+}
3203 -+
3204 -+/*
3205 -+ * Update the entity prio values; note that the new values will not
3206 -+ * be used until the next (re)activation.
3207 -+ */
3208 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3209 -+{
3210 -+ struct task_struct *tsk = current;
3211 -+ int ioprio_class;
3212 -+
3213 -+ if (!bfq_bfqq_prio_changed(bfqq))
3214 -+ return;
3215 -+
3216 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3217 -+ switch (ioprio_class) {
3218 -+ default:
3219 -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3220 -+ "bfq: bad prio %x\n", ioprio_class);
3221 -+ case IOPRIO_CLASS_NONE:
3222 -+ /*
3223 -+ * No prio set, inherit CPU scheduling settings.
3224 -+ */
3225 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3226 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3227 -+ break;
3228 -+ case IOPRIO_CLASS_RT:
3229 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3230 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3231 -+ break;
3232 -+ case IOPRIO_CLASS_BE:
3233 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3234 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3235 -+ break;
3236 -+ case IOPRIO_CLASS_IDLE:
3237 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3238 -+ bfqq->entity.new_ioprio = 7;
3239 -+ bfq_clear_bfqq_idle_window(bfqq);
3240 -+ break;
3241 -+ }
3242 -+
3243 -+ bfqq->entity.ioprio_changed = 1;
3244 -+
3245 -+ /*
3246 -+ * Keep track of original prio settings in case we have to temporarily
3247 -+ * elevate the priority of this queue.
3248 -+ */
3249 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3250 -+ bfq_clear_bfqq_prio_changed(bfqq);
3251 -+}
3252 -+
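For reference, the bic->ioprio value decoded above packs the scheduling class and the per-class level into a single integer. A self-contained sketch of that decoding follows; the macro definitions are copied here only so the example compiles on its own, and are meant to mirror include/linux/ioprio.h:

#include <stdio.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_MASK	((1UL << IOPRIO_CLASS_SHIFT) - 1)
#define IOPRIO_PRIO_CLASS(mask)	((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask)	((mask) & IOPRIO_PRIO_MASK)
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

int main(void)
{
	int ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);	/* best-effort, level 4 */

	/* bfq_init_prio_data() switches on the class and keeps the level */
	printf("class=%d data=%d\n",
	       (int)IOPRIO_PRIO_CLASS(ioprio), (int)IOPRIO_PRIO_DATA(ioprio));
	return 0;
}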
3253 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3254 -+{
3255 -+ struct bfq_data *bfqd;
3256 -+ struct bfq_queue *bfqq, *new_bfqq;
3257 -+ struct bfq_group *bfqg;
3258 -+ unsigned long uninitialized_var(flags);
3259 -+ int ioprio = bic->icq.ioc->ioprio;
3260 -+
3261 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3262 -+ &flags);
3263 -+ /*
3264 -+ * This condition may trigger on a newly created bic; be sure to drop
3265 -+ * the lock before returning.
3266 -+ */
3267 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3268 -+ goto out;
3269 -+
3270 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
3271 -+ if (bfqq != NULL) {
3272 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3273 -+ sched_data);
3274 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3275 -+ GFP_ATOMIC);
3276 -+ if (new_bfqq != NULL) {
3277 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3278 -+ bfq_log_bfqq(bfqd, bfqq,
3279 -+ "changed_ioprio: bfqq %p %d",
3280 -+ bfqq, atomic_read(&bfqq->ref));
3281 -+ bfq_put_queue(bfqq);
3282 -+ }
3283 -+ }
3284 -+
3285 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
3286 -+ if (bfqq != NULL)
3287 -+ bfq_mark_bfqq_prio_changed(bfqq);
3288 -+
3289 -+ bic->ioprio = ioprio;
3290 -+
3291 -+out:
3292 -+ bfq_put_bfqd_unlock(bfqd, &flags);
3293 -+}
3294 -+
3295 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3296 -+ pid_t pid, int is_sync)
3297 -+{
3298 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3299 -+ INIT_LIST_HEAD(&bfqq->fifo);
3300 -+
3301 -+ atomic_set(&bfqq->ref, 0);
3302 -+ bfqq->bfqd = bfqd;
3303 -+
3304 -+ bfq_mark_bfqq_prio_changed(bfqq);
3305 -+
3306 -+ if (is_sync) {
3307 -+ if (!bfq_class_idle(bfqq))
3308 -+ bfq_mark_bfqq_idle_window(bfqq);
3309 -+ bfq_mark_bfqq_sync(bfqq);
3310 -+ }
3311 -+
3312 -+ /* Tentative initial value to trade off between thr and lat */
3313 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3314 -+ bfqq->pid = pid;
3315 -+
3316 -+ bfqq->raising_coeff = 1;
3317 -+ bfqq->last_rais_start_finish = 0;
3318 -+ /*
3319 -+ * Set to the value for which bfqq will not be deemed
3320 -+ * soft real-time when it becomes backlogged.
3321 -+ */
3322 -+ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
3323 -+}
3324 -+
3325 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3326 -+ struct bfq_group *bfqg,
3327 -+ int is_sync,
3328 -+ struct bfq_io_cq *bic,
3329 -+ gfp_t gfp_mask)
3330 -+{
3331 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3332 -+
3333 -+retry:
3334 -+ /* bic always exists here */
3335 -+ bfqq = bic_to_bfqq(bic, is_sync);
3336 -+
3337 -+ /*
3338 -+ * Always try a new alloc if we fall back to the OOM bfqq
3339 -+ * originally, since it should just be a temporary situation.
3340 -+ */
3341 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3342 -+ bfqq = NULL;
3343 -+ if (new_bfqq != NULL) {
3344 -+ bfqq = new_bfqq;
3345 -+ new_bfqq = NULL;
3346 -+ } else if (gfp_mask & __GFP_WAIT) {
3347 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3348 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3349 -+ gfp_mask | __GFP_ZERO,
3350 -+ bfqd->queue->node);
3351 -+ spin_lock_irq(bfqd->queue->queue_lock);
3352 -+ if (new_bfqq != NULL)
3353 -+ goto retry;
3354 -+ } else {
3355 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
3356 -+ gfp_mask | __GFP_ZERO,
3357 -+ bfqd->queue->node);
3358 -+ }
3359 -+
3360 -+ if (bfqq != NULL) {
3361 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3362 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
3363 -+ } else {
3364 -+ bfqq = &bfqd->oom_bfqq;
3365 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3366 -+ }
3367 -+
3368 -+ bfq_init_prio_data(bfqq, bic);
3369 -+ bfq_init_entity(&bfqq->entity, bfqg);
3370 -+ }
3371 -+
3372 -+ if (new_bfqq != NULL)
3373 -+ kmem_cache_free(bfq_pool, new_bfqq);
3374 -+
3375 -+ return bfqq;
3376 -+}
3377 -+
3378 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3379 -+ struct bfq_group *bfqg,
3380 -+ int ioprio_class, int ioprio)
3381 -+{
3382 -+ switch (ioprio_class) {
3383 -+ case IOPRIO_CLASS_RT:
3384 -+ return &bfqg->async_bfqq[0][ioprio];
3385 -+ case IOPRIO_CLASS_NONE:
3386 -+ ioprio = IOPRIO_NORM;
3387 -+ /* fall through */
3388 -+ case IOPRIO_CLASS_BE:
3389 -+ return &bfqg->async_bfqq[1][ioprio];
3390 -+ case IOPRIO_CLASS_IDLE:
3391 -+ return &bfqg->async_idle_bfqq;
3392 -+ default:
3393 -+ BUG();
3394 -+ }
3395 -+}
3396 -+
3397 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3398 -+ struct bfq_group *bfqg, int is_sync,
3399 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
3400 -+{
3401 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3402 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3403 -+ struct bfq_queue **async_bfqq = NULL;
3404 -+ struct bfq_queue *bfqq = NULL;
3405 -+
3406 -+ if (!is_sync) {
3407 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3408 -+ ioprio);
3409 -+ bfqq = *async_bfqq;
3410 -+ }
3411 -+
3412 -+ if (bfqq == NULL)
3413 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3414 -+
3415 -+ /*
3416 -+ * Pin the queue now that it's allocated; scheduler exit will prune it.
3417 -+ */
3418 -+ if (!is_sync && *async_bfqq == NULL) {
3419 -+ atomic_inc(&bfqq->ref);
3420 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3421 -+ bfqq, atomic_read(&bfqq->ref));
3422 -+ *async_bfqq = bfqq;
3423 -+ }
3424 -+
3425 -+ atomic_inc(&bfqq->ref);
3426 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3427 -+ atomic_read(&bfqq->ref));
3428 -+ return bfqq;
3429 -+}
3430 -+
3431 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3432 -+ struct bfq_io_cq *bic)
3433 -+{
3434 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3435 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3436 -+
3437 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3438 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3439 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3440 -+ bic->ttime.ttime_samples;
3441 -+}
3442 -+
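The think-time statistics updated above are a fixed-point exponential moving average: both the sample count and the accumulated time decay by 7/8 on every update, with 256 as the fixed-point scale and +128 rounding the final division. A small standalone sketch of the same arithmetic (names are illustrative; the real code additionally clamps each sample to twice bfq_slice_idle, as shown just above):

#include <stdio.h>

struct ttime_sketch {
	unsigned long samples;	/* scaled by 256 */
	unsigned long total;	/* scaled by 256 */
	unsigned long mean;
};

static void ttime_update(struct ttime_sketch *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * ttime) / 8;
	t->mean    = (t->total + 128) / t->samples;	/* rounded division */
}

int main(void)
{
	struct ttime_sketch t = { 0, 0, 0 };
	unsigned long samples[] = { 2, 2, 8, 2 };	/* think times, in jiffies */

	for (int i = 0; i < 4; i++) {
		ttime_update(&t, samples[i]);
		printf("mean after sample %d: %lu\n", i + 1, t.mean);
	}
	return 0;
}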
3443 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
3444 -+ struct bfq_queue *bfqq,
3445 -+ struct request *rq)
3446 -+{
3447 -+ sector_t sdist;
3448 -+ u64 total;
3449 -+
3450 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
3451 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3452 -+ else
3453 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3454 -+
3455 -+ /*
3456 -+ * Don't allow the seek distance to get too large from the
3457 -+ * odd fragment, pagein, etc.
3458 -+ */
3459 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3460 -+ sdist = 0;
3461 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
3462 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3463 -+ else
3464 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3465 -+
3466 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3467 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3468 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
3469 -+ do_div(total, bfqq->seek_samples);
3470 -+ bfqq->seek_mean = (sector_t)total;
3471 -+
3472 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3473 -+ (u64)bfqq->seek_mean);
3474 -+}
3475 -+
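Seek distances get the same 7/8 smoothing as the think time, with the extra clamping step shown above so that a single outlier cannot blow up the mean. A hedged standalone version of that update (structure and names are illustrative only):

#include <stdio.h>

typedef unsigned long long u64;

struct seek_sketch {
	unsigned int samples;	/* scaled by 256 */
	u64 total;		/* scaled by 256 */
	u64 mean;		/* sectors */
};

static u64 min_u64(u64 a, u64 b)
{
	return a < b ? a : b;
}

static void seek_update(struct seek_sketch *s, u64 sdist)
{
	if (s->samples == 0)		/* first request: not really a seek */
		sdist = 0;
	else if (s->samples <= 60)	/* second & third seek: generous clamp */
		sdist = min_u64(sdist, s->mean * 4 + 2*1024*1024);
	else				/* steady state: tighter clamp */
		sdist = min_u64(sdist, s->mean * 4 + 2*1024*64);

	s->samples = (7 * s->samples + 256) / 8;
	s->total   = (7 * s->total + 256 * sdist) / 8;
	s->mean    = (s->total + s->samples / 2) / s->samples;
}

int main(void)
{
	struct seek_sketch s = { 0, 0, 0 };
	u64 dists[] = { 0, 8, 1ULL << 20, 16 };

	for (int i = 0; i < 4; i++) {
		seek_update(&s, dists[i]);
		printf("seek mean: %llu sectors\n", s.mean);
	}
	return 0;
}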
3476 -+/*
3477 -+ * Disable idle window if the process thinks too long or seeks so much that
3478 -+ * it doesn't matter.
3479 -+ */
3480 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
3481 -+ struct bfq_queue *bfqq,
3482 -+ struct bfq_io_cq *bic)
3483 -+{
3484 -+ int enable_idle;
3485 -+
3486 -+ /* Don't idle for async or idle io prio class. */
3487 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3488 -+ return;
3489 -+
3490 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
3491 -+
3492 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3493 -+ bfqd->bfq_slice_idle == 0 ||
3494 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3495 -+ bfqq->raising_coeff == 1))
3496 -+ enable_idle = 0;
3497 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3498 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3499 -+ bfqq->raising_coeff == 1)
3500 -+ enable_idle = 0;
3501 -+ else
3502 -+ enable_idle = 1;
3503 -+ }
3504 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3505 -+ enable_idle);
3506 -+
3507 -+ if (enable_idle)
3508 -+ bfq_mark_bfqq_idle_window(bfqq);
3509 -+ else
3510 -+ bfq_clear_bfqq_idle_window(bfqq);
3511 -+}
3512 -+
3513 -+/*
3514 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
3515 -+ * something we should do about it.
3516 -+ */
3517 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3518 -+ struct request *rq)
3519 -+{
3520 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
3521 -+
3522 -+ if (rq->cmd_flags & REQ_META)
3523 -+ bfqq->meta_pending++;
3524 -+
3525 -+ bfq_update_io_thinktime(bfqd, bic);
3526 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
3527 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3528 -+ !BFQQ_SEEKY(bfqq))
3529 -+ bfq_update_idle_window(bfqd, bfqq, bic);
3530 -+
3531 -+ bfq_log_bfqq(bfqd, bfqq,
3532 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3533 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3534 -+ (long long unsigned)bfqq->seek_mean);
3535 -+
3536 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3537 -+
3538 -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3539 -+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3540 -+ blk_rq_sectors(rq) < 32;
3541 -+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3542 -+
3543 -+ /*
3544 -+ * There is just this request queued: if the request
3545 -+ * is small and the queue is not to be expired, then
3546 -+ * just exit.
3547 -+ *
3548 -+ * In this way, if the disk is being idled to wait for
3549 -+ * a new request from the in-service queue, we avoid
3550 -+ * unplugging the device and committing the disk to serve
3551 -+ * just a small request. Instead, we wait for
3552 -+ * the block layer to decide when to unplug the device:
3553 -+ * hopefully, new requests will be merged with this one
3554 -+ * quickly, then the device will be unplugged and
3555 -+ * larger requests will be dispatched.
3556 -+ */
3557 -+ if (small_req && !budget_timeout)
3558 -+ return;
3559 -+
3560 -+ /*
3561 -+ * A large enough request arrived, or the queue is to
3562 -+ * be expired: in both cases disk idling is to be
3563 -+ * stopped, so clear wait_request flag and reset
3564 -+ * timer.
3565 -+ */
3566 -+ bfq_clear_bfqq_wait_request(bfqq);
3567 -+ del_timer(&bfqd->idle_slice_timer);
3568 -+
3569 -+ /*
3570 -+ * The queue is not empty, because a new request just
3571 -+ * arrived. Hence we can safely expire the queue, in
3572 -+ * case of budget timeout, without risking that the
3573 -+ * timestamps of the queue are not updated correctly.
3574 -+ * See [1] for more details.
3575 -+ */
3576 -+ if (budget_timeout)
3577 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3578 -+
3579 -+ /*
3580 -+ * Let the request rip immediately, or let a new queue be
3581 -+ * selected if bfqq has just been expired.
3582 -+ */
3583 -+ __blk_run_queue(bfqd->queue);
3584 -+ }
3585 -+}
3586 -+
3587 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
3588 -+{
3589 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3590 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3591 -+
3592 -+ assert_spin_locked(bfqd->queue->queue_lock);
3593 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3594 -+
3595 -+ bfq_add_rq_rb(rq);
3596 -+
3597 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3598 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
3599 -+
3600 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
3601 -+}
3602 -+
3603 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
3604 -+{
3605 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3606 -+ bfqd->rq_in_driver);
3607 -+
3608 -+ if (bfqd->hw_tag == 1)
3609 -+ return;
3610 -+
3611 -+ /*
3612 -+ * This sample is valid if the number of outstanding requests
3613 -+ * is large enough to allow queueing behavior. Note that the
3614 -+ * sum is not exact, as it does not take deactivated
3615 -+ * requests into account.
3616 -+ */
3617 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3618 -+ return;
3619 -+
3620 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3621 -+ return;
3622 -+
3623 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3624 -+ bfqd->max_rq_in_driver = 0;
3625 -+ bfqd->hw_tag_samples = 0;
3626 -+}
3627 -+
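The hw_tag heuristic above infers whether the device actually queues commands: samples only count when enough requests are outstanding, and after a fixed number of valid samples the peak in-driver depth seen decides the flag. A standalone sketch of that logic; the two constants stand in for BFQ_HW_QUEUE_THRESHOLD and BFQ_HW_QUEUE_SAMPLES, and their values here are placeholders rather than the patch's own:

#include <stdio.h>

#define HW_QUEUE_THRESHOLD	4	/* placeholder for BFQ_HW_QUEUE_THRESHOLD */
#define HW_QUEUE_SAMPLES	32	/* placeholder for BFQ_HW_QUEUE_SAMPLES */

struct hw_tag_sketch {
	int hw_tag;		/* -1 = unknown, 0 = no queueing, 1 = queueing */
	int hw_tag_samples;
	int max_rq_in_driver;
};

static void hw_tag_update(struct hw_tag_sketch *d, int rq_in_driver, int queued)
{
	if (rq_in_driver > d->max_rq_in_driver)
		d->max_rq_in_driver = rq_in_driver;

	if (d->hw_tag == 1)
		return;				/* already decided */

	/* only count samples taken under enough load to be meaningful */
	if (rq_in_driver + queued < HW_QUEUE_THRESHOLD)
		return;

	if (++d->hw_tag_samples < HW_QUEUE_SAMPLES)
		return;

	d->hw_tag = d->max_rq_in_driver > HW_QUEUE_THRESHOLD;
	d->max_rq_in_driver = 0;
	d->hw_tag_samples = 0;
}

int main(void)
{
	struct hw_tag_sketch d = { -1, 0, 0 };

	for (int i = 0; i < 40; i++)
		hw_tag_update(&d, 8, 2);	/* deep queue on every completion */
	printf("hw_tag = %d\n", d.hw_tag);	/* 1: device is queueing */
	return 0;
}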
3628 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
3629 -+{
3630 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3631 -+ struct bfq_data *bfqd = bfqq->bfqd;
3632 -+ const int sync = rq_is_sync(rq);
3633 -+
3634 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3635 -+ blk_rq_sectors(rq), sync);
3636 -+
3637 -+ bfq_update_hw_tag(bfqd);
3638 -+
3639 -+ WARN_ON(!bfqd->rq_in_driver);
3640 -+ WARN_ON(!bfqq->dispatched);
3641 -+ bfqd->rq_in_driver--;
3642 -+ bfqq->dispatched--;
3643 -+
3644 -+ if (bfq_bfqq_sync(bfqq))
3645 -+ bfqd->sync_flight--;
3646 -+
3647 -+ if (sync)
3648 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3649 -+
3650 -+ /*
3651 -+ * If we are waiting to discover whether the request pattern of the
3652 -+ * task associated with the queue is actually isochronous, and
3653 -+ * both requisites for this condition to hold are satisfied, then
3654 -+ * compute soft_rt_next_start (see the comments to the function
3655 -+ * bfq_bfqq_softrt_next_start()).
3656 -+ */
3657 -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
3658 -+ RB_EMPTY_ROOT(&bfqq->sort_list))
3659 -+ bfqq->soft_rt_next_start =
3660 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3661 -+
3662 -+ /*
3663 -+ * If this is the in-service queue, check if it needs to be expired,
3664 -+ * or if we want to idle in case it has no pending requests.
3665 -+ */
3666 -+ if (bfqd->in_service_queue == bfqq) {
3667 -+ if (bfq_bfqq_budget_new(bfqq))
3668 -+ bfq_set_budget_timeout(bfqd);
3669 -+
3670 -+ if (bfq_bfqq_must_idle(bfqq)) {
3671 -+ bfq_arm_slice_timer(bfqd);
3672 -+ goto out;
3673 -+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
3674 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3675 -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3676 -+ (bfqq->dispatched == 0 ||
3677 -+ !bfq_bfqq_must_not_expire(bfqq)))
3678 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
3679 -+ BFQ_BFQQ_NO_MORE_REQUESTS);
3680 -+ }
3681 -+
3682 -+ if (!bfqd->rq_in_driver)
3683 -+ bfq_schedule_dispatch(bfqd);
3684 -+
3685 -+out:
3686 -+ return;
3687 -+}
3688 -+
3689 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3690 -+{
3691 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3692 -+ bfq_clear_bfqq_must_alloc(bfqq);
3693 -+ return ELV_MQUEUE_MUST;
3694 -+ }
3695 -+
3696 -+ return ELV_MQUEUE_MAY;
3697 -+}
3698 -+
3699 -+static int bfq_may_queue(struct request_queue *q, int rw)
3700 -+{
3701 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3702 -+ struct task_struct *tsk = current;
3703 -+ struct bfq_io_cq *bic;
3704 -+ struct bfq_queue *bfqq;
3705 -+
3706 -+ /*
3707 -+ * Don't force setup of a queue from here, as a call to may_queue
3708 -+ * does not necessarily imply that a request actually will be queued.
3709 -+ * So just look up a possibly existing queue, or return 'may queue'
3710 -+ * if that fails.
3711 -+ */
3712 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3713 -+ if (bic == NULL)
3714 -+ return ELV_MQUEUE_MAY;
3715 -+
3716 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3717 -+ if (bfqq != NULL) {
3718 -+ bfq_init_prio_data(bfqq, bic);
3719 -+
3720 -+ return __bfq_may_queue(bfqq);
3721 -+ }
3722 -+
3723 -+ return ELV_MQUEUE_MAY;
3724 -+}
3725 -+
3726 -+/*
3727 -+ * Queue lock held here.
3728 -+ */
3729 -+static void bfq_put_request(struct request *rq)
3730 -+{
3731 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3732 -+
3733 -+ if (bfqq != NULL) {
3734 -+ const int rw = rq_data_dir(rq);
3735 -+
3736 -+ BUG_ON(!bfqq->allocated[rw]);
3737 -+ bfqq->allocated[rw]--;
3738 -+
3739 -+ rq->elv.priv[0] = NULL;
3740 -+ rq->elv.priv[1] = NULL;
3741 -+
3742 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3743 -+ bfqq, atomic_read(&bfqq->ref));
3744 -+ bfq_put_queue(bfqq);
3745 -+ }
3746 -+}
3747 -+
3748 -+static struct bfq_queue *
3749 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3750 -+ struct bfq_queue *bfqq)
3751 -+{
3752 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3753 -+ (long unsigned)bfqq->new_bfqq->pid);
3754 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3755 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3756 -+ bfq_put_queue(bfqq);
3757 -+ return bic_to_bfqq(bic, 1);
3758 -+}
3759 -+
3760 -+/*
3761 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3762 -+ * was the last process referring to said bfqq.
3763 -+ */
3764 -+static struct bfq_queue *
3765 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3766 -+{
3767 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3768 -+ if (bfqq_process_refs(bfqq) == 1) {
3769 -+ bfqq->pid = current->pid;
3770 -+ bfq_clear_bfqq_coop(bfqq);
3771 -+ bfq_clear_bfqq_split_coop(bfqq);
3772 -+ return bfqq;
3773 -+ }
3774 -+
3775 -+ bic_set_bfqq(bic, NULL, 1);
3776 -+
3777 -+ bfq_put_cooperator(bfqq);
3778 -+
3779 -+ bfq_put_queue(bfqq);
3780 -+ return NULL;
3781 -+}
3782 -+
3783 -+/*
3784 -+ * Allocate bfq data structures associated with this request.
3785 -+ */
3786 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
3787 -+ struct bio *bio, gfp_t gfp_mask)
3788 -+{
3789 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3790 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3791 -+ const int rw = rq_data_dir(rq);
3792 -+ const int is_sync = rq_is_sync(rq);
3793 -+ struct bfq_queue *bfqq;
3794 -+ struct bfq_group *bfqg;
3795 -+ unsigned long flags;
3796 -+
3797 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
3798 -+
3799 -+ bfq_changed_ioprio(bic);
3800 -+
3801 -+ spin_lock_irqsave(q->queue_lock, flags);
3802 -+
3803 -+ if (bic == NULL)
3804 -+ goto queue_fail;
3805 -+
3806 -+ bfqg = bfq_bic_update_cgroup(bic);
3807 -+
3808 -+new_queue:
3809 -+ bfqq = bic_to_bfqq(bic, is_sync);
3810 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3811 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3812 -+ bic_set_bfqq(bic, bfqq, is_sync);
3813 -+ } else {
3814 -+ /*
3815 -+ * If the queue was seeky for too long, break it apart.
3816 -+ */
3817 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
3818 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
3819 -+ bfqq = bfq_split_bfqq(bic, bfqq);
3820 -+ if (!bfqq)
3821 -+ goto new_queue;
3822 -+ }
3823 -+
3824 -+ /*
3825 -+ * Check to see if this queue is scheduled to merge with
3826 -+ * another closely cooperating queue. The merging of queues
3827 -+ * happens here as it must be done in process context.
3828 -+ * The reference on new_bfqq was taken in merge_bfqqs.
3829 -+ */
3830 -+ if (bfqq->new_bfqq != NULL)
3831 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
3832 -+ }
3833 -+
3834 -+ bfqq->allocated[rw]++;
3835 -+ atomic_inc(&bfqq->ref);
3836 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
3837 -+ atomic_read(&bfqq->ref));
3838 -+
3839 -+ rq->elv.priv[0] = bic;
3840 -+ rq->elv.priv[1] = bfqq;
3841 -+
3842 -+ spin_unlock_irqrestore(q->queue_lock, flags);
3843 -+
3844 -+ return 0;
3845 -+
3846 -+queue_fail:
3847 -+ bfq_schedule_dispatch(bfqd);
3848 -+ spin_unlock_irqrestore(q->queue_lock, flags);
3849 -+
3850 -+ return 1;
3851 -+}
3852 -+
3853 -+static void bfq_kick_queue(struct work_struct *work)
3854 -+{
3855 -+ struct bfq_data *bfqd =
3856 -+ container_of(work, struct bfq_data, unplug_work);
3857 -+ struct request_queue *q = bfqd->queue;
3858 -+
3859 -+ spin_lock_irq(q->queue_lock);
3860 -+ __blk_run_queue(q);
3861 -+ spin_unlock_irq(q->queue_lock);
3862 -+}
3863 -+
3864 -+/*
3865 -+ * Handler of the expiration of the timer running if the in-service queue
3866 -+ * is idling inside its time slice.
3867 -+ */
3868 -+static void bfq_idle_slice_timer(unsigned long data)
3869 -+{
3870 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
3871 -+ struct bfq_queue *bfqq;
3872 -+ unsigned long flags;
3873 -+ enum bfqq_expiration reason;
3874 -+
3875 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
3876 -+
3877 -+ bfqq = bfqd->in_service_queue;
3878 -+ /*
3879 -+ * Theoretical race here: the in-service queue can be NULL or
3880 -+ * different from the queue that was idling if, while the timer
3881 -+ * handler spins on the queue_lock, a new request arrives for
3882 -+ * the current queue and a full dispatch cycle changes the
3883 -+ * in-service queue. This can hardly happen, but in the worst
3884 -+ * case we just expire a queue too early.
3885 -+ */
3886 -+ if (bfqq != NULL) {
3887 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
3888 -+ if (bfq_bfqq_budget_timeout(bfqq))
3889 -+ /*
3890 -+ * Also here the queue can be safely expired
3891 -+ * for budget timeout without wasting
3892 -+ * guarantees
3893 -+ */
3894 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3895 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3896 -+ /*
3897 -+ * The queue may not be empty upon timer expiration,
3898 -+ * because we may not disable the timer when the first
3899 -+ * request of the in-service queue arrives during
3900 -+ * disk idling
3901 -+ */
3902 -+ reason = BFQ_BFQQ_TOO_IDLE;
3903 -+ else
3904 -+ goto schedule_dispatch;
3905 -+
3906 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
3907 -+ }
3908 -+
3909 -+schedule_dispatch:
3910 -+ bfq_schedule_dispatch(bfqd);
3911 -+
3912 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
3913 -+}
3914 -+
3915 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
3916 -+{
3917 -+ del_timer_sync(&bfqd->idle_slice_timer);
3918 -+ cancel_work_sync(&bfqd->unplug_work);
3919 -+}
3920 -+
3921 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3922 -+ struct bfq_queue **bfqq_ptr)
3923 -+{
3924 -+ struct bfq_group *root_group = bfqd->root_group;
3925 -+ struct bfq_queue *bfqq = *bfqq_ptr;
3926 -+
3927 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3928 -+ if (bfqq != NULL) {
3929 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
3930 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3931 -+ bfqq, atomic_read(&bfqq->ref));
3932 -+ bfq_put_queue(bfqq);
3933 -+ *bfqq_ptr = NULL;
3934 -+ }
3935 -+}
3936 -+
3937 -+/*
3938 -+ * Release all the bfqg references to its async queues. If we are
3939 -+ * deallocating the group, these queues may still contain requests, so
3940 -+ * we reparent them to the root cgroup (i.e., the only one that will
3941 -+ * exist for sure until all the requests on a device are gone).
3942 -+ */
3943 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3944 -+{
3945 -+ int i, j;
3946 -+
3947 -+ for (i = 0; i < 2; i++)
3948 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
3949 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3950 -+
3951 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3952 -+}
3953 -+
3954 -+static void bfq_exit_queue(struct elevator_queue *e)
3955 -+{
3956 -+ struct bfq_data *bfqd = e->elevator_data;
3957 -+ struct request_queue *q = bfqd->queue;
3958 -+ struct bfq_queue *bfqq, *n;
3959 -+
3960 -+ bfq_shutdown_timer_wq(bfqd);
3961 -+
3962 -+ spin_lock_irq(q->queue_lock);
3963 -+
3964 -+ BUG_ON(bfqd->in_service_queue != NULL);
3965 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3966 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
3967 -+
3968 -+ bfq_disconnect_groups(bfqd);
3969 -+ spin_unlock_irq(q->queue_lock);
3970 -+
3971 -+ bfq_shutdown_timer_wq(bfqd);
3972 -+
3973 -+ synchronize_rcu();
3974 -+
3975 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3976 -+
3977 -+ bfq_free_root_group(bfqd);
3978 -+ kfree(bfqd);
3979 -+}
3980 -+
3981 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
3982 -+{
3983 -+ struct bfq_group *bfqg;
3984 -+ struct bfq_data *bfqd;
3985 -+ struct elevator_queue *eq;
3986 -+
3987 -+ eq = elevator_alloc(q, e);
3988 -+ if (eq == NULL)
3989 -+ return -ENOMEM;
3990 -+
3991 -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
3992 -+ if (bfqd == NULL) {
3993 -+ kobject_put(&eq->kobj);
3994 -+ return -ENOMEM;
3995 -+ }
3996 -+ eq->elevator_data = bfqd;
3997 -+
3998 -+ /*
3999 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4000 -+ * Grab a permanent reference to it, so that the normal code flow
4001 -+ * will not attempt to free it.
4002 -+ */
4003 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
4004 -+ atomic_inc(&bfqd->oom_bfqq.ref);
4005 -+
4006 -+ bfqd->queue = q;
4007 -+
4008 -+ spin_lock_irq(q->queue_lock);
4009 -+ q->elevator = eq;
4010 -+ spin_unlock_irq(q->queue_lock);
4011 -+
4012 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
4013 -+ if (bfqg == NULL) {
4014 -+ kfree(bfqd);
4015 -+ kobject_put(&eq->kobj);
4016 -+ return -ENOMEM;
4017 -+ }
4018 -+
4019 -+ bfqd->root_group = bfqg;
4020 -+
4021 -+ init_timer(&bfqd->idle_slice_timer);
4022 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4023 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4024 -+
4025 -+ bfqd->rq_pos_tree = RB_ROOT;
4026 -+
4027 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4028 -+
4029 -+ INIT_LIST_HEAD(&bfqd->active_list);
4030 -+ INIT_LIST_HEAD(&bfqd->idle_list);
4031 -+
4032 -+ bfqd->hw_tag = -1;
4033 -+
4034 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
4035 -+
4036 -+ bfqd->bfq_quantum = bfq_quantum;
4037 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4038 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4039 -+ bfqd->bfq_back_max = bfq_back_max;
4040 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
4041 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
4042 -+ bfqd->bfq_class_idle_last_service = 0;
4043 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4044 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4045 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4046 -+
4047 -+ bfqd->low_latency = true;
4048 -+
4049 -+ bfqd->bfq_raising_coeff = 20;
4050 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4051 -+ bfqd->bfq_raising_max_time = 0;
4052 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4053 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4054 -+ bfqd->bfq_raising_max_softrt_rate = 7000; /*
4055 -+ * Approximate rate required
4056 -+ * to play back or record a
4057 -+ * high-definition compressed
4058 -+ * video.
4059 -+ */
4060 -+ bfqd->raised_busy_queues = 0;
4061 -+
4062 -+ /* Initially estimate the device's peak rate as the reference rate */
4063 -+ if (blk_queue_nonrot(bfqd->queue)) {
4064 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
4065 -+ bfqd->peak_rate = R_nonrot;
4066 -+ } else {
4067 -+ bfqd->RT_prod = R_rot * T_rot;
4068 -+ bfqd->peak_rate = R_rot;
4069 -+ }
4070 -+
4071 -+ return 0;
4072 -+}
4073 -+
4074 -+static void bfq_slab_kill(void)
4075 -+{
4076 -+ if (bfq_pool != NULL)
4077 -+ kmem_cache_destroy(bfq_pool);
4078 -+}
4079 -+
4080 -+static int __init bfq_slab_setup(void)
4081 -+{
4082 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4083 -+ if (bfq_pool == NULL)
4084 -+ return -ENOMEM;
4085 -+ return 0;
4086 -+}
4087 -+
4088 -+static ssize_t bfq_var_show(unsigned int var, char *page)
4089 -+{
4090 -+ return sprintf(page, "%d\n", var);
4091 -+}
4092 -+
4093 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4094 -+{
4095 -+ unsigned long new_val;
4096 -+ int ret = kstrtoul(page, 10, &new_val);
4097 -+
4098 -+ if (ret == 0)
4099 -+ *var = new_val;
4100 -+
4101 -+ return count;
4102 -+}
4103 -+
4104 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4105 -+{
4106 -+ struct bfq_data *bfqd = e->elevator_data;
4107 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4108 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4109 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4110 -+}
4111 -+
4112 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4113 -+{
4114 -+ struct bfq_queue *bfqq;
4115 -+ struct bfq_data *bfqd = e->elevator_data;
4116 -+ ssize_t num_char = 0;
4117 -+
4118 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4119 -+ bfqd->queued);
4120 -+
4121 -+ spin_lock_irq(bfqd->queue->queue_lock);
4122 -+
4123 -+ num_char += sprintf(page + num_char, "Active:\n");
4124 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4125 -+ num_char += sprintf(page + num_char,
4126 -+ "pid%d: weight %hu, nr_queued %d %d,"
4127 -+ " dur %d/%u\n",
4128 -+ bfqq->pid,
4129 -+ bfqq->entity.weight,
4130 -+ bfqq->queued[0],
4131 -+ bfqq->queued[1],
4132 -+ jiffies_to_msecs(jiffies -
4133 -+ bfqq->last_rais_start_finish),
4134 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4135 -+ }
4136 -+
4137 -+ num_char += sprintf(page + num_char, "Idle:\n");
4138 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4139 -+ num_char += sprintf(page + num_char,
4140 -+ "pid%d: weight %hu, dur %d/%u\n",
4141 -+ bfqq->pid,
4142 -+ bfqq->entity.weight,
4143 -+ jiffies_to_msecs(jiffies -
4144 -+ bfqq->last_rais_start_finish),
4145 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4146 -+ }
4147 -+
4148 -+ spin_unlock_irq(bfqd->queue->queue_lock);
4149 -+
4150 -+ return num_char;
4151 -+}
4152 -+
4153 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4154 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4155 -+{ \
4156 -+ struct bfq_data *bfqd = e->elevator_data; \
4157 -+ unsigned int __data = __VAR; \
4158 -+ if (__CONV) \
4159 -+ __data = jiffies_to_msecs(__data); \
4160 -+ return bfq_var_show(__data, (page)); \
4161 -+}
4162 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4163 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4164 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4165 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4166 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4167 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4168 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4169 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4170 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4171 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4172 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4173 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4174 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4175 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4176 -+ 1);
4177 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4178 -+ bfqd->bfq_raising_min_inter_arr_async,
4179 -+ 1);
4180 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4181 -+ bfqd->bfq_raising_max_softrt_rate, 0);
4182 -+#undef SHOW_FUNCTION
4183 -+
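The SHOW_FUNCTION macro above stamps out one sysfs accessor per tunable, differing only in the field read and in whether a jiffies-to-milliseconds conversion applies. A tiny userspace toy with the same shape, just to make the expansion visible (field names and the unit conversion are stand-ins, not the patch's):

#include <stdio.h>

struct params { unsigned int quantum; unsigned int slice_idle; };

#define SHOW_FUNCTION(__FUNC, __EXPR, __CONV)				\
static unsigned int __FUNC(const struct params *p)			\
{									\
	unsigned int __data = (__EXPR);					\
	if (__CONV)							\
		__data *= 10;	/* stand-in for jiffies_to_msecs() */	\
	return __data;							\
}
SHOW_FUNCTION(show_quantum, p->quantum, 0)
SHOW_FUNCTION(show_slice_idle, p->slice_idle, 1)
#undef SHOW_FUNCTION

int main(void)
{
	struct params p = { .quantum = 4, .slice_idle = 8 };

	printf("quantum=%u slice_idle_ms=%u\n",
	       show_quantum(&p), show_slice_idle(&p));
	return 0;
}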
4184 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4185 -+static ssize_t \
4186 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4187 -+{ \
4188 -+ struct bfq_data *bfqd = e->elevator_data; \
4189 -+ unsigned long uninitialized_var(__data); \
4190 -+ int ret = bfq_var_store(&__data, (page), count); \
4191 -+ if (__data < (MIN)) \
4192 -+ __data = (MIN); \
4193 -+ else if (__data > (MAX)) \
4194 -+ __data = (MAX); \
4195 -+ if (__CONV) \
4196 -+ *(__PTR) = msecs_to_jiffies(__data); \
4197 -+ else \
4198 -+ *(__PTR) = __data; \
4199 -+ return ret; \
4200 -+}
4201 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4202 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4203 -+ INT_MAX, 1);
4204 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4205 -+ INT_MAX, 1);
4206 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4207 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4208 -+ INT_MAX, 0);
4209 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4210 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4211 -+ 1, INT_MAX, 0);
4212 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4213 -+ INT_MAX, 1);
4214 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4215 -+ INT_MAX, 0);
4216 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4217 -+ INT_MAX, 1);
4218 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4219 -+ INT_MAX, 1);
4220 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
4221 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4222 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4223 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4224 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4225 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4226 -+#undef STORE_FUNCTION
4227 -+
4228 -+/* do nothing for the moment */
4229 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
4230 -+ const char *page, size_t count)
4231 -+{
4232 -+ return count;
4233 -+}
4234 -+
4235 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4236 -+{
4237 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4238 -+
4239 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4240 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4241 -+ else
4242 -+ return bfq_default_max_budget;
4243 -+}
4244 -+
4245 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4246 -+ const char *page, size_t count)
4247 -+{
4248 -+ struct bfq_data *bfqd = e->elevator_data;
4249 -+ unsigned long uninitialized_var(__data);
4250 -+ int ret = bfq_var_store(&__data, (page), count);
4251 -+
4252 -+ if (__data == 0)
4253 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4254 -+ else {
4255 -+ if (__data > INT_MAX)
4256 -+ __data = INT_MAX;
4257 -+ bfqd->bfq_max_budget = __data;
4258 -+ }
4259 -+
4260 -+ bfqd->bfq_user_max_budget = __data;
4261 -+
4262 -+ return ret;
4263 -+}
4264 -+
4265 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4266 -+ const char *page, size_t count)
4267 -+{
4268 -+ struct bfq_data *bfqd = e->elevator_data;
4269 -+ unsigned long uninitialized_var(__data);
4270 -+ int ret = bfq_var_store(&__data, (page), count);
4271 -+
4272 -+ if (__data < 1)
4273 -+ __data = 1;
4274 -+ else if (__data > INT_MAX)
4275 -+ __data = INT_MAX;
4276 -+
4277 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4278 -+ if (bfqd->bfq_user_max_budget == 0)
4279 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4280 -+
4281 -+ return ret;
4282 -+}
4283 -+
4284 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4285 -+ const char *page, size_t count)
4286 -+{
4287 -+ struct bfq_data *bfqd = e->elevator_data;
4288 -+ unsigned long uninitialized_var(__data);
4289 -+ int ret = bfq_var_store(&__data, (page), count);
4290 -+
4291 -+ if (__data > 1)
4292 -+ __data = 1;
4293 -+ if (__data == 0 && bfqd->low_latency != 0)
4294 -+ bfq_end_raising(bfqd);
4295 -+ bfqd->low_latency = __data;
4296 -+
4297 -+ return ret;
4298 -+}
4299 -+
4300 -+#define BFQ_ATTR(name) \
4301 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4302 -+
4303 -+static struct elv_fs_entry bfq_attrs[] = {
4304 -+ BFQ_ATTR(quantum),
4305 -+ BFQ_ATTR(fifo_expire_sync),
4306 -+ BFQ_ATTR(fifo_expire_async),
4307 -+ BFQ_ATTR(back_seek_max),
4308 -+ BFQ_ATTR(back_seek_penalty),
4309 -+ BFQ_ATTR(slice_idle),
4310 -+ BFQ_ATTR(max_budget),
4311 -+ BFQ_ATTR(max_budget_async_rq),
4312 -+ BFQ_ATTR(timeout_sync),
4313 -+ BFQ_ATTR(timeout_async),
4314 -+ BFQ_ATTR(low_latency),
4315 -+ BFQ_ATTR(raising_coeff),
4316 -+ BFQ_ATTR(raising_max_time),
4317 -+ BFQ_ATTR(raising_rt_max_time),
4318 -+ BFQ_ATTR(raising_min_idle_time),
4319 -+ BFQ_ATTR(raising_min_inter_arr_async),
4320 -+ BFQ_ATTR(raising_max_softrt_rate),
4321 -+ BFQ_ATTR(weights),
4322 -+ __ATTR_NULL
4323 -+};
4324 -+
4325 -+static struct elevator_type iosched_bfq = {
4326 -+ .ops = {
4327 -+ .elevator_merge_fn = bfq_merge,
4328 -+ .elevator_merged_fn = bfq_merged_request,
4329 -+ .elevator_merge_req_fn = bfq_merged_requests,
4330 -+ .elevator_allow_merge_fn = bfq_allow_merge,
4331 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
4332 -+ .elevator_add_req_fn = bfq_insert_request,
4333 -+ .elevator_activate_req_fn = bfq_activate_request,
4334 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
4335 -+ .elevator_completed_req_fn = bfq_completed_request,
4336 -+ .elevator_former_req_fn = elv_rb_former_request,
4337 -+ .elevator_latter_req_fn = elv_rb_latter_request,
4338 -+ .elevator_init_icq_fn = bfq_init_icq,
4339 -+ .elevator_exit_icq_fn = bfq_exit_icq,
4340 -+ .elevator_set_req_fn = bfq_set_request,
4341 -+ .elevator_put_req_fn = bfq_put_request,
4342 -+ .elevator_may_queue_fn = bfq_may_queue,
4343 -+ .elevator_init_fn = bfq_init_queue,
4344 -+ .elevator_exit_fn = bfq_exit_queue,
4345 -+ },
4346 -+ .icq_size = sizeof(struct bfq_io_cq),
4347 -+ .icq_align = __alignof__(struct bfq_io_cq),
4348 -+ .elevator_attrs = bfq_attrs,
4349 -+ .elevator_name = "bfq",
4350 -+ .elevator_owner = THIS_MODULE,
4351 -+};
4352 -+
4353 -+static int __init bfq_init(void)
4354 -+{
4355 -+ /*
4356 -+ * Can be 0 on HZ < 1000 setups.
4357 -+ */
4358 -+ if (bfq_slice_idle == 0)
4359 -+ bfq_slice_idle = 1;
4360 -+
4361 -+ if (bfq_timeout_async == 0)
4362 -+ bfq_timeout_async = 1;
4363 -+
4364 -+ if (bfq_slab_setup())
4365 -+ return -ENOMEM;
4366 -+
4367 -+ elv_register(&iosched_bfq);
4368 -+ pr_info("BFQ I/O-scheduler version: v7r2");
4369 -+
4370 -+ return 0;
4371 -+}
4372 -+
4373 -+static void __exit bfq_exit(void)
4374 -+{
4375 -+ elv_unregister(&iosched_bfq);
4376 -+ bfq_slab_kill();
4377 -+}
4378 -+
4379 -+module_init(bfq_init);
4380 -+module_exit(bfq_exit);
4381 -+
4382 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4383 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4384 -new file mode 100644
4385 -index 0000000..999b475
4386 ---- /dev/null
4387 -+++ b/block/bfq-sched.c
4388 -@@ -0,0 +1,1078 @@
4389 -+/*
4390 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
4391 -+ *
4392 -+ * Based on ideas and code from CFQ:
4393 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4394 -+ *
4395 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4396 -+ * Paolo Valente <paolo.valente@×××××××.it>
4397 -+ *
4398 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4399 -+ */
4400 -+
4401 -+#ifdef CONFIG_CGROUP_BFQIO
4402 -+#define for_each_entity(entity) \
4403 -+ for (; entity != NULL; entity = entity->parent)
4404 -+
4405 -+#define for_each_entity_safe(entity, parent) \
4406 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4407 -+
4408 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4409 -+ int extract,
4410 -+ struct bfq_data *bfqd);
4411 -+
4412 -+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
4413 -+{
4414 -+ struct bfq_entity *bfqg_entity;
4415 -+ struct bfq_group *bfqg;
4416 -+ struct bfq_sched_data *group_sd;
4417 -+
4418 -+ BUG_ON(next_in_service == NULL);
4419 -+
4420 -+ group_sd = next_in_service->sched_data;
4421 -+
4422 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4423 -+ /*
4424 -+ * bfq_group's my_entity field is not NULL only if the group
4425 -+ * is not the root group. We must not touch the root entity
4426 -+ * as it must never become an in-service entity.
4427 -+ */
4428 -+ bfqg_entity = bfqg->my_entity;
4429 -+ if (bfqg_entity != NULL)
4430 -+ bfqg_entity->budget = next_in_service->budget;
4431 -+}
4432 -+
4433 -+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
4434 -+{
4435 -+ struct bfq_entity *next_in_service;
4436 -+
4437 -+ if (sd->in_service_entity != NULL)
4438 -+ /* will update/requeue at the end of service */
4439 -+ return 0;
4440 -+
4441 -+ /*
4442 -+ * NOTE: this can be improved in many ways, such as returning
4443 -+ * 1 (and thus propagating the update upwards) only when the
4444 -+ * budget changes, or caching the bfqq that will be scheduled
4445 -+ * next from this subtree. For now we worry more about
4446 -+ * correctness than about performance...
4447 -+ */
4448 -+ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
4449 -+ sd->next_in_service = next_in_service;
4450 -+
4451 -+ if (next_in_service != NULL)
4452 -+ bfq_update_budget(next_in_service);
4453 -+
4454 -+ return 1;
4455 -+}
4456 -+
4457 -+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
4458 -+ struct bfq_entity *entity)
4459 -+{
4460 -+ BUG_ON(sd->next_in_service != entity);
4461 -+}
4462 -+#else
4463 -+#define for_each_entity(entity) \
4464 -+ for (; entity != NULL; entity = NULL)
4465 -+
4466 -+#define for_each_entity_safe(entity, parent) \
4467 -+ for (parent = NULL; entity != NULL; entity = parent)
4468 -+
4469 -+static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
4470 -+{
4471 -+ return 0;
4472 -+}
4473 -+
4474 -+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
4475 -+ struct bfq_entity *entity)
4476 -+{
4477 -+}
4478 -+
4479 -+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
4480 -+{
4481 -+}
4482 -+#endif
4483 -+
4484 -+/*
4485 -+ * Shift for timestamp calculations. This actually limits the maximum
4486 -+ * service allowed in one timestamp delta (small shift values increase it),
4487 -+ * the maximum total weight that can be used for the queues in the system
4488 -+ * (big shift values increase it), and the period of virtual time wraparounds.
4489 -+ */
4490 -+#define WFQ_SERVICE_SHIFT 22
4491 -+
4492 -+/**
4493 -+ * bfq_gt - compare two timestamps.
4494 -+ * @a: first ts.
4495 -+ * @b: second ts.
4496 -+ *
4497 -+ * Return @a > @b, dealing with wrapping correctly.
4498 -+ */
4499 -+static inline int bfq_gt(u64 a, u64 b)
4500 -+{
4501 -+ return (s64)(a - b) > 0;
4502 -+}
4503 -+
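bfq_gt() above relies on two's-complement arithmetic so the comparison stays correct even after the 64-bit virtual clock wraps. A tiny standalone check of that property (driver code is illustrative):

#include <stdio.h>
#include <stdint.h>

static int bfq_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;	/* "a is later than b", wraparound-safe */
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;

	printf("%d\n", bfq_gt(10, 3));		/* 1: plainly later          */
	printf("%d\n", bfq_gt(2, near_wrap));	/* 1: 2 comes after the wrap */
	printf("%d\n", bfq_gt(near_wrap, 2));	/* 0: not later              */
	return 0;
}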
4504 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4505 -+{
4506 -+ struct bfq_queue *bfqq = NULL;
4507 -+
4508 -+ BUG_ON(entity == NULL);
4509 -+
4510 -+ if (entity->my_sched_data == NULL)
4511 -+ bfqq = container_of(entity, struct bfq_queue, entity);
4512 -+
4513 -+ return bfqq;
4514 -+}
4515 -+
4516 -+
4517 -+/**
4518 -+ * bfq_delta - map service into the virtual time domain.
4519 -+ * @service: amount of service.
4520 -+ * @weight: scale factor (weight of an entity or weight sum).
4521 -+ */
4522 -+static inline u64 bfq_delta(unsigned long service,
4523 -+ unsigned long weight)
4524 -+{
4525 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4526 -+
4527 -+ do_div(d, weight);
4528 -+ return d;
4529 -+}
4530 -+
4531 -+/**
4532 -+ * bfq_calc_finish - assign the finish time to an entity.
4533 -+ * @entity: the entity to act upon.
4534 -+ * @service: the service to be charged to the entity.
4535 -+ */
4536 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
4537 -+ unsigned long service)
4538 -+{
4539 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4540 -+
4541 -+ BUG_ON(entity->weight == 0);
4542 -+
4543 -+ entity->finish = entity->start +
4544 -+ bfq_delta(service, entity->weight);
4545 -+
4546 -+ if (bfqq != NULL) {
4547 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4548 -+ "calc_finish: serv %lu, w %d",
4549 -+ service, entity->weight);
4550 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4551 -+ "calc_finish: start %llu, finish %llu, delta %llu",
4552 -+ entity->start, entity->finish,
4553 -+ bfq_delta(service, entity->weight));
4554 -+ }
4555 -+}
4556 -+
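Putting bfq_delta() and bfq_calc_finish() together: the finish time advances by the service consumed, scaled into the virtual-time domain and divided by the entity's weight, so a heavier entity's finish time grows proportionally more slowly and it is selected more often. A minimal sketch with assumed example numbers:

#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

static uint64_t bfq_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	unsigned long service = 4096;	/* sectors charged */
	uint64_t start = 1000;		/* entity's current start timestamp */

	/* same service, different weights: finish advances 4x slower at weight 4 */
	printf("finish(w=1) = %llu\n",
	       (unsigned long long)(start + bfq_delta(service, 1)));
	printf("finish(w=4) = %llu\n",
	       (unsigned long long)(start + bfq_delta(service, 4)));
	return 0;
}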
4557 -+/**
4558 -+ * bfq_entity_of - get an entity from a node.
4559 -+ * @node: the node field of the entity.
4560 -+ *
4561 -+ * Convert a node pointer to the corresponding entity. This is used only
4562 -+ * to simplify the logic of some functions and not as the generic
4563 -+ * conversion mechanism because, e.g., in the tree walking functions,
4564 -+ * the check for a %NULL value would be redundant.
4565 -+ */
4566 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4567 -+{
4568 -+ struct bfq_entity *entity = NULL;
4569 -+
4570 -+ if (node != NULL)
4571 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
4572 -+
4573 -+ return entity;
4574 -+}
4575 -+
4576 -+/**
4577 -+ * bfq_extract - remove an entity from a tree.
4578 -+ * @root: the tree root.
4579 -+ * @entity: the entity to remove.
4580 -+ */
4581 -+static inline void bfq_extract(struct rb_root *root,
4582 -+ struct bfq_entity *entity)
4583 -+{
4584 -+ BUG_ON(entity->tree != root);
4585 -+
4586 -+ entity->tree = NULL;
4587 -+ rb_erase(&entity->rb_node, root);
4588 -+}
4589 -+
4590 -+/**
4591 -+ * bfq_idle_extract - extract an entity from the idle tree.
4592 -+ * @st: the service tree of the owning @entity.
4593 -+ * @entity: the entity being removed.
4594 -+ */
4595 -+static void bfq_idle_extract(struct bfq_service_tree *st,
4596 -+ struct bfq_entity *entity)
4597 -+{
4598 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4599 -+ struct rb_node *next;
4600 -+
4601 -+ BUG_ON(entity->tree != &st->idle);
4602 -+
4603 -+ if (entity == st->first_idle) {
4604 -+ next = rb_next(&entity->rb_node);
4605 -+ st->first_idle = bfq_entity_of(next);
4606 -+ }
4607 -+
4608 -+ if (entity == st->last_idle) {
4609 -+ next = rb_prev(&entity->rb_node);
4610 -+ st->last_idle = bfq_entity_of(next);
4611 -+ }
4612 -+
4613 -+ bfq_extract(&st->idle, entity);
4614 -+
4615 -+ if (bfqq != NULL)
4616 -+ list_del(&bfqq->bfqq_list);
4617 -+}
4618 -+
4619 -+/**
4620 -+ * bfq_insert - generic tree insertion.
4621 -+ * @root: tree root.
4622 -+ * @entity: entity to insert.
4623 -+ *
4624 -+ * This is used for the idle and the active tree, since they are both
4625 -+ * ordered by finish time.
4626 -+ */
4627 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4628 -+{
4629 -+ struct bfq_entity *entry;
4630 -+ struct rb_node **node = &root->rb_node;
4631 -+ struct rb_node *parent = NULL;
4632 -+
4633 -+ BUG_ON(entity->tree != NULL);
4634 -+
4635 -+ while (*node != NULL) {
4636 -+ parent = *node;
4637 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
4638 -+
4639 -+ if (bfq_gt(entry->finish, entity->finish))
4640 -+ node = &parent->rb_left;
4641 -+ else
4642 -+ node = &parent->rb_right;
4643 -+ }
4644 -+
4645 -+ rb_link_node(&entity->rb_node, parent, node);
4646 -+ rb_insert_color(&entity->rb_node, root);
4647 -+
4648 -+ entity->tree = root;
4649 -+}
4650 -+
4651 -+/**
4652 -+ * bfq_update_min - update the min_start field of an entity.
4653 -+ * @entity: the entity to update.
4654 -+ * @node: one of its children.
4655 -+ *
4656 -+ * This function is called when @entity may store an invalid value for
4657 -+ * min_start due to updates to the active tree. The function assumes
4658 -+ * that the subtree rooted at @node (which may be its left or its right
4659 -+ * child) has a valid min_start value.
4660 -+ */
4661 -+static inline void bfq_update_min(struct bfq_entity *entity,
4662 -+ struct rb_node *node)
4663 -+{
4664 -+ struct bfq_entity *child;
4665 -+
4666 -+ if (node != NULL) {
4667 -+ child = rb_entry(node, struct bfq_entity, rb_node);
4668 -+ if (bfq_gt(entity->min_start, child->min_start))
4669 -+ entity->min_start = child->min_start;
4670 -+ }
4671 -+}
4672 -+
4673 -+/**
4674 -+ * bfq_update_active_node - recalculate min_start.
4675 -+ * @node: the node to update.
4676 -+ *
4677 -+ * @node may have changed position or one of its children may have moved;
4678 -+ * this function updates its min_start value. The left and right subtrees
4679 -+ * are assumed to hold a correct min_start value.
4680 -+ */
4681 -+static inline void bfq_update_active_node(struct rb_node *node)
4682 -+{
4683 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4684 -+
4685 -+ entity->min_start = entity->start;
4686 -+ bfq_update_min(entity, node->rb_right);
4687 -+ bfq_update_min(entity, node->rb_left);
4688 -+}
4689 -+
4690 -+/**
4691 -+ * bfq_update_active_tree - update min_start for the whole active tree.
4692 -+ * @node: the starting node.
4693 -+ *
4694 -+ * @node must be the deepest modified node after an update. This function
4695 -+ * updates its min_start using the values held by its children, assuming
4696 -+ * that they did not change, and then updates all the nodes that may have
4697 -+ * changed in the path to the root. The only nodes that may have changed
4698 -+ * are the ones in the path or their siblings.
4699 -+ */
4700 -+static void bfq_update_active_tree(struct rb_node *node)
4701 -+{
4702 -+ struct rb_node *parent;
4703 -+
4704 -+up:
4705 -+ bfq_update_active_node(node);
4706 -+
4707 -+ parent = rb_parent(node);
4708 -+ if (parent == NULL)
4709 -+ return;
4710 -+
4711 -+ if (node == parent->rb_left && parent->rb_right != NULL)
4712 -+ bfq_update_active_node(parent->rb_right);
4713 -+ else if (parent->rb_left != NULL)
4714 -+ bfq_update_active_node(parent->rb_left);
4715 -+
4716 -+ node = parent;
4717 -+ goto up;
4718 -+}
4719 -+
4720 -+/**
4721 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
4722 -+ * @st: the service tree of the entity.
4723 -+ * @entity: the entity being inserted.
4724 -+ *
4725 -+ * The active tree is ordered by finish time, but an extra key is kept
4726 -+ * in each node, containing the minimum value for the start times of
4727 -+ * its children (and the node itself), so it's possible to search for
4728 -+ * the eligible node with the lowest finish time in logarithmic time.
4729 -+ */
4730 -+static void bfq_active_insert(struct bfq_service_tree *st,
4731 -+ struct bfq_entity *entity)
4732 -+{
4733 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4734 -+ struct rb_node *node = &entity->rb_node;
4735 -+
4736 -+ bfq_insert(&st->active, entity);
4737 -+
4738 -+ if (node->rb_left != NULL)
4739 -+ node = node->rb_left;
4740 -+ else if (node->rb_right != NULL)
4741 -+ node = node->rb_right;
4742 -+
4743 -+ bfq_update_active_tree(node);
4744 -+
4745 -+ if (bfqq != NULL)
4746 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4747 -+}
4748 -+
4749 -+/**
4750 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4751 -+ * @ioprio: the ioprio value to convert.
4752 -+ */
4753 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
4754 -+{
4755 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4756 -+ return IOPRIO_BE_NR - ioprio;
4757 -+}
4758 -+
4759 -+/**
4760 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4761 -+ * @weight: the weight value to convert.
4762 -+ *
4763 -+ * To preserve as much as possible the old ioprio-only user interface,
4764 -+ * 0 is used as an escape ioprio value for weights (numerically) equal to
4765 -+ * or larger than IOPRIO_BE_NR.
4766 -+ */
4767 -+static unsigned short bfq_weight_to_ioprio(int weight)
4768 -+{
4769 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4770 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4771 -+}
4772 -+
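A quick round-trip of the two conversions above, using the usual IOPRIO_BE_NR = 8 best-effort levels: a weight maps to 8 - ioprio, and any weight of 8 or more falls back to the escape ioprio 0. Standalone sketch, for illustration only:

#include <stdio.h>

#define IOPRIO_BE_NR 8

static unsigned short ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

static unsigned short weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %u -> ioprio %u\n",
		       ioprio, ioprio_to_weight(ioprio),
		       weight_to_ioprio(ioprio_to_weight(ioprio)));

	printf("weight 100 -> ioprio %u (escape value)\n", weight_to_ioprio(100));
	return 0;
}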
4773 -+static inline void bfq_get_entity(struct bfq_entity *entity)
4774 -+{
4775 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4776 -+ struct bfq_sched_data *sd;
4777 -+
4778 -+ if (bfqq != NULL) {
4779 -+ sd = entity->sched_data;
4780 -+ atomic_inc(&bfqq->ref);
4781 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4782 -+ bfqq, atomic_read(&bfqq->ref));
4783 -+ }
4784 -+}
4785 -+
4786 -+/**
4787 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
4788 -+ * @node: the node being removed.
4789 -+ *
4790 -+ * Do the first step of an extraction in an rb tree, looking for the
4791 -+ * node that will replace @node, and returning the deepest node that
4792 -+ * the following modifications to the tree can touch. If @node is the
4793 -+ * last node in the tree return %NULL.
4794 -+ */
4795 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
4796 -+{
4797 -+ struct rb_node *deepest;
4798 -+
4799 -+ if (node->rb_right == NULL && node->rb_left == NULL)
4800 -+ deepest = rb_parent(node);
4801 -+ else if (node->rb_right == NULL)
4802 -+ deepest = node->rb_left;
4803 -+ else if (node->rb_left == NULL)
4804 -+ deepest = node->rb_right;
4805 -+ else {
4806 -+ deepest = rb_next(node);
4807 -+ if (deepest->rb_right != NULL)
4808 -+ deepest = deepest->rb_right;
4809 -+ else if (rb_parent(deepest) != node)
4810 -+ deepest = rb_parent(deepest);
4811 -+ }
4812 -+
4813 -+ return deepest;
4814 -+}
4815 -+
4816 -+/**
4817 -+ * bfq_active_extract - remove an entity from the active tree.
4818 -+ * @st: the service_tree containing the tree.
4819 -+ * @entity: the entity being removed.
4820 -+ */
4821 -+static void bfq_active_extract(struct bfq_service_tree *st,
4822 -+ struct bfq_entity *entity)
4823 -+{
4824 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4825 -+ struct rb_node *node;
4826 -+
4827 -+ node = bfq_find_deepest(&entity->rb_node);
4828 -+ bfq_extract(&st->active, entity);
4829 -+
4830 -+ if (node != NULL)
4831 -+ bfq_update_active_tree(node);
4832 -+
4833 -+ if (bfqq != NULL)
4834 -+ list_del(&bfqq->bfqq_list);
4835 -+}
4836 -+
4837 -+/**
4838 -+ * bfq_idle_insert - insert an entity into the idle tree.
4839 -+ * @st: the service tree containing the tree.
4840 -+ * @entity: the entity to insert.
4841 -+ */
4842 -+static void bfq_idle_insert(struct bfq_service_tree *st,
4843 -+ struct bfq_entity *entity)
4844 -+{
4845 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4846 -+ struct bfq_entity *first_idle = st->first_idle;
4847 -+ struct bfq_entity *last_idle = st->last_idle;
4848 -+
4849 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
4850 -+ st->first_idle = entity;
4851 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
4852 -+ st->last_idle = entity;
4853 -+
4854 -+ bfq_insert(&st->idle, entity);
4855 -+
4856 -+ if (bfqq != NULL)
4857 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
4858 -+}
4859 -+
4860 -+/**
4861 -+ * bfq_forget_entity - remove an entity from the wfq trees.
4862 -+ * @st: the service tree.
4863 -+ * @entity: the entity being removed.
4864 -+ *
4865 -+ * Update the device status and forget everything about @entity, putting
4866 -+ * the device reference to it, if it is a queue. Entities belonging to
4867 -+ * groups are not refcounted.
4868 -+ */
4869 -+static void bfq_forget_entity(struct bfq_service_tree *st,
4870 -+ struct bfq_entity *entity)
4871 -+{
4872 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4873 -+ struct bfq_sched_data *sd;
4874 -+
4875 -+ BUG_ON(!entity->on_st);
4876 -+
4877 -+ entity->on_st = 0;
4878 -+ st->wsum -= entity->weight;
4879 -+ if (bfqq != NULL) {
4880 -+ sd = entity->sched_data;
4881 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
4882 -+ bfqq, atomic_read(&bfqq->ref));
4883 -+ bfq_put_queue(bfqq);
4884 -+ }
4885 -+}
4886 -+
4887 -+/**
4888 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
4889 -+ * @st: service tree for the entity.
4890 -+ * @entity: the entity being released.
4891 -+ */
4892 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
4893 -+ struct bfq_entity *entity)
4894 -+{
4895 -+ bfq_idle_extract(st, entity);
4896 -+ bfq_forget_entity(st, entity);
4897 -+}
4898 -+
4899 -+/**
4900 -+ * bfq_forget_idle - update the idle tree if necessary.
4901 -+ * @st: the service tree to act upon.
4902 -+ *
4903 -+ * To preserve the global O(log N) complexity we only remove one entry here;
4904 -+ * as the idle tree will not grow indefinitely this can be done safely.
4905 -+ */
4906 -+static void bfq_forget_idle(struct bfq_service_tree *st)
4907 -+{
4908 -+ struct bfq_entity *first_idle = st->first_idle;
4909 -+ struct bfq_entity *last_idle = st->last_idle;
4910 -+
4911 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
4912 -+ !bfq_gt(last_idle->finish, st->vtime)) {
4913 -+ /*
4914 -+ * Forget the whole idle tree, increasing the vtime past
4915 -+ * the last finish time of idle entities.
4916 -+ */
4917 -+ st->vtime = last_idle->finish;
4918 -+ }
4919 -+
4920 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
4921 -+ bfq_put_idle_entity(st, first_idle);
4922 -+}
4923 -+
4924 -+static struct bfq_service_tree *
4925 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
4926 -+ struct bfq_entity *entity)
4927 -+{
4928 -+ struct bfq_service_tree *new_st = old_st;
4929 -+
4930 -+ if (entity->ioprio_changed) {
4931 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4932 -+
4933 -+ BUG_ON(old_st->wsum < entity->weight);
4934 -+ old_st->wsum -= entity->weight;
4935 -+
4936 -+ if (entity->new_weight != entity->orig_weight) {
4937 -+ entity->orig_weight = entity->new_weight;
4938 -+ entity->ioprio =
4939 -+ bfq_weight_to_ioprio(entity->orig_weight);
4940 -+ } else if (entity->new_ioprio != entity->ioprio) {
4941 -+ entity->ioprio = entity->new_ioprio;
4942 -+ entity->orig_weight =
4943 -+ bfq_ioprio_to_weight(entity->ioprio);
4944 -+ } else
4945 -+ entity->new_weight = entity->orig_weight =
4946 -+ bfq_ioprio_to_weight(entity->ioprio);
4947 -+
4948 -+ entity->ioprio_class = entity->new_ioprio_class;
4949 -+ entity->ioprio_changed = 0;
4950 -+
4951 -+ /*
4952 -+ * NOTE: here we may be changing the weight too early,
4953 -+ * this will cause unfairness. The correct approach
4954 -+ * would have required additional complexity to defer
4955 -+ * weight changes to the proper time instants (i.e.,
4956 -+ * when entity->finish <= old_st->vtime).
4957 -+ */
4958 -+ new_st = bfq_entity_service_tree(entity);
4959 -+ entity->weight = entity->orig_weight *
4960 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
4961 -+ new_st->wsum += entity->weight;
4962 -+
4963 -+ if (new_st != old_st)
4964 -+ entity->start = new_st->vtime;
4965 -+ }
4966 -+
4967 -+ return new_st;
4968 -+}
4969 -+
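
Note the assignment entity->weight = entity->orig_weight * raising_coeff above: while a queue is weight-raised, its scheduling weight is simply its original weight scaled by the raising coefficient. A rough stand-alone illustration (the helper and the coefficient value 20 are ours, used only as an example figure):

#include <stdio.h>

/* Illustrative only: effective weight of a weight-raised queue,
 * mirroring entity->weight = orig_weight * raising_coeff above. */
static unsigned long boosted_weight(unsigned short orig_weight,
                                    unsigned int raising_coeff)
{
        return (unsigned long)orig_weight * raising_coeff;
}

int main(void)
{
        /* an ioprio-4 queue (weight 4) with an assumed coefficient of 20
         * competes as if it had weight 80 until the raising period ends */
        printf("boosted weight: %lu\n", boosted_weight(4, 20));
        return 0;
}
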
4970 -+/**
4971 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
4972 -+ * @bfqq: the queue being served.
4973 -+ * @served: bytes to transfer.
4974 -+ *
4975 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
4976 -+ * are synchronized every time a new bfqq is selected for service. For now,
4977 -+ * we keep it to better check consistency.
4978 -+ */
4979 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
4980 -+{
4981 -+ struct bfq_entity *entity = &bfqq->entity;
4982 -+ struct bfq_service_tree *st;
4983 -+
4984 -+ for_each_entity(entity) {
4985 -+ st = bfq_entity_service_tree(entity);
4986 -+
4987 -+ entity->service += served;
4988 -+ BUG_ON(entity->service > entity->budget);
4989 -+ BUG_ON(st->wsum == 0);
4990 -+
4991 -+ st->vtime += bfq_delta(served, st->wsum);
4992 -+ bfq_forget_idle(st);
4993 -+ }
4994 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
4995 -+}
4996 -+
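
bfq_delta() is defined elsewhere in the patch (outside this hunk); conceptually, st->vtime advances by the service received divided by the total weight of the backlogged entities, which is the B-WF2Q+ virtual-time rule. A simplified model of that update, ignoring the fixed-point arithmetic the real code uses:

#include <stdio.h>

/* Simplified: the real code uses a fixed-point bfq_delta(served, wsum). */
static double vtime_after_service(double vtime, unsigned long served,
                                  unsigned long wsum)
{
        return vtime + (double)served / wsum;
}

int main(void)
{
        /* two queues with weights 1 and 3 (wsum = 4): serving 400 sectors
         * from either of them advances the tree's virtual time by 100 */
        printf("vtime: %.1f\n", vtime_after_service(0.0, 400, 4));
        return 0;
}
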
4997 -+/**
4998 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
4999 -+ * @bfqq: the queue that needs a service update.
5000 -+ *
5001 -+ * When it's not possible to be fair in the service domain, because
5002 -+ * a queue is not consuming its budget fast enough (the meaning of
5003 -+ * fast depends on the timeout parameter), we charge it a full
5004 -+ * budget. In this way we should obtain a sort of time-domain
5005 -+ * fairness among all the seeky/slow queues.
5006 -+ */
5007 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5008 -+{
5009 -+ struct bfq_entity *entity = &bfqq->entity;
5010 -+
5011 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5012 -+
5013 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5014 -+}
5015 -+
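
For example, if a seeky queue has consumed only 40 sectors of a 160-sector budget when its timeout fires, bfq_bfqq_served() above is invoked with the remaining 120 sectors, so in the service domain the queue is accounted exactly as if it had used its whole budget (the figures here are ours, chosen only to illustrate the accounting).
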
5016 -+/**
5017 -+ * __bfq_activate_entity - activate an entity.
5018 -+ * @entity: the entity being activated.
5019 -+ *
5020 -+ * Called whenever an entity is activated, i.e., it is not active and one
5021 -+ * of its children receives a new request, or has to be reactivated due to
5022 -+ * budget exhaustion. It uses the current budget of the entity (and the
5023 -+ * service received if @entity is in service) to calculate its
5024 -+ * timestamps.
5025 -+ */
5026 -+static void __bfq_activate_entity(struct bfq_entity *entity)
5027 -+{
5028 -+ struct bfq_sched_data *sd = entity->sched_data;
5029 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5030 -+
5031 -+ if (entity == sd->in_service_entity) {
5032 -+ BUG_ON(entity->tree != NULL);
5033 -+ /*
5034 -+ * If we are requeueing the current entity, we have
5035 -+ * to take care not to charge it for service it has
5036 -+ * not received.
5037 -+ */
5038 -+ bfq_calc_finish(entity, entity->service);
5039 -+ entity->start = entity->finish;
5040 -+ sd->in_service_entity = NULL;
5041 -+ } else if (entity->tree == &st->active) {
5042 -+ /*
5043 -+ * Requeueing an entity due to a change of some
5044 -+ * next_in_service entity below it. We reuse the
5045 -+ * old start time.
5046 -+ */
5047 -+ bfq_active_extract(st, entity);
5048 -+ } else if (entity->tree == &st->idle) {
5049 -+ /*
5050 -+ * Must be on the idle tree, bfq_idle_extract() will
5051 -+ * check for that.
5052 -+ */
5053 -+ bfq_idle_extract(st, entity);
5054 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
5055 -+ st->vtime : entity->finish;
5056 -+ } else {
5057 -+ /*
5058 -+ * The finish time of the entity may be invalid, and
5059 -+ * it is in the past for sure, otherwise the queue
5060 -+ * would have been on the idle tree.
5061 -+ */
5062 -+ entity->start = st->vtime;
5063 -+ st->wsum += entity->weight;
5064 -+ bfq_get_entity(entity);
5065 -+
5066 -+ BUG_ON(entity->on_st);
5067 -+ entity->on_st = 1;
5068 -+ }
5069 -+
5070 -+ st = __bfq_entity_update_weight_prio(st, entity);
5071 -+ bfq_calc_finish(entity, entity->budget);
5072 -+ bfq_active_insert(st, entity);
5073 -+}
5074 -+
5075 -+/**
5076 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5077 -+ * @entity: the entity to activate.
5078 -+ *
5079 -+ * Activate @entity and all the entities on the path from it to the root.
5080 -+ */
5081 -+static void bfq_activate_entity(struct bfq_entity *entity)
5082 -+{
5083 -+ struct bfq_sched_data *sd;
5084 -+
5085 -+ for_each_entity(entity) {
5086 -+ __bfq_activate_entity(entity);
5087 -+
5088 -+ sd = entity->sched_data;
5089 -+ if (!bfq_update_next_in_service(sd))
5090 -+ /*
5091 -+ * No need to propagate the activation to the
5092 -+ * upper entities, as they will be updated when
5093 -+ * the in-service entity is rescheduled.
5094 -+ */
5095 -+ break;
5096 -+ }
5097 -+}
5098 -+
5099 -+/**
5100 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5101 -+ * @entity: the entity to deactivate.
5102 -+ * @requeue: if false, the entity will not be put into the idle tree.
5103 -+ *
5104 -+ * Deactivate an entity, independently from its previous state. If the
5105 -+ * entity was not on a service tree just return, otherwise if it is on
5106 -+ * any scheduler tree, extract it from that tree, and if necessary
5107 -+ * and if the caller specified @requeue, put it on the idle tree.
5108 -+ *
5109 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
5110 -+ * if the entity was under service or if it was the next_in_service for
5111 -+ * its sched_data; return %0 otherwise.
5112 -+ */
5113 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5114 -+{
5115 -+ struct bfq_sched_data *sd = entity->sched_data;
5116 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5117 -+ int was_in_service = entity == sd->in_service_entity;
5118 -+ int ret = 0;
5119 -+
5120 -+ if (!entity->on_st)
5121 -+ return 0;
5122 -+
5123 -+ BUG_ON(was_in_service && entity->tree != NULL);
5124 -+
5125 -+ if (was_in_service) {
5126 -+ bfq_calc_finish(entity, entity->service);
5127 -+ sd->in_service_entity = NULL;
5128 -+ } else if (entity->tree == &st->active)
5129 -+ bfq_active_extract(st, entity);
5130 -+ else if (entity->tree == &st->idle)
5131 -+ bfq_idle_extract(st, entity);
5132 -+ else if (entity->tree != NULL)
5133 -+ BUG();
5134 -+
5135 -+ if (was_in_service || sd->next_in_service == entity)
5136 -+ ret = bfq_update_next_in_service(sd);
5137 -+
5138 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5139 -+ bfq_forget_entity(st, entity);
5140 -+ else
5141 -+ bfq_idle_insert(st, entity);
5142 -+
5143 -+ BUG_ON(sd->in_service_entity == entity);
5144 -+ BUG_ON(sd->next_in_service == entity);
5145 -+
5146 -+ return ret;
5147 -+}
5148 -+
5149 -+/**
5150 -+ * bfq_deactivate_entity - deactivate an entity.
5151 -+ * @entity: the entity to deactivate.
5152 -+ * @requeue: true if the entity can be put on the idle tree
5153 -+ */
5154 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5155 -+{
5156 -+ struct bfq_sched_data *sd;
5157 -+ struct bfq_entity *parent;
5158 -+
5159 -+ for_each_entity_safe(entity, parent) {
5160 -+ sd = entity->sched_data;
5161 -+
5162 -+ if (!__bfq_deactivate_entity(entity, requeue))
5163 -+ /*
5164 -+ * The parent entity is still backlogged, and
5165 -+ * we don't need to update it as it is still
5166 -+ * under service.
5167 -+ */
5168 -+ break;
5169 -+
5170 -+ if (sd->next_in_service != NULL)
5171 -+ /*
5172 -+ * The parent entity is still backlogged and
5173 -+ * the budgets on the path towards the root
5174 -+ * need to be updated.
5175 -+ */
5176 -+ goto update;
5177 -+
5178 -+ /*
5179 -+ * If we get here, the parent is no longer backlogged and
5180 -+ * we want to propagate the dequeue upwards.
5181 -+ */
5182 -+ requeue = 1;
5183 -+ }
5184 -+
5185 -+ return;
5186 -+
5187 -+update:
5188 -+ entity = parent;
5189 -+ for_each_entity(entity) {
5190 -+ __bfq_activate_entity(entity);
5191 -+
5192 -+ sd = entity->sched_data;
5193 -+ if (!bfq_update_next_in_service(sd))
5194 -+ break;
5195 -+ }
5196 -+}
5197 -+
5198 -+/**
5199 -+ * bfq_update_vtime - update vtime if necessary.
5200 -+ * @st: the service tree to act upon.
5201 -+ *
5202 -+ * If necessary update the service tree vtime to have at least one
5203 -+ * eligible entity, skipping to its start time. Assumes that the
5204 -+ * active tree of the device is not empty.
5205 -+ *
5206 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
5207 -+ * we may end up with reactivated tasks getting timestamps after a
5208 -+ * vtime skip done because we needed a ->first_active entity on some
5209 -+ * intermediate node.
5210 -+ */
5211 -+static void bfq_update_vtime(struct bfq_service_tree *st)
5212 -+{
5213 -+ struct bfq_entity *entry;
5214 -+ struct rb_node *node = st->active.rb_node;
5215 -+
5216 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5217 -+ if (bfq_gt(entry->min_start, st->vtime)) {
5218 -+ st->vtime = entry->min_start;
5219 -+ bfq_forget_idle(st);
5220 -+ }
5221 -+}
5222 -+
5223 -+/**
5224 -+ * bfq_first_active_entity - find the eligible entity with
5225 -+ * the smallest finish time
5226 -+ * @st: the service tree to select from.
5227 -+ *
5228 -+ * This function searches the first schedulable entity, starting from the
5229 -+ * root of the tree and going on the left every time on this side there is
5230 -+ * a subtree with at least one eligible (start <= vtime) entity. The path
5231 -+ * on the right is followed only if a) the left subtree contains no eligible
5232 -+ * entities and b) no eligible entity has been found yet.
5233 -+ */
5234 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5235 -+{
5236 -+ struct bfq_entity *entry, *first = NULL;
5237 -+ struct rb_node *node = st->active.rb_node;
5238 -+
5239 -+ while (node != NULL) {
5240 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5241 -+left:
5242 -+ if (!bfq_gt(entry->start, st->vtime))
5243 -+ first = entry;
5244 -+
5245 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5246 -+
5247 -+ if (node->rb_left != NULL) {
5248 -+ entry = rb_entry(node->rb_left,
5249 -+ struct bfq_entity, rb_node);
5250 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
5251 -+ node = node->rb_left;
5252 -+ goto left;
5253 -+ }
5254 -+ }
5255 -+ if (first != NULL)
5256 -+ break;
5257 -+ node = node->rb_right;
5258 -+ }
5259 -+
5260 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5261 -+ return first;
5262 -+}
5263 -+
5264 -+/**
5265 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5266 -+ * @st: the service tree.
5267 -+ *
5268 -+ * Update the virtual time in @st and return the first eligible entity
5269 -+ * it contains.
5270 -+ */
5271 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5272 -+ bool force)
5273 -+{
5274 -+ struct bfq_entity *entity, *new_next_in_service = NULL;
5275 -+
5276 -+ if (RB_EMPTY_ROOT(&st->active))
5277 -+ return NULL;
5278 -+
5279 -+ bfq_update_vtime(st);
5280 -+ entity = bfq_first_active_entity(st);
5281 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
5282 -+
5283 -+ /*
5284 -+ * If the chosen entity does not match with the sched_data's
5285 -+ * next_in_service and we are forcibly serving the IDLE priority
5286 -+ * class tree, bubble up budget update.
5287 -+ */
5288 -+ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
5289 -+ new_next_in_service = entity;
5290 -+ for_each_entity(new_next_in_service)
5291 -+ bfq_update_budget(new_next_in_service);
5292 -+ }
5293 -+
5294 -+ return entity;
5295 -+}
5296 -+
5297 -+/**
5298 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5299 -+ * @sd: the sched_data.
5300 -+ * @extract: if true the returned entity will be also extracted from @sd.
5301 -+ *
5302 -+ * NOTE: since we cache the next_in_service entity at each level of the
5303 -+ * hierarchy, the complexity of the lookup can be decreased with
5304 -+ * absolutely no effort by just returning the cached next_in_service value;
5305 -+ * we prefer to do full lookups to test the consistency of the data
5306 -+ * structures.
5307 -+ */
5308 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5309 -+ int extract,
5310 -+ struct bfq_data *bfqd)
5311 -+{
5312 -+ struct bfq_service_tree *st = sd->service_tree;
5313 -+ struct bfq_entity *entity;
5314 -+ int i = 0;
5315 -+
5316 -+ BUG_ON(sd->in_service_entity != NULL);
5317 -+
5318 -+ if (bfqd != NULL &&
5319 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5320 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
5321 -+ true);
5322 -+ if (entity != NULL) {
5323 -+ i = BFQ_IOPRIO_CLASSES - 1;
5324 -+ bfqd->bfq_class_idle_last_service = jiffies;
5325 -+ sd->next_in_service = entity;
5326 -+ }
5327 -+ }
5328 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5329 -+ entity = __bfq_lookup_next_entity(st + i, false);
5330 -+ if (entity != NULL) {
5331 -+ if (extract) {
5332 -+ bfq_check_next_in_service(sd, entity);
5333 -+ bfq_active_extract(st + i, entity);
5334 -+ sd->in_service_entity = entity;
5335 -+ sd->next_in_service = NULL;
5336 -+ }
5337 -+ break;
5338 -+ }
5339 -+ }
5340 -+
5341 -+ return entity;
5342 -+}
5343 -+
5344 -+/*
5345 -+ * Get next queue for service.
5346 -+ */
5347 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5348 -+{
5349 -+ struct bfq_entity *entity = NULL;
5350 -+ struct bfq_sched_data *sd;
5351 -+ struct bfq_queue *bfqq;
5352 -+
5353 -+ BUG_ON(bfqd->in_service_queue != NULL);
5354 -+
5355 -+ if (bfqd->busy_queues == 0)
5356 -+ return NULL;
5357 -+
5358 -+ sd = &bfqd->root_group->sched_data;
5359 -+ for (; sd != NULL; sd = entity->my_sched_data) {
5360 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5361 -+ BUG_ON(entity == NULL);
5362 -+ entity->service = 0;
5363 -+ }
5364 -+
5365 -+ bfqq = bfq_entity_to_bfqq(entity);
5366 -+ BUG_ON(bfqq == NULL);
5367 -+
5368 -+ return bfqq;
5369 -+}
5370 -+
5371 -+/*
5372 -+ * Forced extraction of the given queue.
5373 -+ */
5374 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5375 -+ struct bfq_queue *bfqq)
5376 -+{
5377 -+ struct bfq_entity *entity;
5378 -+ struct bfq_sched_data *sd;
5379 -+
5380 -+ BUG_ON(bfqd->in_service_queue != NULL);
5381 -+
5382 -+ entity = &bfqq->entity;
5383 -+ /*
5384 -+ * Bubble up extraction/update from the leaf to the root.
5385 -+ */
5386 -+ for_each_entity(entity) {
5387 -+ sd = entity->sched_data;
5388 -+ bfq_update_budget(entity);
5389 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
5390 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5391 -+ sd->in_service_entity = entity;
5392 -+ sd->next_in_service = NULL;
5393 -+ entity->service = 0;
5394 -+ }
5395 -+
5396 -+ return;
5397 -+}
5398 -+
5399 -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
5400 -+{
5401 -+ if (bfqd->in_service_bic != NULL) {
5402 -+ put_io_context(bfqd->in_service_bic->icq.ioc);
5403 -+ bfqd->in_service_bic = NULL;
5404 -+ }
5405 -+
5406 -+ bfqd->in_service_queue = NULL;
5407 -+ del_timer(&bfqd->idle_slice_timer);
5408 -+}
5409 -+
5410 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5411 -+ int requeue)
5412 -+{
5413 -+ struct bfq_entity *entity = &bfqq->entity;
5414 -+
5415 -+ if (bfqq == bfqd->in_service_queue)
5416 -+ __bfq_bfqd_reset_in_service(bfqd);
5417 -+
5418 -+ bfq_deactivate_entity(entity, requeue);
5419 -+}
5420 -+
5421 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5422 -+{
5423 -+ struct bfq_entity *entity = &bfqq->entity;
5424 -+
5425 -+ bfq_activate_entity(entity);
5426 -+}
5427 -+
5428 -+/*
5429 -+ * Called when the bfqq no longer has requests pending; remove it from
5430 -+ * the service tree.
5431 -+ */
5432 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5433 -+ int requeue)
5434 -+{
5435 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
5436 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5437 -+
5438 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5439 -+
5440 -+ bfq_clear_bfqq_busy(bfqq);
5441 -+
5442 -+ BUG_ON(bfqd->busy_queues == 0);
5443 -+ bfqd->busy_queues--;
5444 -+ if (bfqq->raising_coeff > 1)
5445 -+ bfqd->raised_busy_queues--;
5446 -+
5447 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5448 -+}
5449 -+
5450 -+/*
5451 -+ * Called when an inactive queue receives a new request.
5452 -+ */
5453 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5454 -+{
5455 -+ BUG_ON(bfq_bfqq_busy(bfqq));
5456 -+ BUG_ON(bfqq == bfqd->in_service_queue);
5457 -+
5458 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5459 -+
5460 -+ bfq_activate_bfqq(bfqd, bfqq);
5461 -+
5462 -+ bfq_mark_bfqq_busy(bfqq);
5463 -+ bfqd->busy_queues++;
5464 -+ if (bfqq->raising_coeff > 1)
5465 -+ bfqd->raised_busy_queues++;
5466 -+}
5467 -diff --git a/block/bfq.h b/block/bfq.h
5468 -new file mode 100644
5469 -index 0000000..3ca8482
5470 ---- /dev/null
5471 -+++ b/block/bfq.h
5472 -@@ -0,0 +1,622 @@
5473 -+/*
5474 -+ * BFQ-v7r2 for 3.14.0: data structures and common functions prototypes.
5475 -+ *
5476 -+ * Based on ideas and code from CFQ:
5477 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5478 -+ *
5479 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5480 -+ * Paolo Valente <paolo.valente@×××××××.it>
5481 -+ *
5482 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5483 -+ */
5484 -+
5485 -+#ifndef _BFQ_H
5486 -+#define _BFQ_H
5487 -+
5488 -+#include <linux/blktrace_api.h>
5489 -+#include <linux/hrtimer.h>
5490 -+#include <linux/ioprio.h>
5491 -+#include <linux/rbtree.h>
5492 -+
5493 -+#define BFQ_IOPRIO_CLASSES 3
5494 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5495 -+
5496 -+#define BFQ_MIN_WEIGHT 1
5497 -+#define BFQ_MAX_WEIGHT 1000
5498 -+
5499 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
5500 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
5501 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5502 -+
5503 -+struct bfq_entity;
5504 -+
5505 -+/**
5506 -+ * struct bfq_service_tree - per ioprio_class service tree.
5507 -+ * @active: tree for active entities (i.e., those backlogged).
5508 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5509 -+ * @first_idle: idle entity with minimum F_i.
5510 -+ * @last_idle: idle entity with maximum F_i.
5511 -+ * @vtime: scheduler virtual time.
5512 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5513 -+ *
5514 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5515 -+ * ioprio_class has its own independent scheduler, and so its own
5516 -+ * bfq_service_tree. All the fields are protected by the queue lock
5517 -+ * of the containing bfqd.
5518 -+ */
5519 -+struct bfq_service_tree {
5520 -+ struct rb_root active;
5521 -+ struct rb_root idle;
5522 -+
5523 -+ struct bfq_entity *first_idle;
5524 -+ struct bfq_entity *last_idle;
5525 -+
5526 -+ u64 vtime;
5527 -+ unsigned long wsum;
5528 -+};
5529 -+
5530 -+/**
5531 -+ * struct bfq_sched_data - multi-class scheduler.
5532 -+ * @in_service_entity: entity under service.
5533 -+ * @next_in_service: head-of-the-line entity in the scheduler.
5534 -+ * @service_tree: array of service trees, one per ioprio_class.
5535 -+ *
5536 -+ * bfq_sched_data is the basic scheduler queue. It supports three
5537 -+ * ioprio_classes, and can be used either as a toplevel queue or as
5538 -+ * an intermediate queue in a hierarchical setup.
5539 -+ * @next_in_service points to the active entity of the sched_data
5540 -+ * service trees that will be scheduled next.
5541 -+ *
5542 -+ * The supported ioprio_classes are the same as in CFQ, in descending
5543 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5544 -+ * Requests from higher priority queues are served before all the
5545 -+ * requests from lower priority queues; among queues of the same
5546 -+ * class, requests are served according to B-WF2Q+.
5547 -+ * All the fields are protected by the queue lock of the containing bfqd.
5548 -+ */
5549 -+struct bfq_sched_data {
5550 -+ struct bfq_entity *in_service_entity;
5551 -+ struct bfq_entity *next_in_service;
5552 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5553 -+};
5554 -+
5555 -+/**
5556 -+ * struct bfq_entity - schedulable entity.
5557 -+ * @rb_node: service_tree member.
5558 -+ * @on_st: flag, true if the entity is on a tree (either the active or
5559 -+ * the idle one of its service_tree).
5560 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5561 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
5562 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5563 -+ * @min_start: minimum start time of the (active) subtree rooted at
5564 -+ * this entity; used for O(log N) lookups into active trees.
5565 -+ * @service: service received during the last round of service.
5566 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5567 -+ * @weight: weight of the queue
5568 -+ * @parent: parent entity, for hierarchical scheduling.
5569 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5570 -+ * associated scheduler queue, %NULL on leaf nodes.
5571 -+ * @sched_data: the scheduler queue this entity belongs to.
5572 -+ * @ioprio: the ioprio in use.
5573 -+ * @new_weight: when a weight change is requested, the new weight value.
5574 -+ * @orig_weight: original weight, used to implement weight boosting
5575 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5576 -+ * @ioprio_class: the ioprio_class in use.
5577 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
5578 -+ * ioprio_class value.
5579 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5580 -+ * ioprio_class change.
5581 -+ *
5582 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5583 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5584 -+ * entity belongs to the sched_data of the parent group in the cgroup
5585 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
5586 -+ * in @my_sched_data.
5587 -+ *
5588 -+ * Each entity stores independently its priority values; this would
5589 -+ * allow different weights on different devices, but this
5590 -+ * functionality is not exported to userspace for now. Priorities and
5591 -+ * weights are updated lazily, first storing the new values into the
5592 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
5593 -+ * there is a transition in the entity state that allows the priority
5594 -+ * update to take place the effective and the requested priority
5595 -+ * values are synchronized.
5596 -+ *
5597 -+ * Unless cgroups are used, the weight value is calculated from the
5598 -+ * ioprio to export the same interface as CFQ. When dealing with
5599 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
5600 -+ * time to consume their budget and have true sequential behavior, and
5601 -+ * when there are no external factors breaking anticipation) the
5602 -+ * relative weights at each level of the cgroups hierarchy should be
5603 -+ * guaranteed. All the fields are protected by the queue lock of the
5604 -+ * containing bfqd.
5605 -+ */
5606 -+struct bfq_entity {
5607 -+ struct rb_node rb_node;
5608 -+
5609 -+ int on_st;
5610 -+
5611 -+ u64 finish;
5612 -+ u64 start;
5613 -+
5614 -+ struct rb_root *tree;
5615 -+
5616 -+ u64 min_start;
5617 -+
5618 -+ unsigned long service, budget;
5619 -+ unsigned short weight, new_weight;
5620 -+ unsigned short orig_weight;
5621 -+
5622 -+ struct bfq_entity *parent;
5623 -+
5624 -+ struct bfq_sched_data *my_sched_data;
5625 -+ struct bfq_sched_data *sched_data;
5626 -+
5627 -+ unsigned short ioprio, new_ioprio;
5628 -+ unsigned short ioprio_class, new_ioprio_class;
5629 -+
5630 -+ int ioprio_changed;
5631 -+};
5632 -+
5633 -+struct bfq_group;
5634 -+
5635 -+/**
5636 -+ * struct bfq_queue - leaf schedulable entity.
5637 -+ * @ref: reference counter.
5638 -+ * @bfqd: parent bfq_data.
5639 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
5640 -+ * one or more other queues.
5641 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5642 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5643 -+ * @sort_list: sorted list of pending requests.
5644 -+ * @next_rq: if fifo isn't expired, next request to serve.
5645 -+ * @queued: nr of requests queued in @sort_list.
5646 -+ * @allocated: currently allocated requests.
5647 -+ * @meta_pending: pending metadata requests.
5648 -+ * @fifo: fifo list of requests in sort_list.
5649 -+ * @entity: entity representing this queue in the scheduler.
5650 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
5651 -+ * @budget_timeout: budget expiration (in jiffies).
5652 -+ * @dispatched: number of requests on the dispatch list or inside driver.
5653 -+ * @org_ioprio: saved ioprio during boosted periods.
5654 -+ * @flags: status flags.
5655 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5656 -+ * @seek_samples: number of seeks sampled
5657 -+ * @seek_total: sum of the distances of the seeks sampled
5658 -+ * @seek_mean: mean seek distance
5659 -+ * @last_request_pos: position of the last request enqueued
5660 -+ * @pid: pid of the process owning the queue, used for logging purposes.
5661 -+ * @last_rais_start_finish: start time of the current weight-raising period if
5662 -+ * the @bfq-queue is being weight-raised, otherwise
5663 -+ * finish time of the last weight-raising period
5664 -+ * @raising_cur_max_time: current max raising time for this queue
5665 -+ * @soft_rt_next_start: minimum time instant such that, only if a new request
5666 -+ * is enqueued after this time instant in an idle
5667 -+ * @bfq_queue with no outstanding requests, then the
5668 -+ * task associated with the queue is deemed as soft
5669 -+ * real-time (see the comments to the function
5670 -+ * bfq_bfqq_softrt_next_start())
5671 -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
5672 -+ * idle to backlogged
5673 -+ * @service_from_backlogged: cumulative service received from the @bfq_queue
5674 -+ * since the last transition from idle to backlogged
5675 -+ *
5676 -+ * A bfq_queue is a leaf request queue; it can be associated with one or more
5677 -+ * io_contexts if it is async or shared between cooperating processes. @cgroup
5678 -+ * holds a reference to the cgroup, to be sure that it does not disappear while
5679 -+ * a bfqq still references it (mostly to avoid races between request issuing and
5680 -+ * task migration followed by cgroup destruction).
5681 -+ * All the fields are protected by the queue lock of the containing bfqd.
5682 -+ */
5683 -+struct bfq_queue {
5684 -+ atomic_t ref;
5685 -+ struct bfq_data *bfqd;
5686 -+
5687 -+ /* fields for cooperating queues handling */
5688 -+ struct bfq_queue *new_bfqq;
5689 -+ struct rb_node pos_node;
5690 -+ struct rb_root *pos_root;
5691 -+
5692 -+ struct rb_root sort_list;
5693 -+ struct request *next_rq;
5694 -+ int queued[2];
5695 -+ int allocated[2];
5696 -+ int meta_pending;
5697 -+ struct list_head fifo;
5698 -+
5699 -+ struct bfq_entity entity;
5700 -+
5701 -+ unsigned long max_budget;
5702 -+ unsigned long budget_timeout;
5703 -+
5704 -+ int dispatched;
5705 -+
5706 -+ unsigned short org_ioprio;
5707 -+
5708 -+ unsigned int flags;
5709 -+
5710 -+ struct list_head bfqq_list;
5711 -+
5712 -+ unsigned int seek_samples;
5713 -+ u64 seek_total;
5714 -+ sector_t seek_mean;
5715 -+ sector_t last_request_pos;
5716 -+
5717 -+ pid_t pid;
5718 -+
5719 -+ /* weight-raising fields */
5720 -+ unsigned long raising_cur_max_time;
5721 -+ unsigned long soft_rt_next_start;
5722 -+ unsigned long last_rais_start_finish;
5723 -+ unsigned int raising_coeff;
5724 -+ unsigned long last_idle_bklogged;
5725 -+ unsigned long service_from_backlogged;
5726 -+};
5727 -+
5728 -+/**
5729 -+ * struct bfq_ttime - per process thinktime stats.
5730 -+ * @ttime_total: total process thinktime
5731 -+ * @ttime_samples: number of thinktime samples
5732 -+ * @ttime_mean: average process thinktime
5733 -+ */
5734 -+struct bfq_ttime {
5735 -+ unsigned long last_end_request;
5736 -+
5737 -+ unsigned long ttime_total;
5738 -+ unsigned long ttime_samples;
5739 -+ unsigned long ttime_mean;
5740 -+};
5741 -+
5742 -+/**
5743 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
5744 -+ * @icq: associated io_cq structure
5745 -+ * @bfqq: array of two process queues, the sync and the async
5746 -+ * @ttime: associated @bfq_ttime struct
5747 -+ */
5748 -+struct bfq_io_cq {
5749 -+ struct io_cq icq; /* must be the first member */
5750 -+ struct bfq_queue *bfqq[2];
5751 -+ struct bfq_ttime ttime;
5752 -+ int ioprio;
5753 -+};
5754 -+
5755 -+/**
5756 -+ * struct bfq_data - per device data structure.
5757 -+ * @queue: request queue for the managed device.
5758 -+ * @root_group: root bfq_group for the device.
5759 -+ * @rq_pos_tree: rbtree sorted by next_request position,
5760 -+ * used when determining if two or more queues
5761 -+ * have interleaving requests (see bfq_close_cooperator).
5762 -+ * @busy_queues: number of bfq_queues containing requests (including the
5763 -+ * queue under service, even if it is idling).
5764 -+ * @raised_busy_queues: number of weight-raised busy bfq_queues.
5765 -+ * @queued: number of queued requests.
5766 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
5767 -+ * @sync_flight: number of sync requests in the driver.
5768 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5769 -+ * completed requests.
5770 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5771 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5772 -+ * @budgets_assigned: number of budgets assigned.
5773 -+ * @idle_slice_timer: timer set when idling for the next sequential request
5774 -+ * from the queue under service.
5775 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
5776 -+ * @in_service_queue: bfq_queue under service.
5777 -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
5778 -+ * @last_position: on-disk position of the last served request.
5779 -+ * @last_budget_start: beginning of the last budget.
5780 -+ * @last_idling_start: beginning of the last idle slice.
5781 -+ * @peak_rate: peak transfer rate observed for a budget.
5782 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5783 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5784 -+ * @group_list: list of all the bfq_groups active on the device.
5785 -+ * @active_list: list of all the bfq_queues active on the device.
5786 -+ * @idle_list: list of all the bfq_queues idle on the device.
5787 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
5788 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5789 -+ * requests are served in fifo order.
5790 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5791 -+ * @bfq_back_max: maximum allowed backward seek.
5792 -+ * @bfq_slice_idle: maximum idling time.
5793 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5794 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5795 -+ * async queues.
5796 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5797 -+ * prevent seeky queues from imposing long latencies on
5798 -+ * well-behaved ones (this also implies that seeky queues cannot
5799 -+ * receive guarantees in the service domain; after a timeout
5800 -+ * they are charged for the whole allocated budget, to try
5801 -+ * to preserve a behavior reasonably fair among them, but
5802 -+ * without service-domain guarantees).
5803 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5804 -+ * queue is multiplied
5805 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5806 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5807 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5808 -+ * may be reactivated for a queue (in jiffies)
5809 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5810 -+ * after which weight-raising may be
5811 -+ * reactivated for an already busy queue
5812 -+ * (in jiffies)
5813 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5814 -+ * in sectors per second
5815 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
5816 -+ * duration of the weight raising automatically
5817 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5818 -+ *
5819 -+ * All the fields are protected by the @queue lock.
5820 -+ */
5821 -+struct bfq_data {
5822 -+ struct request_queue *queue;
5823 -+
5824 -+ struct bfq_group *root_group;
5825 -+
5826 -+ struct rb_root rq_pos_tree;
5827 -+
5828 -+ int busy_queues;
5829 -+ int raised_busy_queues;
5830 -+ int queued;
5831 -+ int rq_in_driver;
5832 -+ int sync_flight;
5833 -+
5834 -+ int max_rq_in_driver;
5835 -+ int hw_tag_samples;
5836 -+ int hw_tag;
5837 -+
5838 -+ int budgets_assigned;
5839 -+
5840 -+ struct timer_list idle_slice_timer;
5841 -+ struct work_struct unplug_work;
5842 -+
5843 -+ struct bfq_queue *in_service_queue;
5844 -+ struct bfq_io_cq *in_service_bic;
5845 -+
5846 -+ sector_t last_position;
5847 -+
5848 -+ ktime_t last_budget_start;
5849 -+ ktime_t last_idling_start;
5850 -+ int peak_rate_samples;
5851 -+ u64 peak_rate;
5852 -+ unsigned long bfq_max_budget;
5853 -+
5854 -+ struct hlist_head group_list;
5855 -+ struct list_head active_list;
5856 -+ struct list_head idle_list;
5857 -+
5858 -+ unsigned int bfq_quantum;
5859 -+ unsigned int bfq_fifo_expire[2];
5860 -+ unsigned int bfq_back_penalty;
5861 -+ unsigned int bfq_back_max;
5862 -+ unsigned int bfq_slice_idle;
5863 -+ u64 bfq_class_idle_last_service;
5864 -+
5865 -+ unsigned int bfq_user_max_budget;
5866 -+ unsigned int bfq_max_budget_async_rq;
5867 -+ unsigned int bfq_timeout[2];
5868 -+
5869 -+ bool low_latency;
5870 -+
5871 -+ /* parameters of the low_latency heuristics */
5872 -+ unsigned int bfq_raising_coeff;
5873 -+ unsigned int bfq_raising_max_time;
5874 -+ unsigned int bfq_raising_rt_max_time;
5875 -+ unsigned int bfq_raising_min_idle_time;
5876 -+ unsigned long bfq_raising_min_inter_arr_async;
5877 -+ unsigned int bfq_raising_max_softrt_rate;
5878 -+ u64 RT_prod;
5879 -+
5880 -+ struct bfq_queue oom_bfqq;
5881 -+};
5882 -+
5883 -+enum bfqq_state_flags {
5884 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
5885 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
5886 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
5887 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
5888 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
5889 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
5890 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
5891 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
5892 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
5893 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
5894 -+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
5895 -+};
5896 -+
5897 -+#define BFQ_BFQQ_FNS(name) \
5898 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
5899 -+{ \
5900 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
5901 -+} \
5902 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
5903 -+{ \
5904 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
5905 -+} \
5906 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
5907 -+{ \
5908 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
5909 -+}
5910 -+
5911 -+BFQ_BFQQ_FNS(busy);
5912 -+BFQ_BFQQ_FNS(wait_request);
5913 -+BFQ_BFQQ_FNS(must_alloc);
5914 -+BFQ_BFQQ_FNS(fifo_expire);
5915 -+BFQ_BFQQ_FNS(idle_window);
5916 -+BFQ_BFQQ_FNS(prio_changed);
5917 -+BFQ_BFQQ_FNS(sync);
5918 -+BFQ_BFQQ_FNS(budget_new);
5919 -+BFQ_BFQQ_FNS(coop);
5920 -+BFQ_BFQQ_FNS(split_coop);
5921 -+BFQ_BFQQ_FNS(softrt_update);
5922 -+#undef BFQ_BFQQ_FNS
5923 -+
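
For readers unfamiliar with this idiom, BFQ_BFQQ_FNS(name) generates a mark/clear/test helper triple per flag. Expanding it by hand for the busy flag gives roughly the following (our expansion, not additional patch code):

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
        bfqq->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
        bfqq->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
        return (bfqq->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}
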
5924 -+/* Logging facilities. */
5925 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
5926 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
5927 -+
5928 -+#define bfq_log(bfqd, fmt, args...) \
5929 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
5930 -+
5931 -+/* Expiration reasons. */
5932 -+enum bfqq_expiration {
5933 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
5934 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
5935 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
5936 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
5937 -+};
5938 -+
5939 -+#ifdef CONFIG_CGROUP_BFQIO
5940 -+/**
5941 -+ * struct bfq_group - per (device, cgroup) data structure.
5942 -+ * @entity: schedulable entity to insert into the parent group sched_data.
5943 -+ * @sched_data: own sched_data, to contain child entities (they may be
5944 -+ * both bfq_queues and bfq_groups).
5945 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
5946 -+ * list of the containing cgroup's bfqio_cgroup.
5947 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
5948 -+ * of the groups active on the same device; used for cleanup.
5949 -+ * @bfqd: the bfq_data for the device this group acts upon.
5950 -+ * @async_bfqq: array of async queues for all the tasks belonging to
5951 -+ * the group, one queue per ioprio value per ioprio_class,
5952 -+ * except for the idle class that has only one queue.
5953 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
5954 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
5955 -+ * to avoid too many special cases during group creation/migration.
5956 -+ *
5957 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
5958 -+ * there is a set of bfq_groups, each one collecting the lower-level
5959 -+ * entities belonging to the group that are acting on the same device.
5960 -+ *
5961 -+ * Locking works as follows:
5962 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
5963 -+ * via RCU from its readers.
5964 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
5965 -+ * from the readers.
5966 -+ * o All the other fields are protected by the @bfqd queue lock.
5967 -+ */
5968 -+struct bfq_group {
5969 -+ struct bfq_entity entity;
5970 -+ struct bfq_sched_data sched_data;
5971 -+
5972 -+ struct hlist_node group_node;
5973 -+ struct hlist_node bfqd_node;
5974 -+
5975 -+ void *bfqd;
5976 -+
5977 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5978 -+ struct bfq_queue *async_idle_bfqq;
5979 -+
5980 -+ struct bfq_entity *my_entity;
5981 -+};
5982 -+
5983 -+/**
5984 -+ * struct bfqio_cgroup - bfq cgroup data structure.
5985 -+ * @css: subsystem state for bfq in the containing cgroup.
5986 -+ * @online: flag marked when the subsystem is inserted.
5987 -+ * @weight: cgroup weight.
5988 -+ * @ioprio: cgroup ioprio.
5989 -+ * @ioprio_class: cgroup ioprio_class.
5990 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
5991 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
5992 -+ *
5993 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
5994 -+ * @ioprio and @ioprio_class are protected by @lock.
5995 -+ */
5996 -+struct bfqio_cgroup {
5997 -+ struct cgroup_subsys_state css;
5998 -+ bool online;
5999 -+
6000 -+ unsigned short weight, ioprio, ioprio_class;
6001 -+
6002 -+ spinlock_t lock;
6003 -+ struct hlist_head group_data;
6004 -+};
6005 -+#else
6006 -+struct bfq_group {
6007 -+ struct bfq_sched_data sched_data;
6008 -+
6009 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6010 -+ struct bfq_queue *async_idle_bfqq;
6011 -+};
6012 -+#endif
6013 -+
6014 -+static inline struct bfq_service_tree *
6015 -+bfq_entity_service_tree(struct bfq_entity *entity)
6016 -+{
6017 -+ struct bfq_sched_data *sched_data = entity->sched_data;
6018 -+ unsigned int idx = entity->ioprio_class - 1;
6019 -+
6020 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
6021 -+ BUG_ON(sched_data == NULL);
6022 -+
6023 -+ return sched_data->service_tree + idx;
6024 -+}
6025 -+
6026 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
6027 -+ int is_sync)
6028 -+{
6029 -+ return bic->bfqq[!!is_sync];
6030 -+}
6031 -+
6032 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
6033 -+ struct bfq_queue *bfqq, int is_sync)
6034 -+{
6035 -+ bic->bfqq[!!is_sync] = bfqq;
6036 -+}
6037 -+
6038 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
6039 -+{
6040 -+ return bic->icq.q->elevator->elevator_data;
6041 -+}
6042 -+
6043 -+/**
6044 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
6045 -+ * @ptr: a pointer to a bfqd.
6046 -+ * @flags: storage for the flags to be saved.
6047 -+ *
6048 -+ * This function allows bfqg->bfqd to be protected by the
6049 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
6050 -+ * under RCU, so the storage for bfqd is assured to be safe as long
6051 -+ * as the RCU read side critical section does not end. After the
6052 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
6053 -+ * sure that no other writer accessed it. If we raced with a writer,
6054 -+ * the function returns NULL, with the queue unlocked, otherwise it
6055 -+ * returns the dereferenced pointer, with the queue locked.
6056 -+ */
6057 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
6058 -+ unsigned long *flags)
6059 -+{
6060 -+ struct bfq_data *bfqd;
6061 -+
6062 -+ rcu_read_lock();
6063 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
6064 -+
6065 -+ if (bfqd != NULL) {
6066 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
6067 -+ if (*ptr == bfqd)
6068 -+ goto out;
6069 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6070 -+ }
6071 -+
6072 -+ bfqd = NULL;
6073 -+out:
6074 -+ rcu_read_unlock();
6075 -+ return bfqd;
6076 -+}
6077 -+
6078 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6079 -+ unsigned long *flags)
6080 -+{
6081 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6082 -+}
6083 -+
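
The intended calling pattern for the pair above looks roughly like the following sketch (the bfqg pointer and the work done under the lock are placeholders, not code from the patch):

        unsigned long flags;
        struct bfq_data *bfqd;

        bfqd = bfq_get_bfqd_locked(&(bfqg->bfqd), &flags);
        if (bfqd != NULL) {
                /* queue lock held and pointer revalidated: safe to use bfqd */
                /* ... operate on bfqd here ... */
                bfq_put_bfqd_unlock(bfqd, &flags);
        }
        /* on a NULL return we raced with a writer and no lock is held */
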
6084 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6085 -+static void bfq_put_queue(struct bfq_queue *bfqq);
6086 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6087 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6088 -+ struct bfq_group *bfqg, int is_sync,
6089 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
6090 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6091 -+ struct bfq_group *bfqg);
6092 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6093 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6094 -+#endif
6095 ---
6096 -1.9.0
6097 -
6098
6099 Added: genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch1
6100 ===================================================================
6101 --- genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch1 (rev 0)
6102 +++ genpatches-2.6/trunk/3.14/5002_BFQ-2-block-introduce-the-BFQ-v7r2-I-O-sched-for-3.14.patch1 2014-04-01 18:44:20 UTC (rev 2727)
6103 @@ -0,0 +1,6065 @@
6104 +From 5055277df59d9280da6b60cf90bed8e5e57dc44d Mon Sep 17 00:00:00 2001
6105 +From: Paolo Valente <paolo.valente@×××××××.it>
6106 +Date: Thu, 9 May 2013 19:10:02 +0200
6107 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r2 I/O sched for 3.14
6108 +
6109 +Add the BFQ-v7r2 I/O scheduler to 3.14.
6110 +The general structure is borrowed from CFQ, as much of the code for
6111 +handling I/O contexts. Over time, several useful features have been
6112 +ported from CFQ as well (details in the changelog in README.BFQ). A
6113 +(bfq_)queue is associated to each task doing I/O on a device, and each
6114 +time a scheduling decision has to be made a queue is selected and served
6115 +until it expires.
6116 +
6117 + - Slices are given in the service domain: tasks are assigned
6118 + budgets, measured in number of sectors. Once got the disk, a task
6119 + must however consume its assigned budget within a configurable
6120 + maximum time (by default, the maximum possible value of the
6121 + budgets is automatically computed to comply with this timeout).
6122 + This allows the desired latency vs "throughput boosting" tradeoff
6123 + to be set.
6124 +
6125 + - Budgets are scheduled according to a variant of WF2Q+, implemented
6126 + using an augmented rb-tree to take eligibility into account while
6127 + preserving an O(log N) overall complexity.
6128 +
6129 + - A low-latency tunable is provided; if enabled, both interactive
6130 + and soft real-time applications are guaranteed a very low latency.
6131 +
6132 + - Latency guarantees are preserved also in the presence of NCQ.
6133 +
6134 + - Also with flash-based devices, a high throughput is achieved
6135 + while still preserving latency guarantees.
6136 +
6137 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
6138 + cooperating-queue-merging and the preemption mechanisms present
6139 + in CFQ. EQM is in fact a unified mechanism that tries to get a
6140 + sequential read pattern, and hence a high throughput, with any
6141 + set of processes performing interleaved I/O over a contiguous
6142 + sequence of sectors.
6143 +
6144 + - BFQ supports full hierarchical scheduling, exporting a cgroups
6145 + interface. Since each node has a full scheduler, each group can
6146 + be assigned its own weight.
6147 +
6148 + - If the cgroups interface is not used, only I/O priorities can be
6149 + assigned to processes, with ioprio values mapped to weights
6150 + with the relation weight = IOPRIO_BE_NR - ioprio.
6151 +
6152 + - ioprio classes are served in strict priority order, i.e., lower
6153 + priority queues are not served as long as there are higher
6154 + priority queues. Among queues in the same class the bandwidth is
6155 + distributed in proportion to the weight of each queue. A very
6156 + thin extra bandwidth is however guaranteed to the Idle class, to
6157 + prevent it from starving.
6158 +
6159 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
6160 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6161 +---
6162 + block/bfq-cgroup.c | 926 +++++++++++++++
6163 + block/bfq-ioc.c | 36 +
6164 + block/bfq-iosched.c | 3300 +++++++++++++++++++++++++++++++++++++++++++++++++++
6165 + block/bfq-sched.c | 1078 +++++++++++++++++
6166 + block/bfq.h | 622 ++++++++++
6167 + 5 files changed, 5962 insertions(+)
6168 + create mode 100644 block/bfq-cgroup.c
6169 + create mode 100644 block/bfq-ioc.c
6170 + create mode 100644 block/bfq-iosched.c
6171 + create mode 100644 block/bfq-sched.c
6172 + create mode 100644 block/bfq.h
6173 +
6174 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
6175 +new file mode 100644
6176 +index 0000000..bcecdb4
6177 +--- /dev/null
6178 ++++ b/block/bfq-cgroup.c
6179 +@@ -0,0 +1,926 @@
6180 ++/*
6181 ++ * BFQ: CGROUPS support.
6182 ++ *
6183 ++ * Based on ideas and code from CFQ:
6184 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6185 ++ *
6186 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6187 ++ * Paolo Valente <paolo.valente@×××××××.it>
6188 ++ *
6189 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6190 ++ *
6191 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
6192 ++ */
6193 ++
6194 ++#ifdef CONFIG_CGROUP_BFQIO
6195 ++
6196 ++static DEFINE_MUTEX(bfqio_mutex);
6197 ++
6198 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
6199 ++{
6200 ++ return bgrp ? !bgrp->online : false;
6201 ++}
6202 ++
6203 ++static struct bfqio_cgroup bfqio_root_cgroup = {
6204 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
6205 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
6206 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
6207 ++};
6208 ++
6209 ++static inline void bfq_init_entity(struct bfq_entity *entity,
6210 ++ struct bfq_group *bfqg)
6211 ++{
6212 ++ entity->weight = entity->new_weight;
6213 ++ entity->orig_weight = entity->new_weight;
6214 ++ entity->ioprio = entity->new_ioprio;
6215 ++ entity->ioprio_class = entity->new_ioprio_class;
6216 ++ entity->parent = bfqg->my_entity;
6217 ++ entity->sched_data = &bfqg->sched_data;
6218 ++}
6219 ++
6220 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
6221 ++{
6222 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
6223 ++}
6224 ++
6225 ++/*
6226 ++ * Search for the bfq_group associated with bfqd in the hash table (for now
6227 ++ * just a list) of bgrp. Must be called under rcu_read_lock().
6228 ++ */
6229 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
6230 ++ struct bfq_data *bfqd)
6231 ++{
6232 ++ struct bfq_group *bfqg;
6233 ++ void *key;
6234 ++
6235 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
6236 ++ key = rcu_dereference(bfqg->bfqd);
6237 ++ if (key == bfqd)
6238 ++ return bfqg;
6239 ++ }
6240 ++
6241 ++ return NULL;
6242 ++}
6243 ++
6244 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
6245 ++ struct bfq_group *bfqg)
6246 ++{
6247 ++ struct bfq_entity *entity = &bfqg->entity;
6248 ++
6249 ++ /*
6250 ++ * If the weight of the entity has never been set via the sysfs
6251 ++ * interface, then bgrp->weight == 0. In this case we initialize
6252 ++ * the weight from the current ioprio value. Otherwise, the group
6253 ++ * weight, if set, has priority over the ioprio value.
6254 ++ */
6255 ++ if (bgrp->weight == 0) {
6256 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
6257 ++ entity->new_ioprio = bgrp->ioprio;
6258 ++ } else {
6259 ++ entity->new_weight = bgrp->weight;
6260 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
6261 ++ }
6262 ++ entity->orig_weight = entity->weight = entity->new_weight;
6263 ++ entity->ioprio = entity->new_ioprio;
6264 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
6265 ++ entity->my_sched_data = &bfqg->sched_data;
6266 ++}
6267 ++
6268 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
6269 ++ struct bfq_group *parent)
6270 ++{
6271 ++ struct bfq_entity *entity;
6272 ++
6273 ++ BUG_ON(parent == NULL);
6274 ++ BUG_ON(bfqg == NULL);
6275 ++
6276 ++ entity = &bfqg->entity;
6277 ++ entity->parent = parent->my_entity;
6278 ++ entity->sched_data = &parent->sched_data;
6279 ++}
6280 ++
6281 ++/**
6282 ++ * bfq_group_chain_alloc - allocate a chain of groups.
6283 ++ * @bfqd: queue descriptor.
6284 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
6285 ++ *
6286 ++ * Allocate a chain of groups starting from the one belonging to
6287 ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
6288 ++ * to the root has already an allocated group on @bfqd.
6289 ++ */
6290 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
6291 ++ struct cgroup_subsys_state *css)
6292 ++{
6293 ++ struct bfqio_cgroup *bgrp;
6294 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
6295 ++
6296 ++ for (; css != NULL; css = css->parent) {
6297 ++ bgrp = css_to_bfqio(css);
6298 ++
6299 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6300 ++ if (bfqg != NULL) {
6301 ++ /*
6302 ++ * All the cgroups in the path from there to the
6303 ++ * root must have a bfq_group for bfqd, so we don't
6304 ++ * need any more allocations.
6305 ++ */
6306 ++ break;
6307 ++ }
6308 ++
6309 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
6310 ++ if (bfqg == NULL)
6311 ++ goto cleanup;
6312 ++
6313 ++ bfq_group_init_entity(bgrp, bfqg);
6314 ++ bfqg->my_entity = &bfqg->entity;
6315 ++
6316 ++ if (leaf == NULL) {
6317 ++ leaf = bfqg;
6318 ++ prev = leaf;
6319 ++ } else {
6320 ++ bfq_group_set_parent(prev, bfqg);
6321 ++ /*
6322 ++ * Build a list of allocated nodes using the bfqd
6323 ++ * field, which is still unused and will be initialized
6324 ++ * only after the node has been connected.
6325 ++ */
6326 ++ prev->bfqd = bfqg;
6327 ++ prev = bfqg;
6328 ++ }
6329 ++ }
6330 ++
6331 ++ return leaf;
6332 ++
6333 ++cleanup:
6334 ++ while (leaf != NULL) {
6335 ++ prev = leaf;
6336 ++ leaf = leaf->bfqd;
6337 ++ kfree(prev);
6338 ++ }
6339 ++
6340 ++ return NULL;
6341 ++}
6342 ++
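
While the chain is being built, the still-unused bfqd pointer of each freshly allocated group doubles as a "next" link; the cleanup path above and bfq_group_chain_link() below walk exactly that list. Schematically (our own illustration of the temporary links):

/*
 *   leaf->bfqd        == parent group
 *   parent->bfqd      == grandparent group
 *   grandparent->bfqd == NULL        (kzalloc'ed: end of the chain)
 *
 * bfq_group_chain_link() then follows the same links, replacing each
 * temporary pointer with the real rcu_assign_pointer(leaf->bfqd, bfqd).
 */
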
6343 ++/**
6344 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
6345 ++ * @bfqd: the queue descriptor.
6346 ++ * @css: the leaf cgroup_subsys_state to start from.
6347 ++ * @leaf: the leaf group (to be associated to @css).
6348 ++ *
6349 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
6350 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
6351 ++ * hierarchy that already has a group associated to @bfqd, all the nodes
6352 ++ * in the path to the root cgroup have one too.
6353 ++ *
6354 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
6355 ++ * per device) while the bfqio_cgroup lock protects the list of groups
6356 ++ * belonging to the same cgroup.
6357 ++ */
6358 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
6359 ++ struct cgroup_subsys_state *css,
6360 ++ struct bfq_group *leaf)
6361 ++{
6362 ++ struct bfqio_cgroup *bgrp;
6363 ++ struct bfq_group *bfqg, *next, *prev = NULL;
6364 ++ unsigned long flags;
6365 ++
6366 ++ assert_spin_locked(bfqd->queue->queue_lock);
6367 ++
6368 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
6369 ++ bgrp = css_to_bfqio(css);
6370 ++ next = leaf->bfqd;
6371 ++
6372 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6373 ++ BUG_ON(bfqg != NULL);
6374 ++
6375 ++ spin_lock_irqsave(&bgrp->lock, flags);
6376 ++
6377 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
6378 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
6379 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
6380 ++
6381 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
6382 ++
6383 ++ prev = leaf;
6384 ++ leaf = next;
6385 ++ }
6386 ++
6387 ++ BUG_ON(css == NULL && leaf != NULL);
6388 ++ if (css != NULL && prev != NULL) {
6389 ++ bgrp = css_to_bfqio(css);
6390 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6391 ++ bfq_group_set_parent(prev, bfqg);
6392 ++ }
6393 ++}
6394 ++
6395 ++/**
6396 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @css.
6397 ++ * @bfqd: queue descriptor.
6398 ++ * @css: the cgroup_subsys_state being searched for.
6399 ++ *
6400 ++ * Return a group associated to @bfqd in @css, allocating one if
6401 ++ * necessary. When a group is returned, all the cgroups in the path
6402 ++ * to the root have a group associated to @bfqd.
6403 ++ *
6404 ++ * If the allocation fails, return the root group: this breaks guarantees
6405 ++ * but is a safe fallback. If this loss becomes a problem it can be
6406 ++ * mitigated using the equivalent weight (given by the product of the
6407 ++ * weights of the groups in the path from @group to the root) in the
6408 ++ * root scheduler.
6409 ++ *
6410 ++ * We allocate all the missing nodes in the path from the leaf cgroup
6411 ++ * to the root and we connect the nodes only after all the allocations
6412 ++ * have been successful.
6413 ++ */
6414 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
6415 ++ struct cgroup_subsys_state *css)
6416 ++{
6417 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
6418 ++ struct bfq_group *bfqg;
6419 ++
6420 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6421 ++ if (bfqg != NULL)
6422 ++ return bfqg;
6423 ++
6424 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
6425 ++ if (bfqg != NULL)
6426 ++ bfq_group_chain_link(bfqd, css, bfqg);
6427 ++ else
6428 ++ bfqg = bfqd->root_group;
6429 ++
6430 ++ return bfqg;
6431 ++}
6432 ++
6433 ++/**
6434 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
6435 ++ * @bfqd: queue descriptor.
6436 ++ * @bfqq: the queue to move.
6437 ++ * @entity: @bfqq's entity.
6438 ++ * @bfqg: the group to move to.
6439 ++ *
6440 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
6441 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
6442 ++ *
6443 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
6444 ++ * not disappear (by now this just means that we are called under
6445 ++ * rcu_read_lock()).
6446 ++ */
6447 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6448 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
6449 ++{
6450 ++ int busy, resume;
6451 ++
6452 ++ busy = bfq_bfqq_busy(bfqq);
6453 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
6454 ++
6455 ++ BUG_ON(resume && !entity->on_st);
6456 ++ BUG_ON(busy && !resume && entity->on_st &&
6457 ++ bfqq != bfqd->in_service_queue);
6458 ++
6459 ++ if (busy) {
6460 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
6461 ++
6462 ++ if (!resume)
6463 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
6464 ++ else
6465 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
6466 ++ } else if (entity->on_st)
6467 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
6468 ++
6469 ++ /*
6470 ++ * Here we use a reference to bfqg. We don't need a refcounter
6471 ++ * as the cgroup reference will not be dropped, so that its
6472 ++ * destroy() callback will not be invoked.
6473 ++ */
6474 ++ entity->parent = bfqg->my_entity;
6475 ++ entity->sched_data = &bfqg->sched_data;
6476 ++
6477 ++ if (busy && resume)
6478 ++ bfq_activate_bfqq(bfqd, bfqq);
6479 ++
6480 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
6481 ++ bfq_schedule_dispatch(bfqd);
6482 ++}
6483 ++
6484 ++/**
6485 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
6486 ++ * @bfqd: the queue descriptor.
6487 ++ * @bic: the bic to move.
6488 ++ * @cgroup: the cgroup to move to.
6489 ++ *
6490 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
6491 ++ * has to make sure that the reference to cgroup is valid across the call.
6492 ++ *
6493 ++ * NOTE: an alternative approach might have been to store the current
6494 ++ * cgroup in bfqq and get a reference to it, reducing the lookup
6495 ++ * time here, at the price of slightly more complex code.
6496 ++ */
6497 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
6498 ++ struct bfq_io_cq *bic,
6499 ++ struct cgroup_subsys_state *css)
6500 ++{
6501 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
6502 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
6503 ++ struct bfq_entity *entity;
6504 ++ struct bfq_group *bfqg;
6505 ++ struct bfqio_cgroup *bgrp;
6506 ++
6507 ++ bgrp = css_to_bfqio(css);
6508 ++
6509 ++ bfqg = bfq_find_alloc_group(bfqd, css);
6510 ++ if (async_bfqq != NULL) {
6511 ++ entity = &async_bfqq->entity;
6512 ++
6513 ++ if (entity->sched_data != &bfqg->sched_data) {
6514 ++ bic_set_bfqq(bic, NULL, 0);
6515 ++ bfq_log_bfqq(bfqd, async_bfqq,
6516 ++ "bic_change_group: %p %d",
6517 ++ async_bfqq, atomic_read(&async_bfqq->ref));
6518 ++ bfq_put_queue(async_bfqq);
6519 ++ }
6520 ++ }
6521 ++
6522 ++ if (sync_bfqq != NULL) {
6523 ++ entity = &sync_bfqq->entity;
6524 ++ if (entity->sched_data != &bfqg->sched_data)
6525 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
6526 ++ }
6527 ++
6528 ++ return bfqg;
6529 ++}
6530 ++
6531 ++/**
6532 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
6533 ++ * @bic: the bic being migrated.
6534 ++ * @cgroup: the destination cgroup.
6535 ++ *
6536 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
6537 ++ * moved into its new parent group.
6538 ++ */
6539 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
6540 ++ struct cgroup_subsys_state *css)
6541 ++{
6542 ++ struct bfq_data *bfqd;
6543 ++ unsigned long uninitialized_var(flags);
6544 ++
6545 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
6546 ++ &flags);
6547 ++ if (bfqd != NULL) {
6548 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
6549 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6550 ++ }
6551 ++}
6552 ++
6553 ++/**
6554 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
6555 ++ * @bic: the @bic to update.
6556 ++ *
6557 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
6558 ++ * We need this in addition to moving bics during the cgroup attach
6559 ++ * phase because the task owning @bic could be at its first disk
6560 ++ * access or we may end up in the root cgroup as the result of a
6561 ++ * memory allocation failure and here we try to move to the right
6562 ++ * group.
6563 ++ *
6564 ++ * Must be called under the queue lock. It is safe to use the returned
6565 ++ * value even after the rcu_read_unlock() as the migration/destruction
6566 ++ * paths act under the queue lock too. IOW it is impossible to race with
6567 ++ * group migration/destruction and end up with an invalid group as:
6568 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
6569 ++ * has started execution, as current holds a reference to it,
6570 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
6571 ++ * migrated to a different cgroup] its attach() callback will have
6572 ++ * taken care of removing all the references to the old cgroup data.
6573 ++ */
6574 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
6575 ++{
6576 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
6577 ++ struct bfq_group *bfqg;
6578 ++ struct cgroup_subsys_state *css;
6579 ++
6580 ++ BUG_ON(bfqd == NULL);
6581 ++
6582 ++ rcu_read_lock();
6583 ++ css = task_css(current, bfqio_subsys_id);
6584 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
6585 ++ rcu_read_unlock();
6586 ++
6587 ++ return bfqg;
6588 ++}
6589 ++
6590 ++/**
6591 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
6592 ++ * @st: the service tree being flushed.
6593 ++ */
6594 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
6595 ++{
6596 ++ struct bfq_entity *entity = st->first_idle;
6597 ++
6598 ++ for (; entity != NULL; entity = st->first_idle)
6599 ++ __bfq_deactivate_entity(entity, 0);
6600 ++}
6601 ++
6602 ++/**
6603 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
6604 ++ * @bfqd: the device data structure with the root group.
6605 ++ * @entity: the entity to move.
6606 ++ */
6607 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
6608 ++ struct bfq_entity *entity)
6609 ++{
6610 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
6611 ++
6612 ++ BUG_ON(bfqq == NULL);
6613 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
6614 ++ return;
6615 ++}
6616 ++
6617 ++/**
6618 ++ * bfq_reparent_active_entities - move to the root group all active entities.
6619 ++ * @bfqd: the device data structure with the root group.
6620 ++ * @bfqg: the group to move from.
6621 ++ * @st: the service tree with the entities.
6622 ++ *
6623 ++ * Needs queue_lock to be taken and reference to be valid over the call.
6624 ++ */
6625 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
6626 ++ struct bfq_group *bfqg,
6627 ++ struct bfq_service_tree *st)
6628 ++{
6629 ++ struct rb_root *active = &st->active;
6630 ++ struct bfq_entity *entity = NULL;
6631 ++
6632 ++ if (!RB_EMPTY_ROOT(&st->active))
6633 ++ entity = bfq_entity_of(rb_first(active));
6634 ++
6635 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
6636 ++ bfq_reparent_leaf_entity(bfqd, entity);
6637 ++
6638 ++ if (bfqg->sched_data.in_service_entity != NULL)
6639 ++ bfq_reparent_leaf_entity(bfqd,
6640 ++ bfqg->sched_data.in_service_entity);
6641 ++
6642 ++ return;
6643 ++}
6644 ++
6645 ++/**
6646 ++ * bfq_destroy_group - destroy @bfqg.
6647 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
6648 ++ * @bfqg: the group being destroyed.
6649 ++ *
6650 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
6651 ++ */
6652 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
6653 ++{
6654 ++ struct bfq_data *bfqd;
6655 ++ struct bfq_service_tree *st;
6656 ++ struct bfq_entity *entity = bfqg->my_entity;
6657 ++ unsigned long uninitialized_var(flags);
6658 ++ int i;
6659 ++
6660 ++ hlist_del(&bfqg->group_node);
6661 ++
6662 ++ /*
6663 ++ * Empty all service_trees belonging to this group before deactivating
6664 ++ * the group itself.
6665 ++ */
6666 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
6667 ++ st = bfqg->sched_data.service_tree + i;
6668 ++
6669 ++ /*
6670 ++ * The idle tree may still contain bfq_queues belonging
6671 ++ * to exited tasks because they never migrated to a different
6672 ++ * cgroup from the one being destroyed now. No one else
6673 ++ * can access them so it's safe to act without any lock.
6674 ++ */
6675 ++ bfq_flush_idle_tree(st);
6676 ++
6677 ++ /*
6678 ++ * It may happen that some queues are still active
6679 ++ * (busy) upon group destruction (if the corresponding
6680 ++ * processes have been forced to terminate). We move
6681 ++ * all the leaf entities corresponding to these queues
6682 ++ * to the root_group.
6683 ++ * Also, it may happen that the group has an entity
6684 ++ * under service, which is disconnected from the active
6685 ++ * tree: it must be moved, too.
6686 ++ * There is no need to put the sync queues, as the
6687 ++ * scheduler has taken no reference.
6688 ++ */
6689 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6690 ++ if (bfqd != NULL) {
6691 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
6692 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6693 ++ }
6694 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
6695 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
6696 ++ }
6697 ++ BUG_ON(bfqg->sched_data.next_in_service != NULL);
6698 ++ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
6699 ++
6700 ++ /*
6701 ++ * We may race with device destruction, take extra care when
6702 ++ * dereferencing bfqg->bfqd.
6703 ++ */
6704 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6705 ++ if (bfqd != NULL) {
6706 ++ hlist_del(&bfqg->bfqd_node);
6707 ++ __bfq_deactivate_entity(entity, 0);
6708 ++ bfq_put_async_queues(bfqd, bfqg);
6709 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6710 ++ }
6711 ++ BUG_ON(entity->tree != NULL);
6712 ++
6713 ++ /*
6714 ++ * No need to defer the kfree() to the end of the RCU grace
6715 ++ * period: we are called from the destroy() callback of our
6716 ++ * cgroup, so we can be sure that no one is a) still using
6717 ++ * this cgroup or b) doing lookups in it.
6718 ++ */
6719 ++ kfree(bfqg);
6720 ++}
6721 ++
6722 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
6723 ++{
6724 ++ struct hlist_node *tmp;
6725 ++ struct bfq_group *bfqg;
6726 ++
6727 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
6728 ++ bfq_end_raising_async_queues(bfqd, bfqg);
6729 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
6730 ++}
6731 ++
6732 ++/**
6733 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
6734 ++ * @bfqd: the device descriptor being exited.
6735 ++ *
6736 ++ * When the device exits we just make sure that no lookup can return
6737 ++ * the now unused group structures. They will be deallocated on cgroup
6738 ++ * destruction.
6739 ++ */
6740 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
6741 ++{
6742 ++ struct hlist_node *tmp;
6743 ++ struct bfq_group *bfqg;
6744 ++
6745 ++ bfq_log(bfqd, "disconnect_groups beginning");
6746 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
6747 ++ hlist_del(&bfqg->bfqd_node);
6748 ++
6749 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
6750 ++
6751 ++ /*
6752 ++ * Don't remove from the group hash, just set an
6753 ++ * invalid key. No lookups can race with the
6754 ++ * assignment as bfqd is being destroyed; this
6755 ++ * implies also that new elements cannot be added
6756 ++ * to the list.
6757 ++ */
6758 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
6759 ++
6760 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
6761 ++ bfqg);
6762 ++ bfq_put_async_queues(bfqd, bfqg);
6763 ++ }
6764 ++}
6765 ++
6766 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
6767 ++{
6768 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
6769 ++ struct bfq_group *bfqg = bfqd->root_group;
6770 ++
6771 ++ bfq_put_async_queues(bfqd, bfqg);
6772 ++
6773 ++ spin_lock_irq(&bgrp->lock);
6774 ++ hlist_del_rcu(&bfqg->group_node);
6775 ++ spin_unlock_irq(&bgrp->lock);
6776 ++
6777 ++ /*
6778 ++ * No need to synchronize_rcu() here: since the device is gone
6779 ++ * there cannot be any read-side access to its root_group.
6780 ++ */
6781 ++ kfree(bfqg);
6782 ++}
6783 ++
6784 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
6785 ++{
6786 ++ struct bfq_group *bfqg;
6787 ++ struct bfqio_cgroup *bgrp;
6788 ++ int i;
6789 ++
6790 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
6791 ++ if (bfqg == NULL)
6792 ++ return NULL;
6793 ++
6794 ++ bfqg->entity.parent = NULL;
6795 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6796 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6797 ++
6798 ++ bgrp = &bfqio_root_cgroup;
6799 ++ spin_lock_irq(&bgrp->lock);
6800 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
6801 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
6802 ++ spin_unlock_irq(&bgrp->lock);
6803 ++
6804 ++ return bfqg;
6805 ++}
6806 ++
6807 ++#define SHOW_FUNCTION(__VAR) \
6808 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
6809 ++ struct cftype *cftype) \
6810 ++{ \
6811 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
6812 ++ u64 ret = -ENODEV; \
6813 ++ \
6814 ++ mutex_lock(&bfqio_mutex); \
6815 ++ if (bfqio_is_removed(bgrp)) \
6816 ++ goto out_unlock; \
6817 ++ \
6818 ++ spin_lock_irq(&bgrp->lock); \
6819 ++ ret = bgrp->__VAR; \
6820 ++ spin_unlock_irq(&bgrp->lock); \
6821 ++ \
6822 ++out_unlock: \
6823 ++ mutex_unlock(&bfqio_mutex); \
6824 ++ return ret; \
6825 ++}
6826 ++
6827 ++SHOW_FUNCTION(weight);
6828 ++SHOW_FUNCTION(ioprio);
6829 ++SHOW_FUNCTION(ioprio_class);
6830 ++#undef SHOW_FUNCTION
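
For readers unfamiliar with this macro-generation idiom, the following self-contained user-space reduction shows the same pattern on a hypothetical struct: a single macro body stamps out one read accessor per field, exactly as SHOW_FUNCTION() does above for weight, ioprio and ioprio_class.

    #include <stdio.h>

    struct cfg {
            unsigned short weight, ioprio, ioprio_class;
    };

    /* One macro body, one generated getter per field. */
    #define SHOW(field) \
    static unsigned short cfg_##field##_read(const struct cfg *c) \
    { \
            return c->field; \
    }

    SHOW(weight)
    SHOW(ioprio)
    SHOW(ioprio_class)
    #undef SHOW

    int main(void)
    {
            struct cfg c = { .weight = 10, .ioprio = 4, .ioprio_class = 2 };

            printf("%hu %hu %hu\n", cfg_weight_read(&c), cfg_ioprio_read(&c),
                   cfg_ioprio_class_read(&c));
            return 0;
    }
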
6831 ++
6832 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
6833 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
6834 ++ struct cftype *cftype, \
6835 ++ u64 val) \
6836 ++{ \
6837 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
6838 ++ struct bfq_group *bfqg; \
6839 ++ int ret = -EINVAL; \
6840 ++ \
6841 ++ if (val < (__MIN) || val > (__MAX)) \
6842 ++ return ret; \
6843 ++ \
6844 ++ ret = -ENODEV; \
6845 ++ mutex_lock(&bfqio_mutex); \
6846 ++ if (bfqio_is_removed(bgrp)) \
6847 ++ goto out_unlock; \
6848 ++ ret = 0; \
6849 ++ \
6850 ++ spin_lock_irq(&bgrp->lock); \
6851 ++ bgrp->__VAR = (unsigned short)val; \
6852 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
6853 ++ /* \
6854 ++ * Setting the ioprio_changed flag of the entity \
6855 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
6856 ++ * the value of the weight to its ioprio mapping. \
6857 ++ * Set the flag only if necessary. \
6858 ++ */ \
6859 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
6860 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
6861 ++ /* \
6862 ++ * Make sure that the above new value has been \
6863 ++ * stored in bfqg->entity.new_##__VAR before \
6864 ++ * setting the ioprio_changed flag. In fact, \
6865 ++ * this flag may be read asynchronously (in \
6866 ++ * critical sections protected by a different \
6867 ++ * lock than that held here), and finding this \
6868 ++ * flag set may cause the execution of the code \
6869 ++ * for updating parameters whose value may \
6870 ++ * depend also on bfqg->entity.new_##__VAR (in \
6871 ++ * __bfq_entity_update_weight_prio). \
6872 ++ * This barrier makes sure that the new value \
6873 ++ * of bfqg->entity.new_##__VAR is correctly \
6874 ++ * seen in that code. \
6875 ++ */ \
6876 ++ smp_wmb(); \
6877 ++ bfqg->entity.ioprio_changed = 1; \
6878 ++ } \
6879 ++ } \
6880 ++ spin_unlock_irq(&bgrp->lock); \
6881 ++ \
6882 ++out_unlock: \
6883 ++ mutex_unlock(&bfqio_mutex); \
6884 ++ return ret; \
6885 ++}
6886 ++
6887 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
6888 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
6889 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
6890 ++#undef STORE_FUNCTION
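
The smp_wmb() in the store path above implements a publish pattern: the new value must be globally visible before the ioprio_changed flag can be observed as set. A rough user-space analogue, written with C11 atomics instead of the kernel barrier primitives (names and values are hypothetical, and writer/reader would normally run on different threads), looks like this:

    #include <stdatomic.h>
    #include <stdio.h>

    static int new_weight;              /* payload, written first       */
    static _Atomic int ioprio_changed;  /* flag, published with release */

    static void writer(int val)
    {
            new_weight = val;
            /* The release store plays the role of smp_wmb(): the write to
             * new_weight cannot be observed after the flag is seen set. */
            atomic_store_explicit(&ioprio_changed, 1, memory_order_release);
    }

    static void reader(void)
    {
            if (atomic_load_explicit(&ioprio_changed, memory_order_acquire))
                    printf("picked up new weight %d\n", new_weight);
    }

    int main(void)
    {
            writer(20);
            reader();
            return 0;
    }
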
6891 ++
6892 ++static struct cftype bfqio_files[] = {
6893 ++ {
6894 ++ .name = "weight",
6895 ++ .read_u64 = bfqio_cgroup_weight_read,
6896 ++ .write_u64 = bfqio_cgroup_weight_write,
6897 ++ },
6898 ++ {
6899 ++ .name = "ioprio",
6900 ++ .read_u64 = bfqio_cgroup_ioprio_read,
6901 ++ .write_u64 = bfqio_cgroup_ioprio_write,
6902 ++ },
6903 ++ {
6904 ++ .name = "ioprio_class",
6905 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
6906 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
6907 ++ },
6908 ++ { }, /* terminate */
6909 ++};
6910 ++
6911 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
6912 ++ *parent_css)
6913 ++{
6914 ++ struct bfqio_cgroup *bgrp;
6915 ++
6916 ++ if (parent_css != NULL) {
6917 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
6918 ++ if (bgrp == NULL)
6919 ++ return ERR_PTR(-ENOMEM);
6920 ++ } else
6921 ++ bgrp = &bfqio_root_cgroup;
6922 ++
6923 ++ spin_lock_init(&bgrp->lock);
6924 ++ INIT_HLIST_HEAD(&bgrp->group_data);
6925 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
6926 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
6927 ++
6928 ++ return &bgrp->css;
6929 ++}
6930 ++
6931 ++/*
6932 ++ * We cannot support shared io contexts, as we have no means to support
6933 ++ * two tasks with the same ioc in two different groups without major rework
6934 ++ * of the main bic/bfqq data structures. For now we allow a task to change
6935 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
6936 ++ * behavior is that a group containing a task that forked using CLONE_IO
6937 ++ * will not be destroyed until the tasks sharing the ioc die.
6938 ++ */
6939 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
6940 ++ struct cgroup_taskset *tset)
6941 ++{
6942 ++ struct task_struct *task;
6943 ++ struct io_context *ioc;
6944 ++ int ret = 0;
6945 ++
6946 ++ cgroup_taskset_for_each(task, css, tset) {
6947 ++ /*
6948 ++ * task_lock() is needed to avoid races with
6949 ++ * exit_io_context()
6950 ++ */
6951 ++ task_lock(task);
6952 ++ ioc = task->io_context;
6953 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
6954 ++ /*
6955 ++ * ioc == NULL means that the task is either too young
6956 ++ * or exiting: if it still has no ioc, the ioc can't be
6957 ++ * shared; if the task is exiting, the attach will fail
6958 ++ * anyway, no matter what we return here.
6959 ++ */
6960 ++ ret = -EINVAL;
6961 ++ task_unlock(task);
6962 ++ if (ret)
6963 ++ break;
6964 ++ }
6965 ++
6966 ++ return ret;
6967 ++}
6968 ++
6969 ++static void bfqio_attach(struct cgroup_subsys_state *css,
6970 ++ struct cgroup_taskset *tset)
6971 ++{
6972 ++ struct task_struct *task;
6973 ++ struct io_context *ioc;
6974 ++ struct io_cq *icq;
6975 ++
6976 ++ /*
6977 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
6978 ++ * new group has not yet been tested.
6979 ++ */
6980 ++ cgroup_taskset_for_each(task, css, tset) {
6981 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
6982 ++ if (ioc) {
6983 ++ /*
6984 ++ * Handle cgroup change here.
6985 ++ */
6986 ++ rcu_read_lock();
6987 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
6988 ++ if (!strncmp(
6989 ++ icq->q->elevator->type->elevator_name,
6990 ++ "bfq", ELV_NAME_MAX))
6991 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
6992 ++ css);
6993 ++ rcu_read_unlock();
6994 ++ put_io_context(ioc);
6995 ++ }
6996 ++ }
6997 ++}
6998 ++
6999 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
7000 ++{
7001 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7002 ++ struct hlist_node *tmp;
7003 ++ struct bfq_group *bfqg;
7004 ++
7005 ++ /*
7006 ++ * Since we are destroying the cgroup, there are no more tasks
7007 ++ * referencing it, and all the RCU grace periods that may have
7008 ++ * referenced it are ended (as the destruction of the parent
7009 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
7010 ++ * anything else and we don't need any synchronization.
7011 ++ */
7012 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
7013 ++ bfq_destroy_group(bgrp, bfqg);
7014 ++
7015 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
7016 ++
7017 ++ kfree(bgrp);
7018 ++}
7019 ++
7020 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
7021 ++{
7022 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7023 ++
7024 ++ mutex_lock(&bfqio_mutex);
7025 ++ bgrp->online = true;
7026 ++ mutex_unlock(&bfqio_mutex);
7027 ++
7028 ++ return 0;
7029 ++}
7030 ++
7031 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
7032 ++{
7033 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7034 ++
7035 ++ mutex_lock(&bfqio_mutex);
7036 ++ bgrp->online = false;
7037 ++ mutex_unlock(&bfqio_mutex);
7038 ++}
7039 ++
7040 ++struct cgroup_subsys bfqio_subsys = {
7041 ++ .name = "bfqio",
7042 ++ .css_alloc = bfqio_create,
7043 ++ .css_online = bfqio_css_online,
7044 ++ .css_offline = bfqio_css_offline,
7045 ++ .can_attach = bfqio_can_attach,
7046 ++ .attach = bfqio_attach,
7047 ++ .css_free = bfqio_destroy,
7048 ++ .subsys_id = bfqio_subsys_id,
7049 ++ .base_cftypes = bfqio_files,
7050 ++};
7051 ++#else
7052 ++static inline void bfq_init_entity(struct bfq_entity *entity,
7053 ++ struct bfq_group *bfqg)
7054 ++{
7055 ++ entity->weight = entity->new_weight;
7056 ++ entity->orig_weight = entity->new_weight;
7057 ++ entity->ioprio = entity->new_ioprio;
7058 ++ entity->ioprio_class = entity->new_ioprio_class;
7059 ++ entity->sched_data = &bfqg->sched_data;
7060 ++}
7061 ++
7062 ++static inline struct bfq_group *
7063 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7064 ++{
7065 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
7066 ++ return bfqd->root_group;
7067 ++}
7068 ++
7069 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
7070 ++ struct bfq_queue *bfqq,
7071 ++ struct bfq_entity *entity,
7072 ++ struct bfq_group *bfqg)
7073 ++{
7074 ++}
7075 ++
7076 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
7077 ++{
7078 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
7079 ++}
7080 ++
7081 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
7082 ++{
7083 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
7084 ++}
7085 ++
7086 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
7087 ++{
7088 ++ kfree(bfqd->root_group);
7089 ++}
7090 ++
7091 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7092 ++{
7093 ++ struct bfq_group *bfqg;
7094 ++ int i;
7095 ++
7096 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7097 ++ if (bfqg == NULL)
7098 ++ return NULL;
7099 ++
7100 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7101 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7102 ++
7103 ++ return bfqg;
7104 ++}
7105 ++#endif
7106 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
7107 +new file mode 100644
7108 +index 0000000..7f6b000
7109 +--- /dev/null
7110 ++++ b/block/bfq-ioc.c
7111 +@@ -0,0 +1,36 @@
7112 ++/*
7113 ++ * BFQ: I/O context handling.
7114 ++ *
7115 ++ * Based on ideas and code from CFQ:
7116 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7117 ++ *
7118 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7119 ++ * Paolo Valente <paolo.valente@×××××××.it>
7120 ++ *
7121 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7122 ++ */
7123 ++
7124 ++/**
7125 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
7126 ++ * @icq: the iocontext queue.
7127 ++ */
7128 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
7129 ++{
7130 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
7131 ++ return container_of(icq, struct bfq_io_cq, icq);
7132 ++}
7133 ++
7134 ++/**
7135 ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
7136 ++ * @bfqd: the lookup key.
7137 ++ * @ioc: the io_context of the process doing I/O.
7138 ++ *
7139 ++ * Queue lock must be held.
7140 ++ */
7141 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
7142 ++ struct io_context *ioc)
7143 ++{
7144 ++ if (ioc)
7145 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
7146 ++ return NULL;
7147 ++}
7148 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7149 +new file mode 100644
7150 +index 0000000..f5f71e4
7151 +--- /dev/null
7152 ++++ b/block/bfq-iosched.c
7153 +@@ -0,0 +1,3300 @@
7154 ++/*
7155 ++ * Budget Fair Queueing (BFQ) disk scheduler.
7156 ++ *
7157 ++ * Based on ideas and code from CFQ:
7158 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7159 ++ *
7160 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7161 ++ * Paolo Valente <paolo.valente@×××××××.it>
7162 ++ *
7163 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7164 ++ *
7165 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7166 ++ *
7167 ++ * BFQ is a proportional share disk scheduling algorithm based on the
7168 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
7169 ++ * number of sectors, to tasks instead of time slices. The disk is not granted
7170 ++ * to the in-service task for a given time slice, but until it has exhausted
7171 ++ * its assigned budget. This change from the time to the service domain allows
7172 ++ * BFQ to distribute the disk bandwidth among tasks as desired, without any
7173 ++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
7174 ++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
7175 ++ * their budgets (more precisely BFQ schedules queues associated to tasks).
7176 ++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
7177 ++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
7178 ++ * latencies to interactive and soft real-time applications.
7179 ++ *
7180 ++ * BFQ is described in [1], where a reference to the initial, more
7181 ++ * theoretical paper on BFQ can also be found. The interested reader can find in
7182 ++ * the latter paper full details on the main algorithm as well as formulas of
7183 ++ * the guarantees, plus formal proofs of all the properties. With respect to
7184 ++ * the version of BFQ presented in these papers, this implementation adds a
7185 ++ * few more heuristics, such as the one that guarantees a low latency to soft
7186 ++ * real-time applications, and a hierarchical extension based on H-WF2Q+.
7187 ++ *
7188 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
7189 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
7190 ++ * complexity derives from the one introduced with EEVDF in [3].
7191 ++ *
7192 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
7193 ++ * with the BFQ Disk I/O Scheduler'',
7194 ++ * Proceedings of the 5th Annual International Systems and Storage
7195 ++ * Conference (SYSTOR '12), June 2012.
7196 ++ *
7197 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
7198 ++ *
7199 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
7200 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
7201 ++ * Oct 1997.
7202 ++ *
7203 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
7204 ++ *
7205 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
7206 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
7207 ++ * Resource Allocation,'' technical report.
7208 ++ *
7209 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
7210 ++ */
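
As a purely conceptual illustration of the proportional-share goal described above (service, not time, is distributed in proportion to weights), the following sketch computes the long-term bandwidth split that three queues with hypothetical weights would receive on a hypothetical 150 MB/s device:

    #include <stdio.h>

    int main(void)
    {
            const int weight[] = { 100, 200, 700 };     /* hypothetical weights */
            const double device_mbps = 150.0;           /* hypothetical device  */
            int i, total = 0;

            for (i = 0; i < 3; i++)
                    total += weight[i];
            for (i = 0; i < 3; i++)
                    printf("queue %d: %.1f MB/s\n", i,
                           device_mbps * weight[i] / total);
            return 0;
    }
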
7211 ++#include <linux/module.h>
7212 ++#include <linux/slab.h>
7213 ++#include <linux/blkdev.h>
7214 ++#include <linux/cgroup.h>
7215 ++#include <linux/elevator.h>
7216 ++#include <linux/jiffies.h>
7217 ++#include <linux/rbtree.h>
7218 ++#include <linux/ioprio.h>
7219 ++#include "bfq.h"
7220 ++#include "blk.h"
7221 ++
7222 ++/* Max number of dispatches in one round of service. */
7223 ++static const int bfq_quantum = 4;
7224 ++
7225 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
7226 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
7227 ++
7228 ++/* Maximum backwards seek, in KiB. */
7229 ++static const int bfq_back_max = 16 * 1024;
7230 ++
7231 ++/* Penalty of a backwards seek, in number of sectors. */
7232 ++static const int bfq_back_penalty = 2;
7233 ++
7234 ++/* Idling period duration, in jiffies. */
7235 ++static int bfq_slice_idle = HZ / 125;
7236 ++
7237 ++/* Default maximum budget values, in sectors and number of requests. */
7238 ++static const int bfq_default_max_budget = 16 * 1024;
7239 ++static const int bfq_max_budget_async_rq = 4;
7240 ++
7241 ++/*
7242 ++ * Async to sync throughput distribution is controlled as follows:
7243 ++ * when an async request is served, the entity is charged the number
7244 ++ * of sectors of the request, multiplied by the factor below
7245 ++ */
7246 ++static const int bfq_async_charge_factor = 10;
7247 ++
7248 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
7249 ++static const int bfq_timeout_sync = HZ / 8;
7250 ++static int bfq_timeout_async = HZ / 25;
7251 ++
7252 ++struct kmem_cache *bfq_pool;
7253 ++
7254 ++/* Below this threshold (in ms), we consider thinktime immediate. */
7255 ++#define BFQ_MIN_TT 2
7256 ++
7257 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
7258 ++#define BFQ_HW_QUEUE_THRESHOLD 4
7259 ++#define BFQ_HW_QUEUE_SAMPLES 32
7260 ++
7261 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
7262 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
7263 ++
7264 ++/* Min samples used for peak rate estimation (for autotuning). */
7265 ++#define BFQ_PEAK_RATE_SAMPLES 32
7266 ++
7267 ++/* Shift used for peak rate fixed precision calculations. */
7268 ++#define BFQ_RATE_SHIFT 16
7269 ++
7270 ++/*
7271 ++ * The duration of the weight raising for interactive applications is
7272 ++ * computed automatically (as default behaviour), using the following
7273 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
7274 ++ * disk, and R and T are two reference parameters. In particular, R is
7275 ++ * the peak rate of a reference disk, and T is about the maximum time
7276 ++ * for starting popular large applications on that disk, under BFQ and
7277 ++ * while reading two files in parallel. Finally, BFQ uses two
7278 ++ * different pairs (R, T) depending on whether the disk is rotational
7279 ++ * or non-rotational.
7280 ++ */
7281 ++#define T_rot (msecs_to_jiffies(5500))
7282 ++#define T_nonrot (msecs_to_jiffies(2000))
7283 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
7284 ++#define R_rot 17415
7285 ++#define R_nonrot 34791
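
A quick numeric reading of the duration = (R / r) * T rule with the rotational reference values above: a disk whose measured peak rate r equals R_rot is weight-raised for the full T_rot (about 5.5 s), while a disk twice as fast is raised for about half that time. The sketch below just evaluates the formula, in milliseconds rather than jiffies; the second peak rate is hypothetical:

    #include <stdio.h>

    int main(void)
    {
            const double T_rot_ms = 5500.0;     /* T for rotational disks    */
            const double R_rot_ref = 17415.0;   /* R, same unit as peak_rate */
            const double peak_rate[] = { 17415.0, 34830.0 };
            int i;

            for (i = 0; i < 2; i++)
                    printf("peak_rate=%.0f -> wr duration ~ %.0f ms\n",
                           peak_rate[i], T_rot_ms * R_rot_ref / peak_rate[i]);
            return 0;
    }
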
7286 ++
7287 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
7288 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
7289 ++
7290 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
7291 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
7292 ++
7293 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
7294 ++
7295 ++#include "bfq-ioc.c"
7296 ++#include "bfq-sched.c"
7297 ++#include "bfq-cgroup.c"
7298 ++
7299 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
7300 ++ IOPRIO_CLASS_IDLE)
7301 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
7302 ++ IOPRIO_CLASS_RT)
7303 ++
7304 ++#define bfq_sample_valid(samples) ((samples) > 80)
7305 ++
7306 ++/*
7307 ++ * We regard a request as SYNC if it is either a read or has the SYNC bit
7308 ++ * set (in which case it could also be a direct WRITE).
7309 ++ */
7310 ++static inline int bfq_bio_sync(struct bio *bio)
7311 ++{
7312 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
7313 ++ return 1;
7314 ++
7315 ++ return 0;
7316 ++}
7317 ++
7318 ++/*
7319 ++ * Scheduler run of queue, if there are requests pending and no one in the
7320 ++ * driver that will restart queueing.
7321 ++ */
7322 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
7323 ++{
7324 ++ if (bfqd->queued != 0) {
7325 ++ bfq_log(bfqd, "schedule dispatch");
7326 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
7327 ++ }
7328 ++}
7329 ++
7330 ++/*
7331 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
7332 ++ * We choose the request that is closest to the head right now. Distance
7333 ++ * behind the head is penalized and only allowed to a certain extent.
7334 ++ */
7335 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
7336 ++ struct request *rq1,
7337 ++ struct request *rq2,
7338 ++ sector_t last)
7339 ++{
7340 ++ sector_t s1, s2, d1 = 0, d2 = 0;
7341 ++ unsigned long back_max;
7342 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
7343 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
7344 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
7345 ++
7346 ++ if (rq1 == NULL || rq1 == rq2)
7347 ++ return rq2;
7348 ++ if (rq2 == NULL)
7349 ++ return rq1;
7350 ++
7351 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
7352 ++ return rq1;
7353 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
7354 ++ return rq2;
7355 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
7356 ++ return rq1;
7357 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
7358 ++ return rq2;
7359 ++
7360 ++ s1 = blk_rq_pos(rq1);
7361 ++ s2 = blk_rq_pos(rq2);
7362 ++
7363 ++ /*
7364 ++ * By definition, 1KiB is 2 sectors.
7365 ++ */
7366 ++ back_max = bfqd->bfq_back_max * 2;
7367 ++
7368 ++ /*
7369 ++ * Strict one way elevator _except_ in the case where we allow
7370 ++ * short backward seeks which are biased as twice the cost of a
7371 ++ * similar forward seek.
7372 ++ */
7373 ++ if (s1 >= last)
7374 ++ d1 = s1 - last;
7375 ++ else if (s1 + back_max >= last)
7376 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
7377 ++ else
7378 ++ wrap |= BFQ_RQ1_WRAP;
7379 ++
7380 ++ if (s2 >= last)
7381 ++ d2 = s2 - last;
7382 ++ else if (s2 + back_max >= last)
7383 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
7384 ++ else
7385 ++ wrap |= BFQ_RQ2_WRAP;
7386 ++
7387 ++ /* Found required data */
7388 ++
7389 ++ /*
7390 ++ * By doing switch() on the bit mask "wrap" we avoid having to
7391 ++ * check two variables for all permutations: --> faster!
7392 ++ */
7393 ++ switch (wrap) {
7394 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
7395 ++ if (d1 < d2)
7396 ++ return rq1;
7397 ++ else if (d2 < d1)
7398 ++ return rq2;
7399 ++ else {
7400 ++ if (s1 >= s2)
7401 ++ return rq1;
7402 ++ else
7403 ++ return rq2;
7404 ++ }
7405 ++
7406 ++ case BFQ_RQ2_WRAP:
7407 ++ return rq1;
7408 ++ case BFQ_RQ1_WRAP:
7409 ++ return rq2;
7410 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
7411 ++ default:
7412 ++ /*
7413 ++ * Since both rqs are wrapped,
7414 ++ * start with the one that's further behind head
7415 ++ * (--> only *one* back seek required),
7416 ++ * since back seek takes more time than forward.
7417 ++ */
7418 ++ if (s1 <= s2)
7419 ++ return rq1;
7420 ++ else
7421 ++ return rq2;
7422 ++ }
7423 ++}
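
The distance rules used by bfq_choose_req() (forward seeks cost their real distance, short backward seeks within back_max cost back_penalty times the distance, anything farther behind "wraps") can be exercised in isolation with the sketch below. The head position and request sectors are hypothetical; back_max and back_penalty use the defaults above (16 MiB, i.e. 32768 sectors, and 2).

    #include <stdio.h>

    /* Effective distance of a request at sector s from the head position
     * 'last', following the same rules as bfq_choose_req(); -1 means the
     * request is treated as wrapping. */
    static long long effective_dist(unsigned long long s,
                                    unsigned long long last,
                                    unsigned long long back_max,
                                    unsigned int back_penalty)
    {
            if (s >= last)
                    return (long long)(s - last);
            if (s + back_max >= last)
                    return (long long)(last - s) * back_penalty;
            return -1;
    }

    int main(void)
    {
            const unsigned long long last = 1000000, back_max = 32768;

            printf("%lld\n", effective_dist(1004096, last, back_max, 2)); /* 4096 */
            printf("%lld\n", effective_dist(998000, last, back_max, 2));  /* 4000 */
            printf("%lld\n", effective_dist(100, last, back_max, 2));     /* -1   */
            return 0;
    }
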
7424 ++
7425 ++static struct bfq_queue *
7426 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
7427 ++ sector_t sector, struct rb_node **ret_parent,
7428 ++ struct rb_node ***rb_link)
7429 ++{
7430 ++ struct rb_node **p, *parent;
7431 ++ struct bfq_queue *bfqq = NULL;
7432 ++
7433 ++ parent = NULL;
7434 ++ p = &root->rb_node;
7435 ++ while (*p) {
7436 ++ struct rb_node **n;
7437 ++
7438 ++ parent = *p;
7439 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7440 ++
7441 ++ /*
7442 ++ * Sort strictly based on sector. Smallest to the left,
7443 ++ * largest to the right.
7444 ++ */
7445 ++ if (sector > blk_rq_pos(bfqq->next_rq))
7446 ++ n = &(*p)->rb_right;
7447 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
7448 ++ n = &(*p)->rb_left;
7449 ++ else
7450 ++ break;
7451 ++ p = n;
7452 ++ bfqq = NULL;
7453 ++ }
7454 ++
7455 ++ *ret_parent = parent;
7456 ++ if (rb_link)
7457 ++ *rb_link = p;
7458 ++
7459 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
7460 ++ (long long unsigned)sector,
7461 ++ bfqq != NULL ? bfqq->pid : 0);
7462 ++
7463 ++ return bfqq;
7464 ++}
7465 ++
7466 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7467 ++{
7468 ++ struct rb_node **p, *parent;
7469 ++ struct bfq_queue *__bfqq;
7470 ++
7471 ++ if (bfqq->pos_root != NULL) {
7472 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7473 ++ bfqq->pos_root = NULL;
7474 ++ }
7475 ++
7476 ++ if (bfq_class_idle(bfqq))
7477 ++ return;
7478 ++ if (!bfqq->next_rq)
7479 ++ return;
7480 ++
7481 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
7482 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
7483 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
7484 ++ if (__bfqq == NULL) {
7485 ++ rb_link_node(&bfqq->pos_node, parent, p);
7486 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
7487 ++ } else
7488 ++ bfqq->pos_root = NULL;
7489 ++}
7490 ++
7491 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
7492 ++ struct bfq_queue *bfqq,
7493 ++ struct request *last)
7494 ++{
7495 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
7496 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
7497 ++ struct request *next = NULL, *prev = NULL;
7498 ++
7499 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
7500 ++
7501 ++ if (rbprev != NULL)
7502 ++ prev = rb_entry_rq(rbprev);
7503 ++
7504 ++ if (rbnext != NULL)
7505 ++ next = rb_entry_rq(rbnext);
7506 ++ else {
7507 ++ rbnext = rb_first(&bfqq->sort_list);
7508 ++ if (rbnext && rbnext != &last->rb_node)
7509 ++ next = rb_entry_rq(rbnext);
7510 ++ }
7511 ++
7512 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
7513 ++}
7514 ++
7515 ++static void bfq_del_rq_rb(struct request *rq)
7516 ++{
7517 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7518 ++ struct bfq_data *bfqd = bfqq->bfqd;
7519 ++ const int sync = rq_is_sync(rq);
7520 ++
7521 ++ BUG_ON(bfqq->queued[sync] == 0);
7522 ++ bfqq->queued[sync]--;
7523 ++ bfqd->queued--;
7524 ++
7525 ++ elv_rb_del(&bfqq->sort_list, rq);
7526 ++
7527 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
7528 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
7529 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
7530 ++ /*
7531 ++ * Remove queue from request-position tree as it is empty.
7532 ++ */
7533 ++ if (bfqq->pos_root != NULL) {
7534 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7535 ++ bfqq->pos_root = NULL;
7536 ++ }
7537 ++ }
7538 ++}
7539 ++
7540 ++/* see the definition of bfq_async_charge_factor for details */
7541 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
7542 ++ struct bfq_queue *bfqq)
7543 ++{
7544 ++ return blk_rq_sectors(rq) *
7545 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
7546 ++ bfq_async_charge_factor));
7547 ++}
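
The charging rule of bfq_serv_to_charge() in one self-contained sketch: only async requests of queues that are not currently weight-raised pay the extra bfq_async_charge_factor. The request sizes and raising coefficients used below are hypothetical.

    #include <stdio.h>

    #define ASYNC_CHARGE_FACTOR 10  /* bfq_async_charge_factor above */

    static unsigned long serv_to_charge(unsigned long sectors, int sync,
                                        int raising_coeff)
    {
            return sectors *
                   (1 + (!sync) * (raising_coeff == 1) * ASYNC_CHARGE_FACTOR);
    }

    int main(void)
    {
            printf("%lu\n", serv_to_charge(8, 1, 1)); /* sync: charged 8         */
            printf("%lu\n", serv_to_charge(8, 0, 1)); /* async, not raised: 88   */
            printf("%lu\n", serv_to_charge(8, 0, 3)); /* async, weight-raised: 8 */
            return 0;
    }
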
7548 ++
7549 ++/**
7550 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
7551 ++ * @bfqd: the device data the queue belongs to.
7552 ++ * @bfqq: the queue to update.
7553 ++ *
7554 ++ * If the first request of a queue changes we make sure that the queue
7555 ++ * has enough budget to serve at least its first request (if the
7556 ++ * request has grown). We do this because if the queue does not have enough
7557 ++ * budget for its first request, it has to go through two dispatch
7558 ++ * rounds to actually get it dispatched.
7559 ++ */
7560 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
7561 ++ struct bfq_queue *bfqq)
7562 ++{
7563 ++ struct bfq_entity *entity = &bfqq->entity;
7564 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
7565 ++ struct request *next_rq = bfqq->next_rq;
7566 ++ unsigned long new_budget;
7567 ++
7568 ++ if (next_rq == NULL)
7569 ++ return;
7570 ++
7571 ++ if (bfqq == bfqd->in_service_queue)
7572 ++ /*
7573 ++ * In order not to break guarantees, budgets cannot be
7574 ++ * changed after an entity has been selected.
7575 ++ */
7576 ++ return;
7577 ++
7578 ++ BUG_ON(entity->tree != &st->active);
7579 ++ BUG_ON(entity == entity->sched_data->in_service_entity);
7580 ++
7581 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
7582 ++ bfq_serv_to_charge(next_rq, bfqq));
7583 ++ entity->budget = new_budget;
7584 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
7585 ++ bfq_activate_bfqq(bfqd, bfqq);
7586 ++}
7587 ++
7588 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
7589 ++{
7590 ++ u64 dur;
7591 ++
7592 ++ if (bfqd->bfq_raising_max_time > 0)
7593 ++ return bfqd->bfq_raising_max_time;
7594 ++
7595 ++ dur = bfqd->RT_prod;
7596 ++ do_div(dur, bfqd->peak_rate);
7597 ++
7598 ++ return dur;
7599 ++}
7600 ++
7601 ++static void bfq_add_rq_rb(struct request *rq)
7602 ++{
7603 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7604 ++ struct bfq_entity *entity = &bfqq->entity;
7605 ++ struct bfq_data *bfqd = bfqq->bfqd;
7606 ++ struct request *next_rq, *prev;
7607 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
7608 ++ int idle_for_long_time = 0;
7609 ++
7610 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
7611 ++ bfqq->queued[rq_is_sync(rq)]++;
7612 ++ bfqd->queued++;
7613 ++
7614 ++ elv_rb_add(&bfqq->sort_list, rq);
7615 ++
7616 ++ /*
7617 ++ * Check if this request is a better next-serve candidate.
7618 ++ */
7619 ++ prev = bfqq->next_rq;
7620 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
7621 ++ BUG_ON(next_rq == NULL);
7622 ++ bfqq->next_rq = next_rq;
7623 ++
7624 ++ /*
7625 ++ * Adjust priority tree position, if next_rq changes.
7626 ++ */
7627 ++ if (prev != bfqq->next_rq)
7628 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
7629 ++
7630 ++ if (!bfq_bfqq_busy(bfqq)) {
7631 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
7632 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
7633 ++ idle_for_long_time = time_is_before_jiffies(
7634 ++ bfqq->budget_timeout +
7635 ++ bfqd->bfq_raising_min_idle_time);
7636 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
7637 ++ bfq_serv_to_charge(next_rq, bfqq));
7638 ++
7639 ++ if (!bfqd->low_latency)
7640 ++ goto add_bfqq_busy;
7641 ++
7642 ++ /*
7643 ++ * If the queue is not being boosted and has been idle
7644 ++ * for enough time, start a weight-raising period.
7645 ++ */
7646 ++ if (old_raising_coeff == 1 &&
7647 ++ (idle_for_long_time || soft_rt)) {
7648 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7649 ++ if (idle_for_long_time)
7650 ++ bfqq->raising_cur_max_time =
7651 ++ bfq_wrais_duration(bfqd);
7652 ++ else
7653 ++ bfqq->raising_cur_max_time =
7654 ++ bfqd->bfq_raising_rt_max_time;
7655 ++ bfq_log_bfqq(bfqd, bfqq,
7656 ++ "wrais starting at %lu, "
7657 ++ "rais_max_time %u",
7658 ++ jiffies,
7659 ++ jiffies_to_msecs(bfqq->
7660 ++ raising_cur_max_time));
7661 ++ } else if (old_raising_coeff > 1) {
7662 ++ if (idle_for_long_time)
7663 ++ bfqq->raising_cur_max_time =
7664 ++ bfq_wrais_duration(bfqd);
7665 ++ else if (bfqq->raising_cur_max_time ==
7666 ++ bfqd->bfq_raising_rt_max_time &&
7667 ++ !soft_rt) {
7668 ++ bfqq->raising_coeff = 1;
7669 ++ bfq_log_bfqq(bfqd, bfqq,
7670 ++ "wrais ending at %lu, "
7671 ++ "rais_max_time %u",
7672 ++ jiffies,
7673 ++ jiffies_to_msecs(bfqq->
7674 ++ raising_cur_max_time));
7675 ++ } else if (time_before(
7676 ++ bfqq->last_rais_start_finish +
7677 ++ bfqq->raising_cur_max_time,
7678 ++ jiffies +
7679 ++ bfqd->bfq_raising_rt_max_time) &&
7680 ++ soft_rt) {
7681 ++ /*
7682 ++ *
7683 ++ * The remaining weight-raising time is lower
7684 ++ * than bfqd->bfq_raising_rt_max_time, which
7685 ++ * means that the application is enjoying
7686 ++ * weight raising either because deemed soft-
7687 ++ * rt in the near past, or because deemed
7688 ++ * interactive long ago. In both cases,
7689 ++ * resetting now the current remaining weight-
7690 ++ * raising time for the application to the
7691 ++ * weight-raising duration for soft rt
7692 ++ * applications would not cause any latency
7693 ++ * increase for the application (as the new
7694 ++ * duration would be higher than the remaining
7695 ++ * time).
7696 ++ *
7697 ++ * In addition, the application is now meeting
7698 ++ * the requirements for being deemed soft rt.
7699 ++ * In the end we can correctly and safely
7700 ++ * (re)charge the weight-raising duration for
7701 ++ * the application with the weight-raising
7702 ++ * duration for soft rt applications.
7703 ++ *
7704 ++ * In particular, doing this recharge now, i.e.,
7705 ++ * before the weight-raising period for the
7706 ++ * application finishes, reduces the probability
7707 ++ * of the following negative scenario:
7708 ++ * 1) the weight of a soft rt application is
7709 ++ * raised at startup (as for any newly
7710 ++ * created application),
7711 ++ * 2) since the application is not interactive,
7712 ++ * at a certain time weight-raising is
7713 ++ * stopped for the application,
7714 ++ * 3) at that time the application happens to
7715 ++ * still have pending requests, and hence
7716 ++ * is destined to not have a chance to be
7717 ++ * deemed soft rt before these requests are
7718 ++ * completed (see the comments to the
7719 ++ * function bfq_bfqq_softrt_next_start()
7720 ++ * for details on soft rt detection),
7721 ++ * 4) these pending requests experience a high
7722 ++ * latency because the application is not
7723 ++ * weight-raised while they are pending.
7724 ++ */
7725 ++ bfqq->last_rais_start_finish = jiffies;
7726 ++ bfqq->raising_cur_max_time =
7727 ++ bfqd->bfq_raising_rt_max_time;
7728 ++ }
7729 ++ }
7730 ++ if (old_raising_coeff != bfqq->raising_coeff)
7731 ++ entity->ioprio_changed = 1;
7732 ++add_bfqq_busy:
7733 ++ bfqq->last_idle_bklogged = jiffies;
7734 ++ bfqq->service_from_backlogged = 0;
7735 ++ bfq_clear_bfqq_softrt_update(bfqq);
7736 ++ bfq_add_bfqq_busy(bfqd, bfqq);
7737 ++ } else {
7738 ++ if (bfqd->low_latency && old_raising_coeff == 1 &&
7739 ++ !rq_is_sync(rq) &&
7740 ++ time_is_before_jiffies(
7741 ++ bfqq->last_rais_start_finish +
7742 ++ bfqd->bfq_raising_min_inter_arr_async)) {
7743 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7744 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
7745 ++
7746 ++ bfqd->raised_busy_queues++;
7747 ++ entity->ioprio_changed = 1;
7748 ++ bfq_log_bfqq(bfqd, bfqq,
7749 ++ "non-idle wrais starting at %lu, "
7750 ++ "rais_max_time %u",
7751 ++ jiffies,
7752 ++ jiffies_to_msecs(bfqq->
7753 ++ raising_cur_max_time));
7754 ++ }
7755 ++ bfq_updated_next_req(bfqd, bfqq);
7756 ++ }
7757 ++
7758 ++ if (bfqd->low_latency &&
7759 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
7760 ++ idle_for_long_time))
7761 ++ bfqq->last_rais_start_finish = jiffies;
7762 ++}
7763 ++
7764 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
7765 ++{
7766 ++ elv_rb_del(&bfqq->sort_list, rq);
7767 ++ bfqq->queued[rq_is_sync(rq)]--;
7768 ++ bfqq->bfqd->queued--;
7769 ++ bfq_add_rq_rb(rq);
7770 ++}
7771 ++
7772 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
7773 ++ struct bio *bio)
7774 ++{
7775 ++ struct task_struct *tsk = current;
7776 ++ struct bfq_io_cq *bic;
7777 ++ struct bfq_queue *bfqq;
7778 ++
7779 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
7780 ++ if (bic == NULL)
7781 ++ return NULL;
7782 ++
7783 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7784 ++ if (bfqq != NULL)
7785 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
7786 ++
7787 ++ return NULL;
7788 ++}
7789 ++
7790 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
7791 ++{
7792 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7793 ++
7794 ++ bfqd->rq_in_driver++;
7795 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
7796 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
7797 ++ (long long unsigned)bfqd->last_position);
7798 ++}
7799 ++
7800 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
7801 ++{
7802 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7803 ++
7804 ++ WARN_ON(bfqd->rq_in_driver == 0);
7805 ++ bfqd->rq_in_driver--;
7806 ++}
7807 ++
7808 ++static void bfq_remove_request(struct request *rq)
7809 ++{
7810 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7811 ++ struct bfq_data *bfqd = bfqq->bfqd;
7812 ++
7813 ++ if (bfqq->next_rq == rq) {
7814 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
7815 ++ bfq_updated_next_req(bfqd, bfqq);
7816 ++ }
7817 ++
7818 ++ list_del_init(&rq->queuelist);
7819 ++ bfq_del_rq_rb(rq);
7820 ++
7821 ++ if (rq->cmd_flags & REQ_META) {
7822 ++ WARN_ON(bfqq->meta_pending == 0);
7823 ++ bfqq->meta_pending--;
7824 ++ }
7825 ++}
7826 ++
7827 ++static int bfq_merge(struct request_queue *q, struct request **req,
7828 ++ struct bio *bio)
7829 ++{
7830 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7831 ++ struct request *__rq;
7832 ++
7833 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
7834 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
7835 ++ *req = __rq;
7836 ++ return ELEVATOR_FRONT_MERGE;
7837 ++ }
7838 ++
7839 ++ return ELEVATOR_NO_MERGE;
7840 ++}
7841 ++
7842 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
7843 ++ int type)
7844 ++{
7845 ++ if (type == ELEVATOR_FRONT_MERGE) {
7846 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
7847 ++
7848 ++ bfq_reposition_rq_rb(bfqq, req);
7849 ++ }
7850 ++}
7851 ++
7852 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
7853 ++ struct request *next)
7854 ++{
7855 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7856 ++
7857 ++ /*
7858 ++ * Reposition in fifo if next is older than rq.
7859 ++ */
7860 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
7861 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
7862 ++ list_move(&rq->queuelist, &next->queuelist);
7863 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
7864 ++ }
7865 ++
7866 ++ if (bfqq->next_rq == next)
7867 ++ bfqq->next_rq = rq;
7868 ++
7869 ++ bfq_remove_request(next);
7870 ++}
7871 ++
7872 ++/* Must be called with bfqq != NULL */
7873 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
7874 ++{
7875 ++ BUG_ON(bfqq == NULL);
7876 ++ if (bfq_bfqq_busy(bfqq))
7877 ++ bfqq->bfqd->raised_busy_queues--;
7878 ++ bfqq->raising_coeff = 1;
7879 ++ bfqq->raising_cur_max_time = 0;
7880 ++ /* Trigger a weight change on the next activation of the queue */
7881 ++ bfqq->entity.ioprio_changed = 1;
7882 ++}
7883 ++
7884 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
7885 ++ struct bfq_group *bfqg)
7886 ++{
7887 ++ int i, j;
7888 ++
7889 ++ for (i = 0; i < 2; i++)
7890 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
7891 ++ if (bfqg->async_bfqq[i][j] != NULL)
7892 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
7893 ++ if (bfqg->async_idle_bfqq != NULL)
7894 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
7895 ++}
7896 ++
7897 ++static void bfq_end_raising(struct bfq_data *bfqd)
7898 ++{
7899 ++ struct bfq_queue *bfqq;
7900 ++
7901 ++ spin_lock_irq(bfqd->queue->queue_lock);
7902 ++
7903 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
7904 ++ bfq_bfqq_end_raising(bfqq);
7905 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
7906 ++ bfq_bfqq_end_raising(bfqq);
7907 ++ bfq_end_raising_async(bfqd);
7908 ++
7909 ++ spin_unlock_irq(bfqd->queue->queue_lock);
7910 ++}
7911 ++
7912 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
7913 ++ struct bio *bio)
7914 ++{
7915 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7916 ++ struct bfq_io_cq *bic;
7917 ++ struct bfq_queue *bfqq;
7918 ++
7919 ++ /*
7920 ++ * Disallow merge of a sync bio into an async request.
7921 ++ */
7922 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
7923 ++ return 0;
7924 ++
7925 ++ /*
7926 ++ * Lookup the bfqq that this bio will be queued with. Allow
7927 ++ * merge only if rq is queued there.
7928 ++ * Queue lock is held here.
7929 ++ */
7930 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
7931 ++ if (bic == NULL)
7932 ++ return 0;
7933 ++
7934 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7935 ++ return bfqq == RQ_BFQQ(rq);
7936 ++}
7937 ++
7938 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
7939 ++ struct bfq_queue *bfqq)
7940 ++{
7941 ++ if (bfqq != NULL) {
7942 ++ bfq_mark_bfqq_must_alloc(bfqq);
7943 ++ bfq_mark_bfqq_budget_new(bfqq);
7944 ++ bfq_clear_bfqq_fifo_expire(bfqq);
7945 ++
7946 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
7947 ++
7948 ++ bfq_log_bfqq(bfqd, bfqq,
7949 ++ "set_in_service_queue, cur-budget = %lu",
7950 ++ bfqq->entity.budget);
7951 ++ }
7952 ++
7953 ++ bfqd->in_service_queue = bfqq;
7954 ++}
7955 ++
7956 ++/*
7957 ++ * Get and set a new queue for service.
7958 ++ */
7959 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
7960 ++ struct bfq_queue *bfqq)
7961 ++{
7962 ++ if (!bfqq)
7963 ++ bfqq = bfq_get_next_queue(bfqd);
7964 ++ else
7965 ++ bfq_get_next_queue_forced(bfqd, bfqq);
7966 ++
7967 ++ __bfq_set_in_service_queue(bfqd, bfqq);
7968 ++ return bfqq;
7969 ++}
7970 ++
7971 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
7972 ++ struct request *rq)
7973 ++{
7974 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
7975 ++ return blk_rq_pos(rq) - bfqd->last_position;
7976 ++ else
7977 ++ return bfqd->last_position - blk_rq_pos(rq);
7978 ++}
7979 ++
7980 ++/*
7981 ++ * Return true if bfqq has no request pending and rq is close enough to
7982 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
7983 ++ * bfqq->next_rq
7984 ++ */
7985 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
7986 ++{
7987 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
7988 ++}
7989 ++
7990 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
7991 ++{
7992 ++ struct rb_root *root = &bfqd->rq_pos_tree;
7993 ++ struct rb_node *parent, *node;
7994 ++ struct bfq_queue *__bfqq;
7995 ++ sector_t sector = bfqd->last_position;
7996 ++
7997 ++ if (RB_EMPTY_ROOT(root))
7998 ++ return NULL;
7999 ++
8000 ++ /*
8001 ++ * First, if we find a request starting at the end of the last
8002 ++ * request, choose it.
8003 ++ */
8004 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8005 ++ if (__bfqq != NULL)
8006 ++ return __bfqq;
8007 ++
8008 ++ /*
8009 ++ * If the exact sector wasn't found, the parent of the NULL leaf
8010 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
8011 ++ * position).
8012 ++ */
8013 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8014 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8015 ++ return __bfqq;
8016 ++
8017 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
8018 ++ node = rb_next(&__bfqq->pos_node);
8019 ++ else
8020 ++ node = rb_prev(&__bfqq->pos_node);
8021 ++ if (node == NULL)
8022 ++ return NULL;
8023 ++
8024 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8025 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8026 ++ return __bfqq;
8027 ++
8028 ++ return NULL;
8029 ++}
8030 ++
8031 ++/*
8032 ++ * bfqd - obvious
8033 ++ * cur_bfqq - passed in so that we don't decide that the current queue
8034 ++ * is closely cooperating with itself.
8035 ++ *
8036 ++ * We are assuming that cur_bfqq has dispatched at least one request,
8037 ++ * and that bfqd->last_position reflects a position on the disk associated
8038 ++ * with the I/O issued by cur_bfqq.
8039 ++ */
8040 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
8041 ++ struct bfq_queue *cur_bfqq)
8042 ++{
8043 ++ struct bfq_queue *bfqq;
8044 ++
8045 ++ if (bfq_class_idle(cur_bfqq))
8046 ++ return NULL;
8047 ++ if (!bfq_bfqq_sync(cur_bfqq))
8048 ++ return NULL;
8049 ++ if (BFQQ_SEEKY(cur_bfqq))
8050 ++ return NULL;
8051 ++
8052 ++ /* If device has only one backlogged bfq_queue, don't search. */
8053 ++ if (bfqd->busy_queues == 1)
8054 ++ return NULL;
8055 ++
8056 ++ /*
8057 ++ * We should notice if some of the queues are cooperating, e.g.
8058 ++ * working closely on the same area of the disk. In that case,
8059 ++ * we can group them together and don't waste time idling.
8060 ++ */
8061 ++ bfqq = bfqq_close(bfqd);
8062 ++ if (bfqq == NULL || bfqq == cur_bfqq)
8063 ++ return NULL;
8064 ++
8065 ++ /*
8066 ++ * Do not merge queues from different bfq_groups.
8067 ++ */
8068 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
8069 ++ return NULL;
8070 ++
8071 ++ /*
8072 ++ * It only makes sense to merge sync queues.
8073 ++ */
8074 ++ if (!bfq_bfqq_sync(bfqq))
8075 ++ return NULL;
8076 ++ if (BFQQ_SEEKY(bfqq))
8077 ++ return NULL;
8078 ++
8079 ++ /*
8080 ++ * Do not merge queues of different priority classes.
8081 ++ */
8082 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
8083 ++ return NULL;
8084 ++
8085 ++ return bfqq;
8086 ++}
8087 ++
8088 ++/*
8089 ++ * If enough samples have been computed, return the current max budget
8090 ++ * stored in bfqd, which is dynamically updated according to the
8091 ++ * estimated disk peak rate; otherwise return the default max budget
8092 ++ */
8093 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
8094 ++{
8095 ++ if (bfqd->budgets_assigned < 194)
8096 ++ return bfq_default_max_budget;
8097 ++ else
8098 ++ return bfqd->bfq_max_budget;
8099 ++}
8100 ++
8101 ++/*
8102 ++ * Return min budget, which is a fraction of the current or default
8103 ++ * max budget (trying with 1/32)
8104 ++ */
8105 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
8106 ++{
8107 ++ if (bfqd->budgets_assigned < 194)
8108 ++ return bfq_default_max_budget / 32;
8109 ++ else
8110 ++ return bfqd->bfq_max_budget / 32;
8111 ++}
8112 ++
8113 ++/*
8114 ++ * Decides whether idling should be done for given device and
8115 ++ * given in-service queue.
8116 ++ */
8117 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
8118 ++ struct bfq_queue *in_service_bfqq)
8119 ++{
8120 ++ if (in_service_bfqq == NULL)
8121 ++ return false;
8122 ++ /*
8123 ++ * If the device is non-rotational, and hence has no seek penalty,
8124 ++ * disable idling; but do so only if:
8125 ++	 * - the device does support queuing (hw_tag), otherwise we would
8126 ++	 * still have a problem with sync vs async workloads;
8127 ++ * - the queue is not weight-raised, to preserve guarantees.
8128 ++ */
8129 ++ return blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
8130 ++ (in_service_bfqq->raising_coeff == 1);
8131 ++}
8132 ++
8133 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
8134 ++{
8135 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
8136 ++ struct bfq_io_cq *bic;
8137 ++ unsigned long sl;
8138 ++
8139 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
8140 ++
8141 ++ /* Tasks have exited, don't wait. */
8142 ++ bic = bfqd->in_service_bic;
8143 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
8144 ++ return;
8145 ++
8146 ++ bfq_mark_bfqq_wait_request(bfqq);
8147 ++
8148 ++ /*
8149 ++ * We don't want to idle for seeks, but we do want to allow
8150 ++ * fair distribution of slice time for a process doing back-to-back
8151 ++	 * seeks. So allow it a little bit of time to submit a new rq.
8152 ++ *
8153 ++ * To prevent processes with (partly) seeky workloads from
8154 ++ * being too ill-treated, grant them a small fraction of the
8155 ++ * assigned budget before reducing the waiting time to
8156 ++ * BFQ_MIN_TT. This happened to help reduce latency.
8157 ++ */
8158 ++ sl = bfqd->bfq_slice_idle;
8159 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
8160 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
8161 ++ bfqq->raising_coeff == 1)
8162 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
8163 ++ else if (bfqq->raising_coeff > 1)
8164 ++ sl = sl * 3;
8165 ++ bfqd->last_idling_start = ktime_get();
8166 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
8167 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
8168 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
8169 ++}
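/*
 * Illustrative sketch by the editor, not part of the BFQ patch: how the
 * idle-slice length chosen in bfq_arm_slice_timer() scales with the state
 * of the queue. The helper name, the millisecond unit and the 2 ms value
 * assumed for BFQ_MIN_TT are assumptions made only for this example; the
 * real code works in jiffies on the bfqd/bfqq fields above.
 */
static unsigned long example_idle_slice_ms(unsigned long base_slice_ms,
					   int seeky,
					   int used_an_eighth_of_budget,
					   int weight_raised)
{
	if (seeky && used_an_eighth_of_budget && !weight_raised)
		return base_slice_ms < 2 ? base_slice_ms : 2; /* cap at ~BFQ_MIN_TT */
	if (weight_raised)
		return base_slice_ms * 3; /* idle longer to protect raised queues */
	return base_slice_ms;
}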
8170 ++
8171 ++/*
8172 ++ * Set the maximum time for the in-service queue to consume its
8173 ++ * budget. This prevents seeky processes from lowering the disk
8174 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
8175 ++ */
8176 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
8177 ++{
8178 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
8179 ++ unsigned int timeout_coeff;
8180 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
8181 ++ timeout_coeff = 1;
8182 ++ else
8183 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
8184 ++
8185 ++ bfqd->last_budget_start = ktime_get();
8186 ++
8187 ++ bfq_clear_bfqq_budget_new(bfqq);
8188 ++ bfqq->budget_timeout = jiffies +
8189 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
8190 ++
8191 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
8192 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
8193 ++ timeout_coeff));
8194 ++}
8195 ++
8196 ++/*
8197 ++ * Move request from internal lists to the request queue dispatch list.
8198 ++ */
8199 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
8200 ++{
8201 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8202 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8203 ++
8204 ++ bfq_remove_request(rq);
8205 ++ bfqq->dispatched++;
8206 ++ elv_dispatch_sort(q, rq);
8207 ++
8208 ++ if (bfq_bfqq_sync(bfqq))
8209 ++ bfqd->sync_flight++;
8210 ++}
8211 ++
8212 ++/*
8213 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
8214 ++ */
8215 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
8216 ++{
8217 ++ struct request *rq = NULL;
8218 ++
8219 ++ if (bfq_bfqq_fifo_expire(bfqq))
8220 ++ return NULL;
8221 ++
8222 ++ bfq_mark_bfqq_fifo_expire(bfqq);
8223 ++
8224 ++ if (list_empty(&bfqq->fifo))
8225 ++ return NULL;
8226 ++
8227 ++ rq = rq_entry_fifo(bfqq->fifo.next);
8228 ++
8229 ++ if (time_before(jiffies, rq_fifo_time(rq)))
8230 ++ return NULL;
8231 ++
8232 ++ return rq;
8233 ++}
8234 ++
8235 ++/*
8236 ++ * Must be called with the queue_lock held.
8237 ++ */
8238 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
8239 ++{
8240 ++ int process_refs, io_refs;
8241 ++
8242 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
8243 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
8244 ++ BUG_ON(process_refs < 0);
8245 ++ return process_refs;
8246 ++}
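/*
 * Illustrative sketch by the editor, not part of the BFQ patch: the
 * accounting above separates "process" references from the references
 * held by allocated requests and by service-tree membership. Standalone
 * restatement of the same arithmetic (names are hypothetical):
 */
static int example_process_refs(int total_refs, int allocated_read,
				int allocated_write, int on_service_tree)
{
	int io_refs = allocated_read + allocated_write;

	/* whatever is left is held by tasks (and by scheduled queue merges) */
	return total_refs - io_refs - (on_service_tree ? 1 : 0);
}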
8247 ++
8248 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
8249 ++{
8250 ++ int process_refs, new_process_refs;
8251 ++ struct bfq_queue *__bfqq;
8252 ++
8253 ++ /*
8254 ++ * If there are no process references on the new_bfqq, then it is
8255 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
8256 ++ * may have dropped their last reference (not just their last process
8257 ++ * reference).
8258 ++ */
8259 ++ if (!bfqq_process_refs(new_bfqq))
8260 ++ return;
8261 ++
8262 ++ /* Avoid a circular list and skip interim queue merges. */
8263 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
8264 ++ if (__bfqq == bfqq)
8265 ++ return;
8266 ++ new_bfqq = __bfqq;
8267 ++ }
8268 ++
8269 ++ process_refs = bfqq_process_refs(bfqq);
8270 ++ new_process_refs = bfqq_process_refs(new_bfqq);
8271 ++ /*
8272 ++ * If the process for the bfqq has gone away, there is no
8273 ++ * sense in merging the queues.
8274 ++ */
8275 ++ if (process_refs == 0 || new_process_refs == 0)
8276 ++ return;
8277 ++
8278 ++ /*
8279 ++ * Merge in the direction of the lesser amount of work.
8280 ++ */
8281 ++ if (new_process_refs >= process_refs) {
8282 ++ bfqq->new_bfqq = new_bfqq;
8283 ++ atomic_add(process_refs, &new_bfqq->ref);
8284 ++ } else {
8285 ++ new_bfqq->new_bfqq = bfqq;
8286 ++ atomic_add(new_process_refs, &bfqq->ref);
8287 ++ }
8288 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
8289 ++ new_bfqq->pid);
8290 ++}
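/*
 * Illustrative sketch by the editor, not part of the BFQ patch:
 * bfq_setup_merge() points the queue with fewer process references at
 * the one with more, so that the smaller amount of pending work is the
 * one migrated. Simplified restatement with a hypothetical structure:
 */
struct example_queue {
	struct example_queue *merge_into;
	int process_refs;
};

static void example_schedule_merge(struct example_queue *q,
				   struct example_queue *other)
{
	if (other->process_refs >= q->process_refs) {
		q->merge_into = other;			/* q drains into other */
		other->process_refs += q->process_refs;
	} else {
		other->merge_into = q;			/* other drains into q */
		q->process_refs += other->process_refs;
	}
}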
8291 ++
8292 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
8293 ++{
8294 ++ struct bfq_entity *entity = &bfqq->entity;
8295 ++ return entity->budget - entity->service;
8296 ++}
8297 ++
8298 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8299 ++{
8300 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8301 ++
8302 ++ __bfq_bfqd_reset_in_service(bfqd);
8303 ++
8304 ++ /*
8305 ++ * If this bfqq is shared between multiple processes, check
8306 ++ * to make sure that those processes are still issuing I/Os
8307 ++ * within the mean seek distance. If not, it may be time to
8308 ++ * break the queues apart again.
8309 ++ */
8310 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
8311 ++ bfq_mark_bfqq_split_coop(bfqq);
8312 ++
8313 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8314 ++ /*
8315 ++		 * Overload the budget_timeout field to store the time at
8316 ++		 * which the queue became empty (no backlog); this value is
8317 ++		 * used by the weight-raising mechanism
8318 ++ */
8319 ++ bfqq->budget_timeout = jiffies;
8320 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8321 ++ } else {
8322 ++ bfq_activate_bfqq(bfqd, bfqq);
8323 ++ /*
8324 ++ * Resort priority tree of potential close cooperators.
8325 ++ */
8326 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
8327 ++ }
8328 ++}
8329 ++
8330 ++/**
8331 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
8332 ++ * @bfqd: device data.
8333 ++ * @bfqq: queue to update.
8334 ++ * @reason: reason for expiration.
8335 ++ *
8336 ++ * Handle the feedback on @bfqq budget. See the body for detailed
8337 ++ * comments.
8338 ++ */
8339 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
8340 ++ struct bfq_queue *bfqq,
8341 ++ enum bfqq_expiration reason)
8342 ++{
8343 ++ struct request *next_rq;
8344 ++ unsigned long budget, min_budget;
8345 ++
8346 ++ budget = bfqq->max_budget;
8347 ++ min_budget = bfq_min_budget(bfqd);
8348 ++
8349 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8350 ++
8351 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
8352 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
8353 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
8354 ++ budget, bfq_min_budget(bfqd));
8355 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
8356 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
8357 ++
8358 ++ if (bfq_bfqq_sync(bfqq)) {
8359 ++ switch (reason) {
8360 ++ /*
8361 ++ * Caveat: in all the following cases we trade latency
8362 ++ * for throughput.
8363 ++ */
8364 ++ case BFQ_BFQQ_TOO_IDLE:
8365 ++ /*
8366 ++ * This is the only case where we may reduce
8367 ++ * the budget: if there is no request of the
8368 ++ * process still waiting for completion, then
8369 ++ * we assume (tentatively) that the timer has
8370 ++ * expired because the batch of requests of
8371 ++ * the process could have been served with a
8372 ++ * smaller budget. Hence, betting that
8373 ++ * process will behave in the same way when it
8374 ++			 * the process will behave in the same way when it
8375 ++ * next budget. As long as we guess right,
8376 ++ * this budget cut reduces the latency
8377 ++ * experienced by the process.
8378 ++ *
8379 ++ * However, if there are still outstanding
8380 ++ * requests, then the process may have not yet
8381 ++ * issued its next request just because it is
8382 ++ * still waiting for the completion of some of
8383 ++ * the still outstanding ones. So in this
8384 ++ * subcase we do not reduce its budget, on the
8385 ++ * contrary we increase it to possibly boost
8386 ++ * the throughput, as discussed in the
8387 ++ * comments to the BUDGET_TIMEOUT case.
8388 ++ */
8389 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
8390 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
8391 ++ else {
8392 ++ if (budget > 5 * min_budget)
8393 ++ budget -= 4 * min_budget;
8394 ++ else
8395 ++ budget = min_budget;
8396 ++ }
8397 ++ break;
8398 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
8399 ++ /*
8400 ++ * We double the budget here because: 1) it
8401 ++ * gives the chance to boost the throughput if
8402 ++ * this is not a seeky process (which may have
8403 ++ * bumped into this timeout because of, e.g.,
8404 ++ * ZBR), 2) together with charge_full_budget
8405 ++ * it helps give seeky processes higher
8406 ++ * timestamps, and hence be served less
8407 ++ * frequently.
8408 ++ */
8409 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
8410 ++ break;
8411 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
8412 ++ /*
8413 ++ * The process still has backlog, and did not
8414 ++ * let either the budget timeout or the disk
8415 ++ * idling timeout expire. Hence it is not
8416 ++ * seeky, has a short thinktime and may be
8417 ++ * happy with a higher budget too. So
8418 ++ * definitely increase the budget of this good
8419 ++ * candidate to boost the disk throughput.
8420 ++ */
8421 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
8422 ++ break;
8423 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
8424 ++ /*
8425 ++ * Leave the budget unchanged.
8426 ++ */
8427 ++ default:
8428 ++ return;
8429 ++ }
8430 ++ } else /* async queue */
8431 ++		/* async queues always get the maximum possible budget
8432 ++ * (their ability to dispatch is limited by
8433 ++ * @bfqd->bfq_max_budget_async_rq).
8434 ++ */
8435 ++ budget = bfqd->bfq_max_budget;
8436 ++
8437 ++ bfqq->max_budget = budget;
8438 ++
8439 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
8440 ++ bfqq->max_budget > bfqd->bfq_max_budget)
8441 ++ bfqq->max_budget = bfqd->bfq_max_budget;
8442 ++
8443 ++ /*
8444 ++ * Make sure that we have enough budget for the next request.
8445 ++ * Since the finish time of the bfqq must be kept in sync with
8446 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
8447 ++ * update.
8448 ++ */
8449 ++ next_rq = bfqq->next_rq;
8450 ++ if (next_rq != NULL)
8451 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
8452 ++ bfq_serv_to_charge(next_rq, bfqq));
8453 ++ else
8454 ++ bfqq->entity.budget = bfqq->max_budget;
8455 ++
8456 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
8457 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
8458 ++ bfqq->entity.budget);
8459 ++}
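/*
 * Illustrative sketch by the editor, not part of the BFQ patch: the
 * budget feedback applied above for sync queues, restated as standalone
 * arithmetic. The enum and helper are hypothetical; only the update
 * rules mirror the code.
 */
enum example_reason { EX_TOO_IDLE, EX_BUDGET_TIMEOUT, EX_BUDGET_EXHAUSTED };

static unsigned long example_next_budget(unsigned long budget,
					 unsigned long max_budget,
					 unsigned long min_budget,
					 enum example_reason reason,
					 int reqs_in_flight)
{
	switch (reason) {
	case EX_TOO_IDLE:
		if (reqs_in_flight)	/* maybe still waiting for completions */
			return budget * 2 <= max_budget ? budget * 2 : max_budget;
		/* the whole batch fit in a smaller budget: shrink it */
		return budget > 5 * min_budget ? budget - 4 * min_budget
					       : min_budget;
	case EX_BUDGET_TIMEOUT:		/* double, to favor non-seeky processes */
		return budget * 2 <= max_budget ? budget * 2 : max_budget;
	case EX_BUDGET_EXHAUSTED:	/* good candidate: increase aggressively */
		return budget * 4 <= max_budget ? budget * 4 : max_budget;
	default:			/* e.g. NO_MORE_REQUESTS: unchanged */
		return budget;
	}
}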
8460 ++
8461 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
8462 ++{
8463 ++ unsigned long max_budget;
8464 ++
8465 ++ /*
8466 ++ * The max_budget calculated when autotuning is equal to the
8467 ++	 * number of sectors transferred in timeout_sync at the
8468 ++ * estimated peak rate.
8469 ++ */
8470 ++ max_budget = (unsigned long)(peak_rate * 1000 *
8471 ++ timeout >> BFQ_RATE_SHIFT);
8472 ++
8473 ++ return max_budget;
8474 ++}
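/*
 * Worked example added by the editor, not part of the BFQ patch:
 * peak_rate is kept in sectors/usec in fixed point (shifted left by
 * BFQ_RATE_SHIFT) and timeout is in ms, so
 *
 *   max_budget = peak_rate * 1000 * timeout >> BFQ_RATE_SHIFT
 *
 * is simply the number of sectors the device can transfer within one
 * sync budget timeout. For instance, at roughly 100 MiB/s (about
 * 0.2048 sectors/usec with 512-byte sectors) and an assumed 125 ms
 * timeout:
 *
 *   0.2048 * 1000 * 125 = 25600 sectors, i.e. about 12.5 MiB.
 */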
8475 ++
8476 ++/*
8477 ++ * In addition to updating the peak rate, checks whether the process
8478 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
8479 ++ * to the budget timeout, to reduce the amount of service provided to
8480 ++ * seeky processes, and hence reduce their chances to lower the
8481 ++ * throughput. See the code for more details.
8482 ++ */
8483 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8484 ++ int compensate, enum bfqq_expiration reason)
8485 ++{
8486 ++ u64 bw, usecs, expected, timeout;
8487 ++ ktime_t delta;
8488 ++ int update = 0;
8489 ++
8490 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
8491 ++ return 0;
8492 ++
8493 ++ if (compensate)
8494 ++ delta = bfqd->last_idling_start;
8495 ++ else
8496 ++ delta = ktime_get();
8497 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
8498 ++ usecs = ktime_to_us(delta);
8499 ++
8500 ++ /* Don't trust short/unrealistic values. */
8501 ++ if (usecs < 100 || usecs >= LONG_MAX)
8502 ++ return 0;
8503 ++
8504 ++ /*
8505 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
8506 ++ * value to store the peak rate, in sectors per usec in fixed
8507 ++ * point math. We do so to have enough precision in the estimate
8508 ++ * and to avoid overflows.
8509 ++ */
8510 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
8511 ++ do_div(bw, (unsigned long)usecs);
8512 ++
8513 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
8514 ++
8515 ++ /*
8516 ++ * Use only long (> 20ms) intervals to filter out spikes for
8517 ++ * the peak rate estimation.
8518 ++ */
8519 ++ if (usecs > 20000) {
8520 ++ if (bw > bfqd->peak_rate ||
8521 ++ (!BFQQ_SEEKY(bfqq) &&
8522 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
8523 ++ bfq_log(bfqd, "measured bw =%llu", bw);
8524 ++ /*
8525 ++ * To smooth oscillations use a low-pass filter with
8526 ++ * alpha=7/8, i.e.,
8527 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
8528 ++ */
8529 ++ do_div(bw, 8);
8530 ++ if (bw == 0)
8531 ++ return 0;
8532 ++ bfqd->peak_rate *= 7;
8533 ++ do_div(bfqd->peak_rate, 8);
8534 ++ bfqd->peak_rate += bw;
8535 ++ update = 1;
8536 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
8537 ++ }
8538 ++
8539 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
8540 ++
8541 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
8542 ++ bfqd->peak_rate_samples++;
8543 ++
8544 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
8545 ++ update && bfqd->bfq_user_max_budget == 0) {
8546 ++ bfqd->bfq_max_budget =
8547 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
8548 ++ bfq_log(bfqd, "new max_budget=%lu",
8549 ++ bfqd->bfq_max_budget);
8550 ++ }
8551 ++ }
8552 ++
8553 ++ /*
8554 ++	 * If the process has been served for too short a time
8555 ++	 * interval to let its possible sequential accesses prevail over
8556 ++	 * the initial seek time needed to move the disk head to the
8557 ++	 * first sector it requested, then give the process a chance
8558 ++ * and for the moment return false.
8559 ++ */
8560 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
8561 ++ return 0;
8562 ++
8563 ++ /*
8564 ++ * A process is considered ``slow'' (i.e., seeky, so that we
8565 ++ * cannot treat it fairly in the service domain, as it would
8566 ++	 * slow down the other processes too much) if, when a slice
8567 ++ * ends for whatever reason, it has received service at a
8568 ++ * rate that would not be high enough to complete the budget
8569 ++ * before the budget timeout expiration.
8570 ++ */
8571 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
8572 ++
8573 ++ /*
8574 ++ * Caveat: processes doing IO in the slower disk zones will
8575 ++ * tend to be slow(er) even if not seeky. And the estimated
8576 ++ * peak rate will actually be an average over the disk
8577 ++ * surface. Hence, to not be too harsh with unlucky processes,
8578 ++ * we keep a budget/3 margin of safety before declaring a
8579 ++ * process slow.
8580 ++ */
8581 ++ return expected > (4 * bfqq->entity.budget) / 3;
8582 ++}
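/*
 * Illustrative sketch by the editor, not part of the BFQ patch: the
 * peak-rate update above is an exponentially weighted moving average
 * with alpha = 1/8,
 *
 *   new_rate = (7/8) * old_rate + (1/8) * measured_bw,
 *
 * computed with integer divisions on fixed-point values. Standalone
 * floating-point equivalent of the filter:
 */
static double example_update_peak_rate(double old_rate, double measured_bw)
{
	return old_rate * 7.0 / 8.0 + measured_bw / 8.0;
}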
8583 ++
8584 ++/*
8585 ++ * To be deemed as soft real-time, an application must meet two requirements.
8586 ++ * First, the application must not require an average bandwidth higher than
8587 ++ * the approximate bandwidth required to play back or record a compressed high-
8588 ++ * definition video.
8589 ++ * The next function is invoked on the completion of the last request of a
8590 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
8591 ++ * that, if the next request of the application does not arrive before
8592 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
8593 ++ *
8594 ++ * The second requirement is that the request pattern of the application is
8595 ++ * isochronous, i.e., that, after issuing a request or a batch of requests,
8596 ++ * the application stops issuing new requests until all its pending requests
8597 ++ * have been completed. After that, the application may issue a new batch,
8598 ++ * and so on.
8599 ++ * For this reason the next function is invoked to compute soft_rt_next_start
8600 ++ * only for applications that meet this requirement, whereas soft_rt_next_start
8601 ++ * is set to infinity for applications that do not.
8602 ++ *
8603 ++ * Unfortunately, even a greedy application may happen to behave in an
8604 ++ * isochronous way if the CPU load is high. In fact, the application may stop
8605 ++ * issuing requests while the CPUs are busy serving other processes, then
8606 ++ * restart, then stop again for a while, and so on. In addition, if the disk
8607 ++ * achieves a low enough throughput with the request pattern issued by the
8608 ++ * application (e.g., because the request pattern is random and/or the device
8609 ++ * is slow), then the application may meet the above bandwidth requirement too.
8610 ++ * To prevent such a greedy application from being deemed soft real-time, a
8611 ++ * further rule is used in the computation of soft_rt_next_start:
8612 ++ * soft_rt_next_start must be higher than the current time plus the maximum
8613 ++ * time for which the arrival of a request is waited for when a sync queue
8614 ++ * becomes idle, namely bfqd->bfq_slice_idle.
8615 ++ * This filters out greedy applications, as the latter instead issue their next
8616 ++ * request as soon as possible after the last one has been completed (in
8617 ++ * contrast, when a batch of requests is completed, a soft real-time application
8618 ++ * spends some time processing data).
8619 ++ *
8620 ++ * Unfortunately, the last filter may easily generate false positives if only
8621 ++ * bfqd->bfq_slice_idle is used as a reference time interval and one or both
8622 ++ * the following cases occur:
8623 ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
8624 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
8625 ++ * HZ=100.
8626 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
8627 ++ * for a while, then suddenly 'jump' by several units to recover the lost
8628 ++ * increments. This seems to happen, e.g., inside virtual machines.
8629 ++ * To address this issue, we do not use as a reference time interval just
8630 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
8631 ++ * particular we add the minimum number of jiffies for which the filter seems
8632 ++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
8633 ++ */
8634 ++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
8635 ++ struct bfq_queue *bfqq)
8636 ++{
8637 ++ return max(bfqq->last_idle_bklogged +
8638 ++ HZ * bfqq->service_from_backlogged /
8639 ++ bfqd->bfq_raising_max_softrt_rate,
8640 ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
8641 ++}
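/*
 * Worked example added by the editor, not part of the BFQ patch: if the
 * soft real-time bandwidth cap is R sectors/sec and the queue received
 * S sectors of service since it last became backlogged at time T, then
 * its average bandwidth drops back below R only from T + S/R on; the
 * max() with jiffies + bfq_slice_idle + 4 is the anti-greedy guard
 * described above. With a hypothetical R = 8000 sectors/sec and
 * S = 2000 sectors, the next batch must not arrive before
 * T + 0.25 s (T + HZ/4 jiffies) for the queue to keep its soft
 * real-time status.
 */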
8642 ++
8643 ++/*
8644 ++ * Return the largest-possible time instant such that, for as long as possible,
8645 ++ * the current time will be lower than this time instant according to the macro
8646 ++ * time_is_before_jiffies().
8647 ++ */
8648 ++static inline unsigned long bfq_infinity_from_now(unsigned long now)
8649 ++{
8650 ++ return now + ULONG_MAX / 2;
8651 ++}
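/*
 * Editor's sketch, not part of the BFQ patch: jiffies comparisons use
 * signed wraparound arithmetic, so an instant counts as "in the future"
 * only while it is less than ULONG_MAX/2 ticks ahead of the current
 * time; now + ULONG_MAX/2 is therefore the farthest representable
 * future instant. Standalone form of the underlying comparison:
 */
static int example_time_before(unsigned long a, unsigned long b)
{
	/* a is before b iff (b - a) has not wrapped past the halfway point */
	return (long)(b - a) > 0;
}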
8652 ++
8653 ++/**
8654 ++ * bfq_bfqq_expire - expire a queue.
8655 ++ * @bfqd: device owning the queue.
8656 ++ * @bfqq: the queue to expire.
8657 ++ * @compensate: if true, compensate for the time spent idling.
8658 ++ * @reason: the reason causing the expiration.
8659 ++ *
8660 ++ *
8661 ++ * If the process associated to the queue is slow (i.e., seeky), or in
8662 ++ * case of budget timeout, or, finally, if it is async, we
8663 ++ * artificially charge it an entire budget (independently of the
8664 ++ * actual service it received). As a consequence, the queue will get
8665 ++ * higher timestamps than the correct ones upon reactivation, and
8666 ++ * hence it will be rescheduled as if it had received more service
8667 ++ * than what it actually received. In the end, this class of processes
8668 ++ * will receive less service in proportion to how slowly they consume
8669 ++ * their budgets (and hence how seriously they tend to lower the
8670 ++ * throughput).
8671 ++ *
8672 ++ * In contrast, when a queue expires because it has been idling for
8673 ++ * too long or because it exhausted its budget, we do not touch the
8674 ++ * amount of service it has received. Hence when the queue will be
8675 ++ * reactivated and its timestamps updated, the latter will be in sync
8676 ++ * with the actual service received by the queue until expiration.
8677 ++ *
8678 ++ * Charging a full budget to the first type of queues and the exact
8679 ++ * service to the others has the effect of using the WF2Q+ policy to
8680 ++ * schedule the former on a timeslice basis, without violating the
8681 ++ * service domain guarantees of the latter.
8682 ++ */
8683 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
8684 ++ struct bfq_queue *bfqq,
8685 ++ int compensate,
8686 ++ enum bfqq_expiration reason)
8687 ++{
8688 ++ int slow;
8689 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8690 ++
8691 ++ /* Update disk peak rate for autotuning and check whether the
8692 ++ * process is slow (see bfq_update_peak_rate).
8693 ++ */
8694 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
8695 ++
8696 ++ /*
8697 ++	 * As explained above, 'punish' slow (i.e., seeky), timed-out
8698 ++ * and async queues, to favor sequential sync workloads.
8699 ++ *
8700 ++ * Processes doing IO in the slower disk zones will tend to be
8701 ++ * slow(er) even if not seeky. Hence, since the estimated peak
8702 ++ * rate is actually an average over the disk surface, these
8703 ++	 * processes may time out just due to bad luck. To avoid punishing
8704 ++ * them we do not charge a full budget to a process that
8705 ++ * succeeded in consuming at least 2/3 of its budget.
8706 ++ */
8707 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
8708 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
8709 ++ bfq_bfqq_charge_full_budget(bfqq);
8710 ++
8711 ++ bfqq->service_from_backlogged += bfqq->entity.service;
8712 ++
8713 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
8714 ++ bfqq->last_rais_start_finish = jiffies;
8715 ++
8716 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
8717 ++ RB_EMPTY_ROOT(&bfqq->sort_list)) {
8718 ++ /*
8719 ++ * If we get here, and there are no outstanding requests,
8720 ++ * then the request pattern is isochronous (see the comments
8721 ++ * to the function bfq_bfqq_softrt_next_start()). Hence we can
8722 ++ * compute soft_rt_next_start. If, instead, the queue still
8723 ++ * has outstanding requests, then we have to wait for the
8724 ++ * completion of all the outstanding requests to discover
8725 ++ * whether the request pattern is actually isochronous.
8726 ++ */
8727 ++ if (bfqq->dispatched == 0)
8728 ++ bfqq->soft_rt_next_start =
8729 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
8730 ++ else {
8731 ++ /*
8732 ++ * The application is still waiting for the
8733 ++ * completion of one or more requests:
8734 ++ * prevent it from possibly being incorrectly
8735 ++ * deemed as soft real-time by setting its
8736 ++ * soft_rt_next_start to infinity. In fact,
8737 ++ * without this assignment, the application
8738 ++ * would be incorrectly deemed as soft
8739 ++ * real-time if:
8740 ++ * 1) it issued a new request before the
8741 ++ * completion of all its in-flight
8742 ++ * requests, and
8743 ++ * 2) at that time, its soft_rt_next_start
8744 ++ * happened to be in the past.
8745 ++ */
8746 ++ bfqq->soft_rt_next_start =
8747 ++ bfq_infinity_from_now(jiffies);
8748 ++ /*
8749 ++ * Schedule an update of soft_rt_next_start to when
8750 ++ * the task may be discovered to be isochronous.
8751 ++ */
8752 ++ bfq_mark_bfqq_softrt_update(bfqq);
8753 ++ }
8754 ++ }
8755 ++
8756 ++ bfq_log_bfqq(bfqd, bfqq,
8757 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
8758 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
8759 ++
8760 ++ /* Increase, decrease or leave budget unchanged according to reason */
8761 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
8762 ++ __bfq_bfqq_expire(bfqd, bfqq);
8763 ++}
8764 ++
8765 ++/*
8766 ++ * Budget timeout is not implemented through a dedicated timer, but
8767 ++ * just checked on request arrivals and completions, as well as on
8768 ++ * idle timer expirations.
8769 ++ */
8770 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
8771 ++{
8772 ++ if (bfq_bfqq_budget_new(bfqq))
8773 ++ return 0;
8774 ++
8775 ++ if (time_before(jiffies, bfqq->budget_timeout))
8776 ++ return 0;
8777 ++
8778 ++ return 1;
8779 ++}
8780 ++
8781 ++/*
8782 ++ * If we expire a queue that is waiting for the arrival of a new
8783 ++ * request, we may prevent the fictitious timestamp back-shifting that
8784 ++ * allows the guarantees of the queue to be preserved (see [1] for
8785 ++ * this tricky aspect). Hence we return true only if this condition
8786 ++ * does not hold, or if the queue is slow enough to deserve only to be
8787 ++ * kicked off for preserving a high throughput.
8788 ++ */
8789 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
8790 ++{
8791 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
8792 ++ "may_budget_timeout: wr %d left %d timeout %d",
8793 ++ bfq_bfqq_wait_request(bfqq),
8794 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
8795 ++ bfq_bfqq_budget_timeout(bfqq));
8796 ++
8797 ++ return (!bfq_bfqq_wait_request(bfqq) ||
8798 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
8799 ++ &&
8800 ++ bfq_bfqq_budget_timeout(bfqq);
8801 ++}
8802 ++
8803 ++/*
8804 ++ * For weight-raised queues issuing sync requests, idling is always performed,
8805 ++ * as this is instrumental in guaranteeing a high fraction of the throughput
8806 ++ * to these queues, and hence in guaranteeing a lower latency for their
8807 ++ * requests. See [1] for details.
8808 ++ *
8809 ++ * For non-weight-raised queues, idling is instead disabled if the device is
8810 ++ * NCQ-enabled and non-rotational, as this boosts the throughput on such
8811 ++ * devices.
8812 ++ */
8813 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
8814 ++{
8815 ++ struct bfq_data *bfqd = bfqq->bfqd;
8816 ++
8817 ++ return bfq_bfqq_sync(bfqq) && (
8818 ++ bfqq->raising_coeff > 1 ||
8819 ++ (bfq_bfqq_idle_window(bfqq) &&
8820 ++ !(bfqd->hw_tag &&
8821 ++ (blk_queue_nonrot(bfqd->queue) ||
8822 ++ /*
8823 ++ * If there are weight-raised busy queues, then do not idle
8824 ++ * the disk for a sync non-weight-raised queue, and hence
8825 ++ * expire the queue immediately if empty. Combined with the
8826 ++ * timestamping rules of BFQ (see [1] for details), this
8827 ++ * causes sync non-weight-raised queues to get a lower
8828 ++ * fraction of the disk throughput, and hence reduces the rate
8829 ++ * at which the processes associated to these queues ask for
8830 ++ * requests from the request pool.
8831 ++ *
8832 ++ * This is beneficial for weight-raised processes, when the
8833 ++ * system operates in request-pool saturation conditions
8834 ++ * (e.g., in the presence of write hogs). In fact, if
8835 ++ * non-weight-raised processes ask for requests at a lower
8836 ++ * rate, then weight-raised processes have a higher
8837 ++ * probability to get a request from the pool immediately
8838 ++ * (or at least soon) when they need one. Hence they have a
8839 ++ * higher probability to actually get a fraction of the disk
8840 ++ * throughput proportional to their high weight. This is
8841 ++ * especially true with NCQ-enabled drives, which enqueue
8842 ++ * several requests in advance and further reorder
8843 ++ * internally-queued requests.
8844 ++ *
8845 ++ * Mistreating non-weight-raised queues in the above-described
8846 ++ * way, when there are busy weight-raised queues, seems to
8847 ++ * mitigate starvation problems in the presence of heavy write
8848 ++ * workloads and NCQ, and hence to guarantee a higher
8849 ++ * application and system responsiveness in these hostile
8850 ++ * scenarios.
8851 ++ */
8852 ++ bfqd->raised_busy_queues > 0)
8853 ++ )
8854 ++ )
8855 ++ );
8856 ++}
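/*
 * Editor's restatement, not part of the BFQ patch: the condition above,
 * stripped of its comments, says that a sync queue must not be expired
 * (idling is worth it) if it is weight-raised, or if its idle window is
 * set and the device is not an NCQ drive that is either non-rotational
 * or currently serving weight-raised busy queues. Standalone boolean
 * sketch with plain flags:
 */
static int example_must_not_expire(int sync, int weight_raised,
				   int idle_window, int hw_tag,
				   int nonrot, int raised_busy_queues)
{
	return sync && (weight_raised ||
			(idle_window &&
			 !(hw_tag && (nonrot || raised_busy_queues > 0))));
}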
8857 ++
8858 ++/*
8859 ++ * If the in-service queue is empty, but it is sync and either of the following
8860 ++ * conditions holds, then: 1) the queue must remain in service and cannot be
8861 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
8862 ++ * of a new request for the queue. The conditions are:
8863 ++ * - the device is rotational and not performing NCQ, and the queue has its
8864 ++ * idle window set (in this case, waiting for a new request for the queue
8865 ++ * is likely to boost the disk throughput);
8866 ++ * - the queue is weight-raised (waiting for the request is necessary to
8867 ++ * provide the queue with fairness and latency guarantees, see [1] for
8868 ++ * details).
8869 ++ */
8870 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
8871 ++{
8872 ++ struct bfq_data *bfqd = bfqq->bfqd;
8873 ++
8874 ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
8875 ++ bfq_bfqq_must_not_expire(bfqq) &&
8876 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq);
8877 ++}
8878 ++
8879 ++/*
8880 ++ * Select a queue for service. If we have a current queue in service,
8881 ++ * check whether to continue servicing it, or retrieve and set a new one.
8882 ++ */
8883 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
8884 ++{
8885 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
8886 ++ struct request *next_rq;
8887 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
8888 ++
8889 ++ bfqq = bfqd->in_service_queue;
8890 ++ if (bfqq == NULL)
8891 ++ goto new_queue;
8892 ++
8893 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
8894 ++
8895 ++ /*
8896 ++ * If another queue has a request waiting within our mean seek
8897 ++ * distance, let it run. The expire code will check for close
8898 ++ * cooperators and put the close queue at the front of the
8899 ++ * service tree. If possible, merge the expiring queue with the
8900 ++ * new bfqq.
8901 ++ */
8902 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
8903 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
8904 ++ bfq_setup_merge(bfqq, new_bfqq);
8905 ++
8906 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
8907 ++ !timer_pending(&bfqd->idle_slice_timer) &&
8908 ++ !bfq_bfqq_must_idle(bfqq))
8909 ++ goto expire;
8910 ++
8911 ++ next_rq = bfqq->next_rq;
8912 ++ /*
8913 ++ * If bfqq has requests queued and it has enough budget left to
8914 ++ * serve them, keep the queue, otherwise expire it.
8915 ++ */
8916 ++ if (next_rq != NULL) {
8917 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
8918 ++ bfq_bfqq_budget_left(bfqq)) {
8919 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
8920 ++ goto expire;
8921 ++ } else {
8922 ++ /*
8923 ++ * The idle timer may be pending because we may not
8924 ++ * disable disk idling even when a new request arrives
8925 ++ */
8926 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
8927 ++ /*
8928 ++				 * If we get here: 1) at least one new request
8929 ++ * has arrived but we have not disabled the
8930 ++ * timer because the request was too small,
8931 ++ * 2) then the block layer has unplugged the
8932 ++ * device, causing the dispatch to be invoked.
8933 ++ *
8934 ++ * Since the device is unplugged, now the
8935 ++ * requests are probably large enough to
8936 ++ * provide a reasonable throughput.
8937 ++ * So we disable idling.
8938 ++ */
8939 ++ bfq_clear_bfqq_wait_request(bfqq);
8940 ++ del_timer(&bfqd->idle_slice_timer);
8941 ++ }
8942 ++ if (new_bfqq == NULL)
8943 ++ goto keep_queue;
8944 ++ else
8945 ++ goto expire;
8946 ++ }
8947 ++ }
8948 ++
8949 ++ /*
8950 ++ * No requests pending. If the in-service queue has no cooperator and
8951 ++ * still has requests in flight (possibly waiting for a completion)
8952 ++ * or is idling for a new request, then keep it.
8953 ++ */
8954 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
8955 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
8956 ++ bfqq = NULL;
8957 ++ goto keep_queue;
8958 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
8959 ++ /*
8960 ++ * Expiring the queue because there is a close cooperator,
8961 ++ * cancel timer.
8962 ++ */
8963 ++ bfq_clear_bfqq_wait_request(bfqq);
8964 ++ del_timer(&bfqd->idle_slice_timer);
8965 ++ }
8966 ++
8967 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
8968 ++expire:
8969 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
8970 ++new_queue:
8971 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
8972 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
8973 ++ bfqq != NULL ? bfqq->pid : 0);
8974 ++keep_queue:
8975 ++ return bfqq;
8976 ++}
8977 ++
8978 ++static void bfq_update_raising_data(struct bfq_data *bfqd,
8979 ++ struct bfq_queue *bfqq)
8980 ++{
8981 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
8982 ++ struct bfq_entity *entity = &bfqq->entity;
8983 ++
8984 ++ bfq_log_bfqq(bfqd, bfqq,
8985 ++ "raising period dur %u/%u msec, "
8986 ++ "old raising coeff %u, w %d(%d)",
8987 ++ jiffies_to_msecs(jiffies -
8988 ++ bfqq->last_rais_start_finish),
8989 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
8990 ++ bfqq->raising_coeff,
8991 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
8992 ++
8993 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
8994 ++ entity->orig_weight * bfqq->raising_coeff);
8995 ++ if (entity->ioprio_changed)
8996 ++ bfq_log_bfqq(bfqd, bfqq,
8997 ++ "WARN: pending prio change");
8998 ++ /*
8999 ++ * If too much time has elapsed from the beginning
9000 ++ * of this weight-raising, stop it.
9001 ++ */
9002 ++ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
9003 ++ bfqq->raising_cur_max_time)) {
9004 ++ bfqq->last_rais_start_finish = jiffies;
9005 ++ bfq_log_bfqq(bfqd, bfqq,
9006 ++ "wrais ending at %lu, "
9007 ++ "rais_max_time %u",
9008 ++ bfqq->last_rais_start_finish,
9009 ++ jiffies_to_msecs(bfqq->
9010 ++ raising_cur_max_time));
9011 ++ bfq_bfqq_end_raising(bfqq);
9012 ++ __bfq_entity_update_weight_prio(
9013 ++ bfq_entity_service_tree(entity),
9014 ++ entity);
9015 ++ }
9016 ++ }
9017 ++}
9018 ++
9019 ++/*
9020 ++ * Dispatch one request from bfqq, moving it to the request queue
9021 ++ * dispatch list.
9022 ++ */
9023 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
9024 ++ struct bfq_queue *bfqq)
9025 ++{
9026 ++ int dispatched = 0;
9027 ++ struct request *rq;
9028 ++ unsigned long service_to_charge;
9029 ++
9030 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
9031 ++
9032 ++ /* Follow expired path, else get first next available. */
9033 ++ rq = bfq_check_fifo(bfqq);
9034 ++ if (rq == NULL)
9035 ++ rq = bfqq->next_rq;
9036 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
9037 ++
9038 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
9039 ++ /*
9040 ++ * This may happen if the next rq is chosen
9041 ++ * in fifo order instead of sector order.
9042 ++ * The budget is properly dimensioned
9043 ++ * to be always sufficient to serve the next request
9044 ++ * only if it is chosen in sector order. The reason is
9045 ++		 * that it would be quite inefficient and of little use
9046 ++ * to always make sure that the budget is large enough
9047 ++ * to serve even the possible next rq in fifo order.
9048 ++ * In fact, requests are seldom served in fifo order.
9049 ++ *
9050 ++ * Expire the queue for budget exhaustion, and
9051 ++ * make sure that the next act_budget is enough
9052 ++ * to serve the next request, even if it comes
9053 ++ * from the fifo expired path.
9054 ++ */
9055 ++ bfqq->next_rq = rq;
9056 ++ /*
9057 ++			 * Since this dispatch has failed, make sure that
9058 ++ * a new one will be performed
9059 ++ */
9060 ++ if (!bfqd->rq_in_driver)
9061 ++ bfq_schedule_dispatch(bfqd);
9062 ++ goto expire;
9063 ++ }
9064 ++
9065 ++ /* Finally, insert request into driver dispatch list. */
9066 ++ bfq_bfqq_served(bfqq, service_to_charge);
9067 ++ bfq_dispatch_insert(bfqd->queue, rq);
9068 ++
9069 ++ bfq_update_raising_data(bfqd, bfqq);
9070 ++
9071 ++ bfq_log_bfqq(bfqd, bfqq,
9072 ++ "dispatched %u sec req (%llu), budg left %lu",
9073 ++ blk_rq_sectors(rq),
9074 ++ (long long unsigned)blk_rq_pos(rq),
9075 ++ bfq_bfqq_budget_left(bfqq));
9076 ++
9077 ++ dispatched++;
9078 ++
9079 ++ if (bfqd->in_service_bic == NULL) {
9080 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9081 ++ bfqd->in_service_bic = RQ_BIC(rq);
9082 ++ }
9083 ++
9084 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9085 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9086 ++ bfq_class_idle(bfqq)))
9087 ++ goto expire;
9088 ++
9089 ++ return dispatched;
9090 ++
9091 ++expire:
9092 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9093 ++ return dispatched;
9094 ++}
9095 ++
9096 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9097 ++{
9098 ++ int dispatched = 0;
9099 ++
9100 ++ while (bfqq->next_rq != NULL) {
9101 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9102 ++ dispatched++;
9103 ++ }
9104 ++
9105 ++ BUG_ON(!list_empty(&bfqq->fifo));
9106 ++ return dispatched;
9107 ++}
9108 ++
9109 ++/*
9110 ++ * Drain our current requests. Used for barriers and when switching
9111 ++ * io schedulers on-the-fly.
9112 ++ */
9113 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
9114 ++{
9115 ++ struct bfq_queue *bfqq, *n;
9116 ++ struct bfq_service_tree *st;
9117 ++ int dispatched = 0;
9118 ++
9119 ++ bfqq = bfqd->in_service_queue;
9120 ++ if (bfqq != NULL)
9121 ++ __bfq_bfqq_expire(bfqd, bfqq);
9122 ++
9123 ++ /*
9124 ++ * Loop through classes, and be careful to leave the scheduler
9125 ++ * in a consistent state, as feedback mechanisms and vtime
9126 ++ * updates cannot be disabled during the process.
9127 ++ */
9128 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9129 ++ st = bfq_entity_service_tree(&bfqq->entity);
9130 ++
9131 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9132 ++ bfqq->max_budget = bfq_max_budget(bfqd);
9133 ++
9134 ++ bfq_forget_idle(st);
9135 ++ }
9136 ++
9137 ++ BUG_ON(bfqd->busy_queues != 0);
9138 ++
9139 ++ return dispatched;
9140 ++}
9141 ++
9142 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
9143 ++{
9144 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9145 ++ struct bfq_queue *bfqq;
9146 ++ int max_dispatch;
9147 ++
9148 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9149 ++ if (bfqd->busy_queues == 0)
9150 ++ return 0;
9151 ++
9152 ++ if (unlikely(force))
9153 ++ return bfq_forced_dispatch(bfqd);
9154 ++
9155 ++ bfqq = bfq_select_queue(bfqd);
9156 ++ if (bfqq == NULL)
9157 ++ return 0;
9158 ++
9159 ++ max_dispatch = bfqd->bfq_quantum;
9160 ++ if (bfq_class_idle(bfqq))
9161 ++ max_dispatch = 1;
9162 ++
9163 ++ if (!bfq_bfqq_sync(bfqq))
9164 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
9165 ++
9166 ++ if (bfqq->dispatched >= max_dispatch) {
9167 ++ if (bfqd->busy_queues > 1)
9168 ++ return 0;
9169 ++ if (bfqq->dispatched >= 4 * max_dispatch)
9170 ++ return 0;
9171 ++ }
9172 ++
9173 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
9174 ++ return 0;
9175 ++
9176 ++ bfq_clear_bfqq_wait_request(bfqq);
9177 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9178 ++
9179 ++ if (!bfq_dispatch_request(bfqd, bfqq))
9180 ++ return 0;
9181 ++
9182 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
9183 ++ bfqq->pid, max_dispatch);
9184 ++
9185 ++ return 1;
9186 ++}
9187 ++
9188 ++/*
9189 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
9190 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
9191 ++ *
9192 ++ * Queue lock must be held here.
9193 ++ */
9194 ++static void bfq_put_queue(struct bfq_queue *bfqq)
9195 ++{
9196 ++ struct bfq_data *bfqd = bfqq->bfqd;
9197 ++
9198 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
9199 ++
9200 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
9201 ++ atomic_read(&bfqq->ref));
9202 ++ if (!atomic_dec_and_test(&bfqq->ref))
9203 ++ return;
9204 ++
9205 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
9206 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
9207 ++ BUG_ON(bfqq->entity.tree != NULL);
9208 ++ BUG_ON(bfq_bfqq_busy(bfqq));
9209 ++ BUG_ON(bfqd->in_service_queue == bfqq);
9210 ++
9211 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
9212 ++
9213 ++ kmem_cache_free(bfq_pool, bfqq);
9214 ++}
9215 ++
9216 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
9217 ++{
9218 ++ struct bfq_queue *__bfqq, *next;
9219 ++
9220 ++ /*
9221 ++ * If this queue was scheduled to merge with another queue, be
9222 ++ * sure to drop the reference taken on that queue (and others in
9223 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
9224 ++ */
9225 ++ __bfqq = bfqq->new_bfqq;
9226 ++ while (__bfqq) {
9227 ++ if (__bfqq == bfqq) {
9228 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
9229 ++ break;
9230 ++ }
9231 ++ next = __bfqq->new_bfqq;
9232 ++ bfq_put_queue(__bfqq);
9233 ++ __bfqq = next;
9234 ++ }
9235 ++}
9236 ++
9237 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9238 ++{
9239 ++ if (bfqq == bfqd->in_service_queue) {
9240 ++ __bfq_bfqq_expire(bfqd, bfqq);
9241 ++ bfq_schedule_dispatch(bfqd);
9242 ++ }
9243 ++
9244 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
9245 ++ atomic_read(&bfqq->ref));
9246 ++
9247 ++ bfq_put_cooperator(bfqq);
9248 ++
9249 ++ bfq_put_queue(bfqq);
9250 ++}
9251 ++
9252 ++static void bfq_init_icq(struct io_cq *icq)
9253 ++{
9254 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
9255 ++
9256 ++ bic->ttime.last_end_request = jiffies;
9257 ++}
9258 ++
9259 ++static void bfq_exit_icq(struct io_cq *icq)
9260 ++{
9261 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
9262 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
9263 ++
9264 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
9265 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
9266 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
9267 ++ }
9268 ++
9269 ++ if (bic->bfqq[BLK_RW_SYNC]) {
9270 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
9271 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
9272 ++ }
9273 ++}
9274 ++
9275 ++/*
9276 ++ * Update the entity prio values; note that the new values will not
9277 ++ * be used until the next (re)activation.
9278 ++ */
9279 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
9280 ++{
9281 ++ struct task_struct *tsk = current;
9282 ++ int ioprio_class;
9283 ++
9284 ++ if (!bfq_bfqq_prio_changed(bfqq))
9285 ++ return;
9286 ++
9287 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9288 ++ switch (ioprio_class) {
9289 ++ default:
9290 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
9291 ++ "bfq: bad prio %x\n", ioprio_class);
9292 ++ case IOPRIO_CLASS_NONE:
9293 ++ /*
9294 ++ * No prio set, inherit CPU scheduling settings.
9295 ++ */
9296 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
9297 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
9298 ++ break;
9299 ++ case IOPRIO_CLASS_RT:
9300 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9301 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
9302 ++ break;
9303 ++ case IOPRIO_CLASS_BE:
9304 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9305 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
9306 ++ break;
9307 ++ case IOPRIO_CLASS_IDLE:
9308 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
9309 ++ bfqq->entity.new_ioprio = 7;
9310 ++ bfq_clear_bfqq_idle_window(bfqq);
9311 ++ break;
9312 ++ }
9313 ++
9314 ++ bfqq->entity.ioprio_changed = 1;
9315 ++
9316 ++ /*
9317 ++ * Keep track of original prio settings in case we have to temporarily
9318 ++ * elevate the priority of this queue.
9319 ++ */
9320 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
9321 ++ bfq_clear_bfqq_prio_changed(bfqq);
9322 ++}
9323 ++
9324 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
9325 ++{
9326 ++ struct bfq_data *bfqd;
9327 ++ struct bfq_queue *bfqq, *new_bfqq;
9328 ++ struct bfq_group *bfqg;
9329 ++ unsigned long uninitialized_var(flags);
9330 ++ int ioprio = bic->icq.ioc->ioprio;
9331 ++
9332 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
9333 ++ &flags);
9334 ++ /*
9335 ++	 * This condition may trigger on a newly created bic; be sure to drop
9336 ++ * the lock before returning.
9337 ++ */
9338 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
9339 ++ goto out;
9340 ++
9341 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
9342 ++ if (bfqq != NULL) {
9343 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
9344 ++ sched_data);
9345 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
9346 ++ GFP_ATOMIC);
9347 ++ if (new_bfqq != NULL) {
9348 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
9349 ++ bfq_log_bfqq(bfqd, bfqq,
9350 ++ "changed_ioprio: bfqq %p %d",
9351 ++ bfqq, atomic_read(&bfqq->ref));
9352 ++ bfq_put_queue(bfqq);
9353 ++ }
9354 ++ }
9355 ++
9356 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
9357 ++ if (bfqq != NULL)
9358 ++ bfq_mark_bfqq_prio_changed(bfqq);
9359 ++
9360 ++ bic->ioprio = ioprio;
9361 ++
9362 ++out:
9363 ++ bfq_put_bfqd_unlock(bfqd, &flags);
9364 ++}
9365 ++
9366 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9367 ++ pid_t pid, int is_sync)
9368 ++{
9369 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
9370 ++ INIT_LIST_HEAD(&bfqq->fifo);
9371 ++
9372 ++ atomic_set(&bfqq->ref, 0);
9373 ++ bfqq->bfqd = bfqd;
9374 ++
9375 ++ bfq_mark_bfqq_prio_changed(bfqq);
9376 ++
9377 ++ if (is_sync) {
9378 ++ if (!bfq_class_idle(bfqq))
9379 ++ bfq_mark_bfqq_idle_window(bfqq);
9380 ++ bfq_mark_bfqq_sync(bfqq);
9381 ++ }
9382 ++
9383 ++ /* Tentative initial value to trade off between thr and lat */
9384 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
9385 ++ bfqq->pid = pid;
9386 ++
9387 ++ bfqq->raising_coeff = 1;
9388 ++ bfqq->last_rais_start_finish = 0;
9389 ++ /*
9390 ++ * Set to the value for which bfqq will not be deemed as
9391 ++ * soft rt when it becomes backlogged.
9392 ++ */
9393 ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
9394 ++}
9395 ++
9396 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
9397 ++ struct bfq_group *bfqg,
9398 ++ int is_sync,
9399 ++ struct bfq_io_cq *bic,
9400 ++ gfp_t gfp_mask)
9401 ++{
9402 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
9403 ++
9404 ++retry:
9405 ++ /* bic always exists here */
9406 ++ bfqq = bic_to_bfqq(bic, is_sync);
9407 ++
9408 ++ /*
9409 ++ * Always try a new alloc if we fall back to the OOM bfqq
9410 ++ * originally, since it should just be a temporary situation.
9411 ++ */
9412 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9413 ++ bfqq = NULL;
9414 ++ if (new_bfqq != NULL) {
9415 ++ bfqq = new_bfqq;
9416 ++ new_bfqq = NULL;
9417 ++ } else if (gfp_mask & __GFP_WAIT) {
9418 ++ spin_unlock_irq(bfqd->queue->queue_lock);
9419 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
9420 ++ gfp_mask | __GFP_ZERO,
9421 ++ bfqd->queue->node);
9422 ++ spin_lock_irq(bfqd->queue->queue_lock);
9423 ++ if (new_bfqq != NULL)
9424 ++ goto retry;
9425 ++ } else {
9426 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
9427 ++ gfp_mask | __GFP_ZERO,
9428 ++ bfqd->queue->node);
9429 ++ }
9430 ++
9431 ++ if (bfqq != NULL) {
9432 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
9433 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
9434 ++ } else {
9435 ++ bfqq = &bfqd->oom_bfqq;
9436 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
9437 ++ }
9438 ++
9439 ++ bfq_init_prio_data(bfqq, bic);
9440 ++ bfq_init_entity(&bfqq->entity, bfqg);
9441 ++ }
9442 ++
9443 ++ if (new_bfqq != NULL)
9444 ++ kmem_cache_free(bfq_pool, new_bfqq);
9445 ++
9446 ++ return bfqq;
9447 ++}
9448 ++
9449 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
9450 ++ struct bfq_group *bfqg,
9451 ++ int ioprio_class, int ioprio)
9452 ++{
9453 ++ switch (ioprio_class) {
9454 ++ case IOPRIO_CLASS_RT:
9455 ++ return &bfqg->async_bfqq[0][ioprio];
9456 ++ case IOPRIO_CLASS_NONE:
9457 ++ ioprio = IOPRIO_NORM;
9458 ++ /* fall through */
9459 ++ case IOPRIO_CLASS_BE:
9460 ++ return &bfqg->async_bfqq[1][ioprio];
9461 ++ case IOPRIO_CLASS_IDLE:
9462 ++ return &bfqg->async_idle_bfqq;
9463 ++ default:
9464 ++ BUG();
9465 ++ }
9466 ++}
9467 ++
9468 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
9469 ++ struct bfq_group *bfqg, int is_sync,
9470 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
9471 ++{
9472 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9473 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9474 ++ struct bfq_queue **async_bfqq = NULL;
9475 ++ struct bfq_queue *bfqq = NULL;
9476 ++
9477 ++ if (!is_sync) {
9478 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
9479 ++ ioprio);
9480 ++ bfqq = *async_bfqq;
9481 ++ }
9482 ++
9483 ++ if (bfqq == NULL)
9484 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9485 ++
9486 ++ /*
9487 ++	 * Pin the queue now that it's allocated; scheduler exit will prune it.
9488 ++ */
9489 ++ if (!is_sync && *async_bfqq == NULL) {
9490 ++ atomic_inc(&bfqq->ref);
9491 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
9492 ++ bfqq, atomic_read(&bfqq->ref));
9493 ++ *async_bfqq = bfqq;
9494 ++ }
9495 ++
9496 ++ atomic_inc(&bfqq->ref);
9497 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
9498 ++ atomic_read(&bfqq->ref));
9499 ++ return bfqq;
9500 ++}
9501 ++
9502 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
9503 ++ struct bfq_io_cq *bic)
9504 ++{
9505 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
9506 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
9507 ++
9508 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
9509 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
9510 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
9511 ++ bic->ttime.ttime_samples;
9512 ++}
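/*
 * Illustrative sketch by the editor, not part of the BFQ patch: the
 * think-time statistics above are exponentially weighted moving
 * averages kept in integer arithmetic. The sample count converges to
 * 256 (the fixed point of x = (7x + 256) / 8), so total/samples is a
 * recency-weighted mean, and the "+ 128" rounds the final division.
 * Standalone equivalent:
 */
struct example_ttime { unsigned long samples, total, mean; };

static void example_update_ttime(struct example_ttime *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * ttime) / 8;
	t->mean    = (t->total + 128) / t->samples;	/* rounded average */
}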
9513 ++
9514 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
9515 ++ struct bfq_queue *bfqq,
9516 ++ struct request *rq)
9517 ++{
9518 ++ sector_t sdist;
9519 ++ u64 total;
9520 ++
9521 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
9522 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
9523 ++ else
9524 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
9525 ++
9526 ++ /*
9527 ++ * Don't allow the seek distance to get too large from the
9528 ++ * odd fragment, pagein, etc.
9529 ++ */
9530 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
9531 ++ sdist = 0;
9532 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
9533 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
9534 ++ else
9535 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
9536 ++
9537 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
9538 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
9539 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
9540 ++ do_div(total, bfqq->seek_samples);
9541 ++ bfqq->seek_mean = (sector_t)total;
9542 ++
9543 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
9544 ++ (u64)bfqq->seek_mean);
9545 ++}
9546 ++
9547 ++/*
9548 ++ * Disable idle window if the process thinks too long or seeks so much that
9549 ++ * it doesn't matter.
9550 ++ */
9551 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
9552 ++ struct bfq_queue *bfqq,
9553 ++ struct bfq_io_cq *bic)
9554 ++{
9555 ++ int enable_idle;
9556 ++
9557 ++ /* Don't idle for async or idle io prio class. */
9558 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
9559 ++ return;
9560 ++
9561 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
9562 ++
9563 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
9564 ++ bfqd->bfq_slice_idle == 0 ||
9565 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
9566 ++ bfqq->raising_coeff == 1))
9567 ++ enable_idle = 0;
9568 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
9569 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
9570 ++ bfqq->raising_coeff == 1)
9571 ++ enable_idle = 0;
9572 ++ else
9573 ++ enable_idle = 1;
9574 ++ }
9575 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
9576 ++ enable_idle);
9577 ++
9578 ++ if (enable_idle)
9579 ++ bfq_mark_bfqq_idle_window(bfqq);
9580 ++ else
9581 ++ bfq_clear_bfqq_idle_window(bfqq);
9582 ++}
9583 ++
9584 ++/*
9585 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
9586 ++ * something we should do about it.
9587 ++ */
9588 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9589 ++ struct request *rq)
9590 ++{
9591 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
9592 ++
9593 ++ if (rq->cmd_flags & REQ_META)
9594 ++ bfqq->meta_pending++;
9595 ++
9596 ++ bfq_update_io_thinktime(bfqd, bic);
9597 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
9598 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
9599 ++ !BFQQ_SEEKY(bfqq))
9600 ++ bfq_update_idle_window(bfqd, bfqq, bic);
9601 ++
9602 ++ bfq_log_bfqq(bfqd, bfqq,
9603 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
9604 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
9605 ++ (long long unsigned)bfqq->seek_mean);
9606 ++
9607 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
9608 ++
9609 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
9610 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
9611 ++ blk_rq_sectors(rq) < 32;
9612 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
9613 ++
9614 ++ /*
9615 ++ * There is just this request queued: if the request
9616 ++ * is small and the queue is not to be expired, then
9617 ++ * just exit.
9618 ++ *
9619 ++ * In this way, if the disk is being idled to wait for
9620 ++ * a new request from the in-service queue, we avoid
9621 ++ * unplugging the device and committing the disk to serve
9622 ++ * just a small request. Instead, we wait for
9623 ++ * the block layer to decide when to unplug the device:
9624 ++ * hopefully, new requests will be merged to this one
9625 ++ * quickly, then the device will be unplugged and
9626 ++ * larger requests will be dispatched.
9627 ++ */
9628 ++ if (small_req && !budget_timeout)
9629 ++ return;
9630 ++
9631 ++ /*
9632 ++ * A large enough request arrived, or the queue is to
9633 ++ * be expired: in both cases disk idling is to be
9634 ++ * stopped, so clear wait_request flag and reset
9635 ++ * timer.
9636 ++ */
9637 ++ bfq_clear_bfqq_wait_request(bfqq);
9638 ++ del_timer(&bfqd->idle_slice_timer);
9639 ++
9640 ++ /*
9641 ++ * The queue is not empty, because a new request just
9642 ++ * arrived. Hence we can safely expire the queue, in
9643 ++ * case of budget timeout, without risking that the
9644 ++ * timestamps of the queue are not updated correctly.
9645 ++ * See [1] for more details.
9646 ++ */
9647 ++ if (budget_timeout)
9648 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9649 ++
9650 ++ /*
9651 ++ * Let the request rip immediately, or let a new queue be
9652 ++ * selected if bfqq has just been expired.
9653 ++ */
9654 ++ __blk_run_queue(bfqd->queue);
9655 ++ }
9656 ++}
9657 ++
9658 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
9659 ++{
9660 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9661 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9662 ++
9663 ++ assert_spin_locked(bfqd->queue->queue_lock);
9664 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
9665 ++
9666 ++ bfq_add_rq_rb(rq);
9667 ++
9668 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
9669 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
9670 ++
9671 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
9672 ++}
9673 ++
9674 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
9675 ++{
9676 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
9677 ++ bfqd->rq_in_driver);
9678 ++
9679 ++ if (bfqd->hw_tag == 1)
9680 ++ return;
9681 ++
9682 ++ /*
9683 ++ * This sample is valid if the number of outstanding requests
9684 ++ * is large enough to allow a queueing behavior. Note that the
9685 ++ * sum is not exact, as it's not taking into account deactivated
9686 ++ * requests.
9687 ++ */
9688 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
9689 ++ return;
9690 ++
9691 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
9692 ++ return;
9693 ++
9694 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
9695 ++ bfqd->max_rq_in_driver = 0;
9696 ++ bfqd->hw_tag_samples = 0;
9697 ++}
9698 ++
9699 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
9700 ++{
9701 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9702 ++ struct bfq_data *bfqd = bfqq->bfqd;
9703 ++ const int sync = rq_is_sync(rq);
9704 ++
9705 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
9706 ++ blk_rq_sectors(rq), sync);
9707 ++
9708 ++ bfq_update_hw_tag(bfqd);
9709 ++
9710 ++ WARN_ON(!bfqd->rq_in_driver);
9711 ++ WARN_ON(!bfqq->dispatched);
9712 ++ bfqd->rq_in_driver--;
9713 ++ bfqq->dispatched--;
9714 ++
9715 ++ if (bfq_bfqq_sync(bfqq))
9716 ++ bfqd->sync_flight--;
9717 ++
9718 ++ if (sync)
9719 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
9720 ++
9721 ++ /*
9722 ++ * If we are waiting to discover whether the request pattern of the
9723 ++ * task associated with the queue is actually isochronous, and
9724 ++ * both requisites for this condition to hold are satisfied, then
9725 ++ * compute soft_rt_next_start (see the comments to the function
9726 ++ * bfq_bfqq_softrt_next_start()).
9727 ++ */
9728 ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
9729 ++ RB_EMPTY_ROOT(&bfqq->sort_list))
9730 ++ bfqq->soft_rt_next_start =
9731 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
9732 ++
9733 ++ /*
9734 ++ * If this is the in-service queue, check if it needs to be expired,
9735 ++ * or if we want to idle in case it has no pending requests.
9736 ++ */
9737 ++ if (bfqd->in_service_queue == bfqq) {
9738 ++ if (bfq_bfqq_budget_new(bfqq))
9739 ++ bfq_set_budget_timeout(bfqd);
9740 ++
9741 ++ if (bfq_bfqq_must_idle(bfqq)) {
9742 ++ bfq_arm_slice_timer(bfqd);
9743 ++ goto out;
9744 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
9745 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9746 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
9747 ++ (bfqq->dispatched == 0 ||
9748 ++ !bfq_bfqq_must_not_expire(bfqq)))
9749 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
9750 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
9751 ++ }
9752 ++
9753 ++ if (!bfqd->rq_in_driver)
9754 ++ bfq_schedule_dispatch(bfqd);
9755 ++
9756 ++out:
9757 ++ return;
9758 ++}
9759 ++
9760 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
9761 ++{
9762 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
9763 ++ bfq_clear_bfqq_must_alloc(bfqq);
9764 ++ return ELV_MQUEUE_MUST;
9765 ++ }
9766 ++
9767 ++ return ELV_MQUEUE_MAY;
9768 ++}
9769 ++
9770 ++static int bfq_may_queue(struct request_queue *q, int rw)
9771 ++{
9772 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9773 ++ struct task_struct *tsk = current;
9774 ++ struct bfq_io_cq *bic;
9775 ++ struct bfq_queue *bfqq;
9776 ++
9777 ++ /*
9778 ++ * Don't force setup of a queue from here, as a call to may_queue
9779 ++ * does not necessarily imply that a request actually will be queued.
9780 ++ * So just lookup a possibly existing queue, or return 'may queue'
9781 ++ * if that fails.
9782 ++ */
9783 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
9784 ++ if (bic == NULL)
9785 ++ return ELV_MQUEUE_MAY;
9786 ++
9787 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
9788 ++ if (bfqq != NULL) {
9789 ++ bfq_init_prio_data(bfqq, bic);
9790 ++
9791 ++ return __bfq_may_queue(bfqq);
9792 ++ }
9793 ++
9794 ++ return ELV_MQUEUE_MAY;
9795 ++}
9796 ++
9797 ++/*
9798 ++ * Queue lock held here.
9799 ++ */
9800 ++static void bfq_put_request(struct request *rq)
9801 ++{
9802 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9803 ++
9804 ++ if (bfqq != NULL) {
9805 ++ const int rw = rq_data_dir(rq);
9806 ++
9807 ++ BUG_ON(!bfqq->allocated[rw]);
9808 ++ bfqq->allocated[rw]--;
9809 ++
9810 ++ rq->elv.priv[0] = NULL;
9811 ++ rq->elv.priv[1] = NULL;
9812 ++
9813 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
9814 ++ bfqq, atomic_read(&bfqq->ref));
9815 ++ bfq_put_queue(bfqq);
9816 ++ }
9817 ++}
9818 ++
9819 ++static struct bfq_queue *
9820 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
9821 ++ struct bfq_queue *bfqq)
9822 ++{
9823 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
9824 ++ (long unsigned)bfqq->new_bfqq->pid);
9825 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
9826 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
9827 ++ bfq_put_queue(bfqq);
9828 ++ return bic_to_bfqq(bic, 1);
9829 ++}
9830 ++
9831 ++/*
9832 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
9833 ++ * was the last process referring to said bfqq.
9834 ++ */
9835 ++static struct bfq_queue *
9836 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
9837 ++{
9838 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
9839 ++ if (bfqq_process_refs(bfqq) == 1) {
9840 ++ bfqq->pid = current->pid;
9841 ++ bfq_clear_bfqq_coop(bfqq);
9842 ++ bfq_clear_bfqq_split_coop(bfqq);
9843 ++ return bfqq;
9844 ++ }
9845 ++
9846 ++ bic_set_bfqq(bic, NULL, 1);
9847 ++
9848 ++ bfq_put_cooperator(bfqq);
9849 ++
9850 ++ bfq_put_queue(bfqq);
9851 ++ return NULL;
9852 ++}
9853 ++
9854 ++/*
9855 ++ * Allocate bfq data structures associated with this request.
9856 ++ */
9857 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
9858 ++ struct bio *bio, gfp_t gfp_mask)
9859 ++{
9860 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9861 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
9862 ++ const int rw = rq_data_dir(rq);
9863 ++ const int is_sync = rq_is_sync(rq);
9864 ++ struct bfq_queue *bfqq;
9865 ++ struct bfq_group *bfqg;
9866 ++ unsigned long flags;
9867 ++
9868 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
9869 ++
9870 ++ bfq_changed_ioprio(bic);
9871 ++
9872 ++ spin_lock_irqsave(q->queue_lock, flags);
9873 ++
9874 ++ if (bic == NULL)
9875 ++ goto queue_fail;
9876 ++
9877 ++ bfqg = bfq_bic_update_cgroup(bic);
9878 ++
9879 ++new_queue:
9880 ++ bfqq = bic_to_bfqq(bic, is_sync);
9881 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9882 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9883 ++ bic_set_bfqq(bic, bfqq, is_sync);
9884 ++ } else {
9885 ++ /*
9886 ++ * If the queue was seeky for too long, break it apart.
9887 ++ */
9888 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
9889 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
9890 ++ bfqq = bfq_split_bfqq(bic, bfqq);
9891 ++ if (!bfqq)
9892 ++ goto new_queue;
9893 ++ }
9894 ++
9895 ++ /*
9896 ++ * Check to see if this queue is scheduled to merge with
9897 ++ * another closely cooperating queue. The merging of queues
9898 ++ * happens here as it must be done in process context.
9899 ++ * The reference on new_bfqq was taken in merge_bfqqs.
9900 ++ */
9901 ++ if (bfqq->new_bfqq != NULL)
9902 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
9903 ++ }
9904 ++
9905 ++ bfqq->allocated[rw]++;
9906 ++ atomic_inc(&bfqq->ref);
9907 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
9908 ++ atomic_read(&bfqq->ref));
9909 ++
9910 ++ rq->elv.priv[0] = bic;
9911 ++ rq->elv.priv[1] = bfqq;
9912 ++
9913 ++ spin_unlock_irqrestore(q->queue_lock, flags);
9914 ++
9915 ++ return 0;
9916 ++
9917 ++queue_fail:
9918 ++ bfq_schedule_dispatch(bfqd);
9919 ++ spin_unlock_irqrestore(q->queue_lock, flags);
9920 ++
9921 ++ return 1;
9922 ++}
9923 ++
9924 ++static void bfq_kick_queue(struct work_struct *work)
9925 ++{
9926 ++ struct bfq_data *bfqd =
9927 ++ container_of(work, struct bfq_data, unplug_work);
9928 ++ struct request_queue *q = bfqd->queue;
9929 ++
9930 ++ spin_lock_irq(q->queue_lock);
9931 ++ __blk_run_queue(q);
9932 ++ spin_unlock_irq(q->queue_lock);
9933 ++}
9934 ++
9935 ++/*
9936 ++ * Handler of the expiration of the timer running if the in-service queue
9937 ++ * is idling inside its time slice.
9938 ++ */
9939 ++static void bfq_idle_slice_timer(unsigned long data)
9940 ++{
9941 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
9942 ++ struct bfq_queue *bfqq;
9943 ++ unsigned long flags;
9944 ++ enum bfqq_expiration reason;
9945 ++
9946 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
9947 ++
9948 ++ bfqq = bfqd->in_service_queue;
9949 ++ /*
9950 ++ * Theoretical race here: the in-service queue can be NULL or different
9951 ++ * from the queue that was idling if the timer handler spins on
9952 ++ * the queue_lock and a new request arrives for the current
9953 ++ * queue and there is a full dispatch cycle that changes the
9954 ++ * in-service queue. This can hardly happen, but in the worst case
9955 ++ * we just expire a queue too early.
9956 ++ */
9957 ++ if (bfqq != NULL) {
9958 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
9959 ++ if (bfq_bfqq_budget_timeout(bfqq))
9960 ++ /*
9961 ++ * Also here the queue can be safely expired
9962 ++ * for budget timeout without wasting
9963 ++ * guarantees
9964 ++ */
9965 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9966 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
9967 ++ /*
9968 ++ * The queue may not be empty upon timer expiration,
9969 ++ * because we may not disable the timer when the first
9970 ++ * request of the in-service queue arrives during
9971 ++ * disk idling
9972 ++ */
9973 ++ reason = BFQ_BFQQ_TOO_IDLE;
9974 ++ else
9975 ++ goto schedule_dispatch;
9976 ++
9977 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
9978 ++ }
9979 ++
9980 ++schedule_dispatch:
9981 ++ bfq_schedule_dispatch(bfqd);
9982 ++
9983 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
9984 ++}
9985 ++
9986 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
9987 ++{
9988 ++ del_timer_sync(&bfqd->idle_slice_timer);
9989 ++ cancel_work_sync(&bfqd->unplug_work);
9990 ++}
9991 ++
9992 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
9993 ++ struct bfq_queue **bfqq_ptr)
9994 ++{
9995 ++ struct bfq_group *root_group = bfqd->root_group;
9996 ++ struct bfq_queue *bfqq = *bfqq_ptr;
9997 ++
9998 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
9999 ++ if (bfqq != NULL) {
10000 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
10001 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
10002 ++ bfqq, atomic_read(&bfqq->ref));
10003 ++ bfq_put_queue(bfqq);
10004 ++ *bfqq_ptr = NULL;
10005 ++ }
10006 ++}
10007 ++
10008 ++/*
10009 ++ * Release all the bfqg references to its async queues. If we are
10010 ++ * deallocating the group these queues may still contain requests, so
10011 ++ * we reparent them to the root cgroup (i.e., the only one that will
10012 ++ * exist for sure until all the requests on a device are gone).
10013 ++ */
10014 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
10015 ++{
10016 ++ int i, j;
10017 ++
10018 ++ for (i = 0; i < 2; i++)
10019 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
10020 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
10021 ++
10022 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
10023 ++}
10024 ++
10025 ++static void bfq_exit_queue(struct elevator_queue *e)
10026 ++{
10027 ++ struct bfq_data *bfqd = e->elevator_data;
10028 ++ struct request_queue *q = bfqd->queue;
10029 ++ struct bfq_queue *bfqq, *n;
10030 ++
10031 ++ bfq_shutdown_timer_wq(bfqd);
10032 ++
10033 ++ spin_lock_irq(q->queue_lock);
10034 ++
10035 ++ BUG_ON(bfqd->in_service_queue != NULL);
10036 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
10037 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
10038 ++
10039 ++ bfq_disconnect_groups(bfqd);
10040 ++ spin_unlock_irq(q->queue_lock);
10041 ++
10042 ++ bfq_shutdown_timer_wq(bfqd);
10043 ++
10044 ++ synchronize_rcu();
10045 ++
10046 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10047 ++
10048 ++ bfq_free_root_group(bfqd);
10049 ++ kfree(bfqd);
10050 ++}
10051 ++
10052 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
10053 ++{
10054 ++ struct bfq_group *bfqg;
10055 ++ struct bfq_data *bfqd;
10056 ++ struct elevator_queue *eq;
10057 ++
10058 ++ eq = elevator_alloc(q, e);
10059 ++ if (eq == NULL)
10060 ++ return -ENOMEM;
10061 ++
10062 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
10063 ++ if (bfqd == NULL) {
10064 ++ kobject_put(&eq->kobj);
10065 ++ return -ENOMEM;
10066 ++ }
10067 ++ eq->elevator_data = bfqd;
10068 ++
10069 ++ /*
10070 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
10071 ++ * Grab a permanent reference to it, so that the normal code flow
10072 ++ * will not attempt to free it.
10073 ++ */
10074 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
10075 ++ atomic_inc(&bfqd->oom_bfqq.ref);
10076 ++
10077 ++ bfqd->queue = q;
10078 ++
10079 ++ spin_lock_irq(q->queue_lock);
10080 ++ q->elevator = eq;
10081 ++ spin_unlock_irq(q->queue_lock);
10082 ++
10083 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
10084 ++ if (bfqg == NULL) {
10085 ++ kfree(bfqd);
10086 ++ kobject_put(&eq->kobj);
10087 ++ return -ENOMEM;
10088 ++ }
10089 ++
10090 ++ bfqd->root_group = bfqg;
10091 ++
10092 ++ init_timer(&bfqd->idle_slice_timer);
10093 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10094 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10095 ++
10096 ++ bfqd->rq_pos_tree = RB_ROOT;
10097 ++
10098 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10099 ++
10100 ++ INIT_LIST_HEAD(&bfqd->active_list);
10101 ++ INIT_LIST_HEAD(&bfqd->idle_list);
10102 ++
10103 ++ bfqd->hw_tag = -1;
10104 ++
10105 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
10106 ++
10107 ++ bfqd->bfq_quantum = bfq_quantum;
10108 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10109 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10110 ++ bfqd->bfq_back_max = bfq_back_max;
10111 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
10112 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
10113 ++ bfqd->bfq_class_idle_last_service = 0;
10114 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10115 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10116 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10117 ++
10118 ++ bfqd->low_latency = true;
10119 ++
10120 ++ bfqd->bfq_raising_coeff = 20;
10121 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10122 ++ bfqd->bfq_raising_max_time = 0;
10123 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10124 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10125 ++ bfqd->bfq_raising_max_softrt_rate = 7000; /*
10126 ++ * Approximate rate required
10127 ++ * to playback or record a
10128 ++ * high-definition compressed
10129 ++ * video.
10130 ++ */
10131 ++ bfqd->raised_busy_queues = 0;
10132 ++
10133 ++ /* Initially estimate the device's peak rate as the reference rate */
10134 ++ if (blk_queue_nonrot(bfqd->queue)) {
10135 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
10136 ++ bfqd->peak_rate = R_nonrot;
10137 ++ } else {
10138 ++ bfqd->RT_prod = R_rot * T_rot;
10139 ++ bfqd->peak_rate = R_rot;
10140 ++ }
10141 ++
10142 ++ return 0;
10143 ++}
10144 ++
10145 ++static void bfq_slab_kill(void)
10146 ++{
10147 ++ if (bfq_pool != NULL)
10148 ++ kmem_cache_destroy(bfq_pool);
10149 ++}
10150 ++
10151 ++static int __init bfq_slab_setup(void)
10152 ++{
10153 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10154 ++ if (bfq_pool == NULL)
10155 ++ return -ENOMEM;
10156 ++ return 0;
10157 ++}
10158 ++
10159 ++static ssize_t bfq_var_show(unsigned int var, char *page)
10160 ++{
10161 ++ return sprintf(page, "%d\n", var);
10162 ++}
10163 ++
10164 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10165 ++{
10166 ++ unsigned long new_val;
10167 ++ int ret = kstrtoul(page, 10, &new_val);
10168 ++
10169 ++ if (ret == 0)
10170 ++ *var = new_val;
10171 ++
10172 ++ return count;
10173 ++}
10174 ++
10175 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10176 ++{
10177 ++ struct bfq_data *bfqd = e->elevator_data;
10178 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10179 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10180 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10181 ++}
10182 ++
10183 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10184 ++{
10185 ++ struct bfq_queue *bfqq;
10186 ++ struct bfq_data *bfqd = e->elevator_data;
10187 ++ ssize_t num_char = 0;
10188 ++
10189 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
10190 ++ bfqd->queued);
10191 ++
10192 ++ spin_lock_irq(bfqd->queue->queue_lock);
10193 ++
10194 ++ num_char += sprintf(page + num_char, "Active:\n");
10195 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
10196 ++ num_char += sprintf(page + num_char,
10197 ++ "pid%d: weight %hu, nr_queued %d %d,"
10198 ++ " dur %d/%u\n",
10199 ++ bfqq->pid,
10200 ++ bfqq->entity.weight,
10201 ++ bfqq->queued[0],
10202 ++ bfqq->queued[1],
10203 ++ jiffies_to_msecs(jiffies -
10204 ++ bfqq->last_rais_start_finish),
10205 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
10206 ++ }
10207 ++
10208 ++ num_char += sprintf(page + num_char, "Idle:\n");
10209 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
10210 ++ num_char += sprintf(page + num_char,
10211 ++ "pid%d: weight %hu, dur %d/%u\n",
10212 ++ bfqq->pid,
10213 ++ bfqq->entity.weight,
10214 ++ jiffies_to_msecs(jiffies -
10215 ++ bfqq->last_rais_start_finish),
10216 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
10217 ++ }
10218 ++
10219 ++ spin_unlock_irq(bfqd->queue->queue_lock);
10220 ++
10221 ++ return num_char;
10222 ++}
10223 ++
10224 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
10225 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
10226 ++{ \
10227 ++ struct bfq_data *bfqd = e->elevator_data; \
10228 ++ unsigned int __data = __VAR; \
10229 ++ if (__CONV) \
10230 ++ __data = jiffies_to_msecs(__data); \
10231 ++ return bfq_var_show(__data, (page)); \
10232 ++}
10233 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
10234 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
10235 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
10236 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
10237 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
10238 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
10239 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
10240 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
10241 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
10242 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
10243 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
10244 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
10245 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
10246 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
10247 ++ 1);
10248 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
10249 ++ bfqd->bfq_raising_min_inter_arr_async,
10250 ++ 1);
10251 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
10252 ++ bfqd->bfq_raising_max_softrt_rate, 0);
10253 ++#undef SHOW_FUNCTION
10254 ++
10255 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
10256 ++static ssize_t \
10257 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
10258 ++{ \
10259 ++ struct bfq_data *bfqd = e->elevator_data; \
10260 ++ unsigned long uninitialized_var(__data); \
10261 ++ int ret = bfq_var_store(&__data, (page), count); \
10262 ++ if (__data < (MIN)) \
10263 ++ __data = (MIN); \
10264 ++ else if (__data > (MAX)) \
10265 ++ __data = (MAX); \
10266 ++ if (__CONV) \
10267 ++ *(__PTR) = msecs_to_jiffies(__data); \
10268 ++ else \
10269 ++ *(__PTR) = __data; \
10270 ++ return ret; \
10271 ++}
10272 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
10273 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
10274 ++ INT_MAX, 1);
10275 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
10276 ++ INT_MAX, 1);
10277 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
10278 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
10279 ++ INT_MAX, 0);
10280 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
10281 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
10282 ++ 1, INT_MAX, 0);
10283 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
10284 ++ INT_MAX, 1);
10285 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
10286 ++ INT_MAX, 0);
10287 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
10288 ++ INT_MAX, 1);
10289 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
10290 ++ INT_MAX, 1);
10291 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
10292 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
10293 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
10294 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
10295 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
10296 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
10297 ++#undef STORE_FUNCTION
10298 ++
10299 ++/* do nothing for the moment */
10300 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
10301 ++ const char *page, size_t count)
10302 ++{
10303 ++ return count;
10304 ++}
10305 ++
10306 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
10307 ++{
10308 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
10309 ++
10310 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
10311 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
10312 ++ else
10313 ++ return bfq_default_max_budget;
10314 ++}
10315 ++
10316 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
10317 ++ const char *page, size_t count)
10318 ++{
10319 ++ struct bfq_data *bfqd = e->elevator_data;
10320 ++ unsigned long uninitialized_var(__data);
10321 ++ int ret = bfq_var_store(&__data, (page), count);
10322 ++
10323 ++ if (__data == 0)
10324 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10325 ++ else {
10326 ++ if (__data > INT_MAX)
10327 ++ __data = INT_MAX;
10328 ++ bfqd->bfq_max_budget = __data;
10329 ++ }
10330 ++
10331 ++ bfqd->bfq_user_max_budget = __data;
10332 ++
10333 ++ return ret;
10334 ++}
10335 ++
10336 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
10337 ++ const char *page, size_t count)
10338 ++{
10339 ++ struct bfq_data *bfqd = e->elevator_data;
10340 ++ unsigned long uninitialized_var(__data);
10341 ++ int ret = bfq_var_store(&__data, (page), count);
10342 ++
10343 ++ if (__data < 1)
10344 ++ __data = 1;
10345 ++ else if (__data > INT_MAX)
10346 ++ __data = INT_MAX;
10347 ++
10348 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
10349 ++ if (bfqd->bfq_user_max_budget == 0)
10350 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10351 ++
10352 ++ return ret;
10353 ++}
10354 ++
10355 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
10356 ++ const char *page, size_t count)
10357 ++{
10358 ++ struct bfq_data *bfqd = e->elevator_data;
10359 ++ unsigned long uninitialized_var(__data);
10360 ++ int ret = bfq_var_store(&__data, (page), count);
10361 ++
10362 ++ if (__data > 1)
10363 ++ __data = 1;
10364 ++ if (__data == 0 && bfqd->low_latency != 0)
10365 ++ bfq_end_raising(bfqd);
10366 ++ bfqd->low_latency = __data;
10367 ++
10368 ++ return ret;
10369 ++}
10370 ++
10371 ++#define BFQ_ATTR(name) \
10372 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
10373 ++
10374 ++static struct elv_fs_entry bfq_attrs[] = {
10375 ++ BFQ_ATTR(quantum),
10376 ++ BFQ_ATTR(fifo_expire_sync),
10377 ++ BFQ_ATTR(fifo_expire_async),
10378 ++ BFQ_ATTR(back_seek_max),
10379 ++ BFQ_ATTR(back_seek_penalty),
10380 ++ BFQ_ATTR(slice_idle),
10381 ++ BFQ_ATTR(max_budget),
10382 ++ BFQ_ATTR(max_budget_async_rq),
10383 ++ BFQ_ATTR(timeout_sync),
10384 ++ BFQ_ATTR(timeout_async),
10385 ++ BFQ_ATTR(low_latency),
10386 ++ BFQ_ATTR(raising_coeff),
10387 ++ BFQ_ATTR(raising_max_time),
10388 ++ BFQ_ATTR(raising_rt_max_time),
10389 ++ BFQ_ATTR(raising_min_idle_time),
10390 ++ BFQ_ATTR(raising_min_inter_arr_async),
10391 ++ BFQ_ATTR(raising_max_softrt_rate),
10392 ++ BFQ_ATTR(weights),
10393 ++ __ATTR_NULL
10394 ++};
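
Each entry above becomes a sysfs attribute, so once BFQ is the active elevator for a device the tunables appear under /sys/block/<dev>/queue/iosched/ and can be read or written like regular files. A small userspace sketch (the device name sda is an assumption made only for illustration):

    /* Editor's usage sketch: read one BFQ tunable through sysfs. */
    #include <stdio.h>

    int main(void)
    {
        const char *path = "/sys/block/sda/queue/iosched/low_latency";
        char buf[16];
        FILE *f = fopen(path, "r");

        if (f == NULL) {
            perror(path);
            return 1;
        }
        if (fgets(buf, sizeof(buf), f) != NULL)
            printf("low_latency = %s", buf);
        fclose(f);
        return 0;
    }

Writing is symmetric: storing 0 in low_latency goes through bfq_low_latency_store() above, which also ends any weight raising currently in progress.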
10395 ++
10396 ++static struct elevator_type iosched_bfq = {
10397 ++ .ops = {
10398 ++ .elevator_merge_fn = bfq_merge,
10399 ++ .elevator_merged_fn = bfq_merged_request,
10400 ++ .elevator_merge_req_fn = bfq_merged_requests,
10401 ++ .elevator_allow_merge_fn = bfq_allow_merge,
10402 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
10403 ++ .elevator_add_req_fn = bfq_insert_request,
10404 ++ .elevator_activate_req_fn = bfq_activate_request,
10405 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
10406 ++ .elevator_completed_req_fn = bfq_completed_request,
10407 ++ .elevator_former_req_fn = elv_rb_former_request,
10408 ++ .elevator_latter_req_fn = elv_rb_latter_request,
10409 ++ .elevator_init_icq_fn = bfq_init_icq,
10410 ++ .elevator_exit_icq_fn = bfq_exit_icq,
10411 ++ .elevator_set_req_fn = bfq_set_request,
10412 ++ .elevator_put_req_fn = bfq_put_request,
10413 ++ .elevator_may_queue_fn = bfq_may_queue,
10414 ++ .elevator_init_fn = bfq_init_queue,
10415 ++ .elevator_exit_fn = bfq_exit_queue,
10416 ++ },
10417 ++ .icq_size = sizeof(struct bfq_io_cq),
10418 ++ .icq_align = __alignof__(struct bfq_io_cq),
10419 ++ .elevator_attrs = bfq_attrs,
10420 ++ .elevator_name = "bfq",
10421 ++ .elevator_owner = THIS_MODULE,
10422 ++};
10423 ++
10424 ++static int __init bfq_init(void)
10425 ++{
10426 ++ /*
10427 ++ * Can be 0 on HZ < 1000 setups.
10428 ++ */
10429 ++ if (bfq_slice_idle == 0)
10430 ++ bfq_slice_idle = 1;
10431 ++
10432 ++ if (bfq_timeout_async == 0)
10433 ++ bfq_timeout_async = 1;
10434 ++
10435 ++ if (bfq_slab_setup())
10436 ++ return -ENOMEM;
10437 ++
10438 ++ elv_register(&iosched_bfq);
10439 ++ pr_info("BFQ I/O-scheduler version: v7r2\n");
10440 ++
10441 ++ return 0;
10442 ++}
10443 ++
10444 ++static void __exit bfq_exit(void)
10445 ++{
10446 ++ elv_unregister(&iosched_bfq);
10447 ++ bfq_slab_kill();
10448 ++}
10449 ++
10450 ++module_init(bfq_init);
10451 ++module_exit(bfq_exit);
10452 ++
10453 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
10454 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
10455 +new file mode 100644
10456 +index 0000000..999b475
10457 +--- /dev/null
10458 ++++ b/block/bfq-sched.c
10459 +@@ -0,0 +1,1078 @@
10460 ++/*
10461 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
10462 ++ *
10463 ++ * Based on ideas and code from CFQ:
10464 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
10465 ++ *
10466 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
10467 ++ * Paolo Valente <paolo.valente@×××××××.it>
10468 ++ *
10469 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
10470 ++ */
10471 ++
10472 ++#ifdef CONFIG_CGROUP_BFQIO
10473 ++#define for_each_entity(entity) \
10474 ++ for (; entity != NULL; entity = entity->parent)
10475 ++
10476 ++#define for_each_entity_safe(entity, parent) \
10477 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
10478 ++
10479 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
10480 ++ int extract,
10481 ++ struct bfq_data *bfqd);
10482 ++
10483 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10484 ++{
10485 ++ struct bfq_entity *bfqg_entity;
10486 ++ struct bfq_group *bfqg;
10487 ++ struct bfq_sched_data *group_sd;
10488 ++
10489 ++ BUG_ON(next_in_service == NULL);
10490 ++
10491 ++ group_sd = next_in_service->sched_data;
10492 ++
10493 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
10494 ++ /*
10495 ++ * bfq_group's my_entity field is not NULL only if the group
10496 ++ * is not the root group. We must not touch the root entity
10497 ++ * as it must never become an in-service entity.
10498 ++ */
10499 ++ bfqg_entity = bfqg->my_entity;
10500 ++ if (bfqg_entity != NULL)
10501 ++ bfqg_entity->budget = next_in_service->budget;
10502 ++}
10503 ++
10504 ++static int bfq_update_next_in_service(struct bfq_sched_data *sd)
10505 ++{
10506 ++ struct bfq_entity *next_in_service;
10507 ++
10508 ++ if (sd->in_service_entity != NULL)
10509 ++ /* will update/requeue at the end of service */
10510 ++ return 0;
10511 ++
10512 ++ /*
10513 ++ * NOTE: this can be improved in many ways, such as returning
10514 ++ * 1 (and thus propagating upwards the update) only when the
10515 ++ * budget changes, or caching the bfqq that will be scheduled
10516 ++ * next from this subtree. For now we worry more about
10517 ++ * correctness than about performance...
10518 ++ */
10519 ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
10520 ++ sd->next_in_service = next_in_service;
10521 ++
10522 ++ if (next_in_service != NULL)
10523 ++ bfq_update_budget(next_in_service);
10524 ++
10525 ++ return 1;
10526 ++}
10527 ++
10528 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10529 ++ struct bfq_entity *entity)
10530 ++{
10531 ++ BUG_ON(sd->next_in_service != entity);
10532 ++}
10533 ++#else
10534 ++#define for_each_entity(entity) \
10535 ++ for (; entity != NULL; entity = NULL)
10536 ++
10537 ++#define for_each_entity_safe(entity, parent) \
10538 ++ for (parent = NULL; entity != NULL; entity = parent)
10539 ++
10540 ++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
10541 ++{
10542 ++ return 0;
10543 ++}
10544 ++
10545 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10546 ++ struct bfq_entity *entity)
10547 ++{
10548 ++}
10549 ++
10550 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10551 ++{
10552 ++}
10553 ++#endif
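
The two definitions of for_each_entity() above are the heart of the hierarchical/flat split: with CONFIG_CGROUP_BFQIO the macro climbs the parent chain of group entities, without it the loop body runs exactly once for the leaf queue. A toy userspace sketch of the two walks (struct and names invented for illustration):

    /* Editor's sketch: hierarchical vs. flat entity walk. */
    #include <stdio.h>

    struct toy_entity {
        const char *name;
        struct toy_entity *parent;
    };

    #define for_each_entity_hier(entity) \
        for (; entity != NULL; entity = entity->parent)

    #define for_each_entity_flat(entity) \
        for (; entity != NULL; entity = NULL)

    int main(void)
    {
        struct toy_entity root = { "root group", NULL };
        struct toy_entity group = { "cgroup", &root };
        struct toy_entity queue = { "bfq_queue", &group };
        struct toy_entity *e;

        e = &queue;
        for_each_entity_hier(e)
            printf("hierarchical: %s\n", e->name);   /* queue, cgroup, root */

        e = &queue;
        for_each_entity_flat(e)
            printf("flat: %s\n", e->name);           /* queue only */

        return 0;
    }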
10554 ++
10555 ++/*
10556 ++ * Shift for timestamp calculations. This actually limits the maximum
10557 ++ * service allowed in one timestamp delta (small shift values increase it),
10558 ++ * the maximum total weight that can be used for the queues in the system
10559 ++ * (big shift values increase it), and the period of virtual time wraparounds.
10560 ++ */
10561 ++#define WFQ_SERVICE_SHIFT 22
10562 ++
10563 ++/**
10564 ++ * bfq_gt - compare two timestamps.
10565 ++ * @a: first ts.
10566 ++ * @b: second ts.
10567 ++ *
10568 ++ * Return @a > @b, dealing with wrapping correctly.
10569 ++ */
10570 ++static inline int bfq_gt(u64 a, u64 b)
10571 ++{
10572 ++ return (s64)(a - b) > 0;
10573 ++}
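
The cast to a signed 64-bit difference is what makes the comparison robust to virtual-time wraparound: a timestamp that has just wrapped past zero still compares as later than one taken shortly before the wrap. A minimal sketch:

    /* Editor's illustration of the wrap-safe timestamp comparison above. */
    #include <stdio.h>
    #include <stdint.h>

    static int bfq_gt(uint64_t a, uint64_t b)
    {
        return (int64_t)(a - b) > 0;
    }

    int main(void)
    {
        uint64_t before_wrap = UINT64_MAX - 10;  /* just before wraparound */
        uint64_t after_wrap = 5;                 /* 16 ticks later, after the wrap */

        printf("naive  a > b: %d\n", after_wrap > before_wrap);        /* 0: wrong */
        printf("bfq_gt(a, b): %d\n", bfq_gt(after_wrap, before_wrap)); /* 1: right */
        return 0;
    }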
10574 ++
10575 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
10576 ++{
10577 ++ struct bfq_queue *bfqq = NULL;
10578 ++
10579 ++ BUG_ON(entity == NULL);
10580 ++
10581 ++ if (entity->my_sched_data == NULL)
10582 ++ bfqq = container_of(entity, struct bfq_queue, entity);
10583 ++
10584 ++ return bfqq;
10585 ++}
10586 ++
10587 ++
10588 ++/**
10589 ++ * bfq_delta - map service into the virtual time domain.
10590 ++ * @service: amount of service.
10591 ++ * @weight: scale factor (weight of an entity or weight sum).
10592 ++ */
10593 ++static inline u64 bfq_delta(unsigned long service,
10594 ++ unsigned long weight)
10595 ++{
10596 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
10597 ++
10598 ++ do_div(d, weight);
10599 ++ return d;
10600 ++}
10601 ++
10602 ++/**
10603 ++ * bfq_calc_finish - assign the finish time to an entity.
10604 ++ * @entity: the entity to act upon.
10605 ++ * @service: the service to be charged to the entity.
10606 ++ */
10607 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
10608 ++ unsigned long service)
10609 ++{
10610 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10611 ++
10612 ++ BUG_ON(entity->weight == 0);
10613 ++
10614 ++ entity->finish = entity->start +
10615 ++ bfq_delta(service, entity->weight);
10616 ++
10617 ++ if (bfqq != NULL) {
10618 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
10619 ++ "calc_finish: serv %lu, w %d",
10620 ++ service, entity->weight);
10621 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
10622 ++ "calc_finish: start %llu, finish %llu, delta %llu",
10623 ++ entity->start, entity->finish,
10624 ++ bfq_delta(service, entity->weight));
10625 ++ }
10626 ++}
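
Putting the two helpers together: a finish timestamp is start + (service << WFQ_SERVICE_SHIFT) / weight, so for the same budget a heavier entity receives an earlier finish time and is therefore selected more often. A worked userspace example (the budget value is arbitrary, chosen only for illustration; do_div() is replaced by plain 64-bit division):

    /* Editor's worked example of the WF2Q+ timestamp math above. */
    #include <stdio.h>
    #include <stdint.h>

    #define WFQ_SERVICE_SHIFT 22

    static uint64_t bfq_delta(unsigned long service, unsigned long weight)
    {
        return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
    }

    int main(void)
    {
        unsigned long budget = 16384;  /* sectors, illustrative value */
        uint64_t start = 0;

        printf("weight 1: finish = %llu\n",
               (unsigned long long)(start + bfq_delta(budget, 1)));
        printf("weight 4: finish = %llu\n",
               (unsigned long long)(start + bfq_delta(budget, 4)));
        return 0;
    }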
10627 ++
10628 ++/**
10629 ++ * bfq_entity_of - get an entity from a node.
10630 ++ * @node: the node field of the entity.
10631 ++ *
10632 ++ * Convert a node pointer to the corresponding entity. This is used only
10633 ++ * to simplify the logic of some functions and not as the generic
10634 ++ * conversion mechanism because, e.g., in the tree walking functions,
10635 ++ * the check for a %NULL value would be redundant.
10636 ++ */
10637 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
10638 ++{
10639 ++ struct bfq_entity *entity = NULL;
10640 ++
10641 ++ if (node != NULL)
10642 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
10643 ++
10644 ++ return entity;
10645 ++}
10646 ++
10647 ++/**
10648 ++ * bfq_extract - remove an entity from a tree.
10649 ++ * @root: the tree root.
10650 ++ * @entity: the entity to remove.
10651 ++ */
10652 ++static inline void bfq_extract(struct rb_root *root,
10653 ++ struct bfq_entity *entity)
10654 ++{
10655 ++ BUG_ON(entity->tree != root);
10656 ++
10657 ++ entity->tree = NULL;
10658 ++ rb_erase(&entity->rb_node, root);
10659 ++}
10660 ++
10661 ++/**
10662 ++ * bfq_idle_extract - extract an entity from the idle tree.
10663 ++ * @st: the service tree of the owning @entity.
10664 ++ * @entity: the entity being removed.
10665 ++ */
10666 ++static void bfq_idle_extract(struct bfq_service_tree *st,
10667 ++ struct bfq_entity *entity)
10668 ++{
10669 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10670 ++ struct rb_node *next;
10671 ++
10672 ++ BUG_ON(entity->tree != &st->idle);
10673 ++
10674 ++ if (entity == st->first_idle) {
10675 ++ next = rb_next(&entity->rb_node);
10676 ++ st->first_idle = bfq_entity_of(next);
10677 ++ }
10678 ++
10679 ++ if (entity == st->last_idle) {
10680 ++ next = rb_prev(&entity->rb_node);
10681 ++ st->last_idle = bfq_entity_of(next);
10682 ++ }
10683 ++
10684 ++ bfq_extract(&st->idle, entity);
10685 ++
10686 ++ if (bfqq != NULL)
10687 ++ list_del(&bfqq->bfqq_list);
10688 ++}
10689 ++
10690 ++/**
10691 ++ * bfq_insert - generic tree insertion.
10692 ++ * @root: tree root.
10693 ++ * @entity: entity to insert.
10694 ++ *
10695 ++ * This is used for the idle and the active tree, since they are both
10696 ++ * ordered by finish time.
10697 ++ */
10698 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
10699 ++{
10700 ++ struct bfq_entity *entry;
10701 ++ struct rb_node **node = &root->rb_node;
10702 ++ struct rb_node *parent = NULL;
10703 ++
10704 ++ BUG_ON(entity->tree != NULL);
10705 ++
10706 ++ while (*node != NULL) {
10707 ++ parent = *node;
10708 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
10709 ++
10710 ++ if (bfq_gt(entry->finish, entity->finish))
10711 ++ node = &parent->rb_left;
10712 ++ else
10713 ++ node = &parent->rb_right;
10714 ++ }
10715 ++
10716 ++ rb_link_node(&entity->rb_node, parent, node);
10717 ++ rb_insert_color(&entity->rb_node, root);
10718 ++
10719 ++ entity->tree = root;
10720 ++}
10721 ++
10722 ++/**
10723 ++ * bfq_update_min - update the min_start field of an entity.
10724 ++ * @entity: the entity to update.
10725 ++ * @node: one of its children.
10726 ++ *
10727 ++ * This function is called when @entity may store an invalid value for
10728 ++ * min_start due to updates to the active tree. The function assumes
10729 ++ * that the subtree rooted at @node (which may be its left or its right
10730 ++ * child) has a valid min_start value.
10731 ++ */
10732 ++static inline void bfq_update_min(struct bfq_entity *entity,
10733 ++ struct rb_node *node)
10734 ++{
10735 ++ struct bfq_entity *child;
10736 ++
10737 ++ if (node != NULL) {
10738 ++ child = rb_entry(node, struct bfq_entity, rb_node);
10739 ++ if (bfq_gt(entity->min_start, child->min_start))
10740 ++ entity->min_start = child->min_start;
10741 ++ }
10742 ++}
10743 ++
10744 ++/**
10745 ++ * bfq_update_active_node - recalculate min_start.
10746 ++ * @node: the node to update.
10747 ++ *
10748 ++ * @node may have changed position or one of its children may have moved;
10749 ++ * this function updates its min_start value. The left and right subtrees
10750 ++ * are assumed to hold a correct min_start value.
10751 ++ */
10752 ++static inline void bfq_update_active_node(struct rb_node *node)
10753 ++{
10754 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
10755 ++
10756 ++ entity->min_start = entity->start;
10757 ++ bfq_update_min(entity, node->rb_right);
10758 ++ bfq_update_min(entity, node->rb_left);
10759 ++}
10760 ++
10761 ++/**
10762 ++ * bfq_update_active_tree - update min_start for the whole active tree.
10763 ++ * @node: the starting node.
10764 ++ *
10765 ++ * @node must be the deepest modified node after an update. This function
10766 ++ * updates its min_start using the values held by its children, assuming
10767 ++ * that they did not change, and then updates all the nodes that may have
10768 ++ * changed in the path to the root. The only nodes that may have changed
10769 ++ * are the ones in the path or their siblings.
10770 ++ */
10771 ++static void bfq_update_active_tree(struct rb_node *node)
10772 ++{
10773 ++ struct rb_node *parent;
10774 ++
10775 ++up:
10776 ++ bfq_update_active_node(node);
10777 ++
10778 ++ parent = rb_parent(node);
10779 ++ if (parent == NULL)
10780 ++ return;
10781 ++
10782 ++ if (node == parent->rb_left && parent->rb_right != NULL)
10783 ++ bfq_update_active_node(parent->rb_right);
10784 ++ else if (parent->rb_left != NULL)
10785 ++ bfq_update_active_node(parent->rb_left);
10786 ++
10787 ++ node = parent;
10788 ++ goto up;
10789 ++}
10790 ++
10791 ++/**
10792 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
10793 ++ * @st: the service tree of the entity.
10794 ++ * @entity: the entity being inserted.
10795 ++ *
10796 ++ * The active tree is ordered by finish time, but an extra key is kept
10797 ++ * for each node, containing the minimum value for the start times of
10798 ++ * its children (and the node itself), so it's possible to search for
10799 ++ * the eligible node with the lowest finish time in logarithmic time.
10800 ++ */
10801 ++static void bfq_active_insert(struct bfq_service_tree *st,
10802 ++ struct bfq_entity *entity)
10803 ++{
10804 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10805 ++ struct rb_node *node = &entity->rb_node;
10806 ++
10807 ++ bfq_insert(&st->active, entity);
10808 ++
10809 ++ if (node->rb_left != NULL)
10810 ++ node = node->rb_left;
10811 ++ else if (node->rb_right != NULL)
10812 ++ node = node->rb_right;
10813 ++
10814 ++ bfq_update_active_tree(node);
10815 ++
10816 ++ if (bfqq != NULL)
10817 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
10818 ++}
10819 ++
10820 ++/**
10821 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
10822 ++ * @ioprio: the ioprio value to convert.
10823 ++ */
10824 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
10825 ++{
10826 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
10827 ++ return IOPRIO_BE_NR - ioprio;
10828 ++}
10829 ++
10830 ++/**
10831 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
10832 ++ * @weight: the weight value to convert.
10833 ++ *
10834 ++ * To preserve as much as possible the old only-ioprio user interface,
10835 ++ * 0 is used as an escape ioprio value for weights (numerically) equal to
10836 ++ * or larger than IOPRIO_BE_NR.
10837 ++ */
10838 ++static unsigned short bfq_weight_to_ioprio(int weight)
10839 ++{
10840 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
10841 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
10842 ++}
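
The two mappings above are simple reflections around IOPRIO_BE_NR: best-effort ioprio levels 0..7 become weights 8..1, and any weight at or above IOPRIO_BE_NR is reported back through the legacy interface as the escape ioprio 0. A minimal sketch:

    /* Editor's illustration of the ioprio <-> weight mappings above. */
    #include <stdio.h>

    #define IOPRIO_BE_NR 8

    static unsigned short bfq_ioprio_to_weight(int ioprio)
    {
        return IOPRIO_BE_NR - ioprio;
    }

    static unsigned short bfq_weight_to_ioprio(int weight)
    {
        return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
    }

    int main(void)
    {
        int ioprio;

        for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
            printf("ioprio %d <-> weight %hu\n",
                   ioprio, bfq_ioprio_to_weight(ioprio));

        printf("weight 100 -> ioprio %hu (escape value)\n",
               bfq_weight_to_ioprio(100));
        return 0;
    }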
10843 ++
10844 ++static inline void bfq_get_entity(struct bfq_entity *entity)
10845 ++{
10846 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10847 ++ struct bfq_sched_data *sd;
10848 ++
10849 ++ if (bfqq != NULL) {
10850 ++ sd = entity->sched_data;
10851 ++ atomic_inc(&bfqq->ref);
10852 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
10853 ++ bfqq, atomic_read(&bfqq->ref));
10854 ++ }
10855 ++}
10856 ++
10857 ++/**
10858 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
10859 ++ * @node: the node being removed.
10860 ++ *
10861 ++ * Do the first step of an extraction in an rb tree, looking for the
10862 ++ * node that will replace @node, and returning the deepest node that
10863 ++ * the following modifications to the tree can touch. If @node is the
10864 ++ * last node in the tree, return %NULL.
10865 ++ */
10866 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
10867 ++{
10868 ++ struct rb_node *deepest;
10869 ++
10870 ++ if (node->rb_right == NULL && node->rb_left == NULL)
10871 ++ deepest = rb_parent(node);
10872 ++ else if (node->rb_right == NULL)
10873 ++ deepest = node->rb_left;
10874 ++ else if (node->rb_left == NULL)
10875 ++ deepest = node->rb_right;
10876 ++ else {
10877 ++ deepest = rb_next(node);
10878 ++ if (deepest->rb_right != NULL)
10879 ++ deepest = deepest->rb_right;
10880 ++ else if (rb_parent(deepest) != node)
10881 ++ deepest = rb_parent(deepest);
10882 ++ }
10883 ++
10884 ++ return deepest;
10885 ++}
10886 ++
10887 ++/**
10888 ++ * bfq_active_extract - remove an entity from the active tree.
10889 ++ * @st: the service_tree containing the tree.
10890 ++ * @entity: the entity being removed.
10891 ++ */
10892 ++static void bfq_active_extract(struct bfq_service_tree *st,
10893 ++ struct bfq_entity *entity)
10894 ++{
10895 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10896 ++ struct rb_node *node;
10897 ++
10898 ++ node = bfq_find_deepest(&entity->rb_node);
10899 ++ bfq_extract(&st->active, entity);
10900 ++
10901 ++ if (node != NULL)
10902 ++ bfq_update_active_tree(node);
10903 ++
10904 ++ if (bfqq != NULL)
10905 ++ list_del(&bfqq->bfqq_list);
10906 ++}
10907 ++
10908 ++/**
10909 ++ * bfq_idle_insert - insert an entity into the idle tree.
10910 ++ * @st: the service tree containing the tree.
10911 ++ * @entity: the entity to insert.
10912 ++ */
10913 ++static void bfq_idle_insert(struct bfq_service_tree *st,
10914 ++ struct bfq_entity *entity)
10915 ++{
10916 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10917 ++ struct bfq_entity *first_idle = st->first_idle;
10918 ++ struct bfq_entity *last_idle = st->last_idle;
10919 ++
10920 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
10921 ++ st->first_idle = entity;
10922 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
10923 ++ st->last_idle = entity;
10924 ++
10925 ++ bfq_insert(&st->idle, entity);
10926 ++
10927 ++ if (bfqq != NULL)
10928 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
10929 ++}
10930 ++
10931 ++/**
10932 ++ * bfq_forget_entity - remove an entity from the wfq trees.
10933 ++ * @st: the service tree.
10934 ++ * @entity: the entity being removed.
10935 ++ *
10936 ++ * Update the device status and forget everything about @entity, putting
10937 ++ * the device reference to it, if it is a queue. Entities belonging to
10938 ++ * groups are not refcounted.
10939 ++ */
10940 ++static void bfq_forget_entity(struct bfq_service_tree *st,
10941 ++ struct bfq_entity *entity)
10942 ++{
10943 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10944 ++ struct bfq_sched_data *sd;
10945 ++
10946 ++ BUG_ON(!entity->on_st);
10947 ++
10948 ++ entity->on_st = 0;
10949 ++ st->wsum -= entity->weight;
10950 ++ if (bfqq != NULL) {
10951 ++ sd = entity->sched_data;
10952 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
10953 ++ bfqq, atomic_read(&bfqq->ref));
10954 ++ bfq_put_queue(bfqq);
10955 ++ }
10956 ++}
10957 ++
10958 ++/**
10959 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
10960 ++ * @st: service tree for the entity.
10961 ++ * @entity: the entity being released.
10962 ++ */
10963 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
10964 ++ struct bfq_entity *entity)
10965 ++{
10966 ++ bfq_idle_extract(st, entity);
10967 ++ bfq_forget_entity(st, entity);
10968 ++}
10969 ++
10970 ++/**
10971 ++ * bfq_forget_idle - update the idle tree if necessary.
10972 ++ * @st: the service tree to act upon.
10973 ++ *
10974 ++ * To preserve the global O(log N) complexity we only remove one entry here;
10975 ++ * as the idle tree will not grow indefinitely this can be done safely.
10976 ++ */
10977 ++static void bfq_forget_idle(struct bfq_service_tree *st)
10978 ++{
10979 ++ struct bfq_entity *first_idle = st->first_idle;
10980 ++ struct bfq_entity *last_idle = st->last_idle;
10981 ++
10982 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
10983 ++ !bfq_gt(last_idle->finish, st->vtime)) {
10984 ++ /*
10985 ++ * Forget the whole idle tree, increasing the vtime past
10986 ++ * the last finish time of idle entities.
10987 ++ */
10988 ++ st->vtime = last_idle->finish;
10989 ++ }
10990 ++
10991 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
10992 ++ bfq_put_idle_entity(st, first_idle);
10993 ++}
10994 ++
10995 ++static struct bfq_service_tree *
10996 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
10997 ++ struct bfq_entity *entity)
10998 ++{
10999 ++ struct bfq_service_tree *new_st = old_st;
11000 ++
11001 ++ if (entity->ioprio_changed) {
11002 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11003 ++
11004 ++ BUG_ON(old_st->wsum < entity->weight);
11005 ++ old_st->wsum -= entity->weight;
11006 ++
11007 ++ if (entity->new_weight != entity->orig_weight) {
11008 ++ entity->orig_weight = entity->new_weight;
11009 ++ entity->ioprio =
11010 ++ bfq_weight_to_ioprio(entity->orig_weight);
11011 ++ } else if (entity->new_ioprio != entity->ioprio) {
11012 ++ entity->ioprio = entity->new_ioprio;
11013 ++ entity->orig_weight =
11014 ++ bfq_ioprio_to_weight(entity->ioprio);
11015 ++ } else
11016 ++ entity->new_weight = entity->orig_weight =
11017 ++ bfq_ioprio_to_weight(entity->ioprio);
11018 ++
11019 ++ entity->ioprio_class = entity->new_ioprio_class;
11020 ++ entity->ioprio_changed = 0;
11021 ++
11022 ++ /*
11023 ++ * NOTE: here we may be changing the weight too early;
11024 ++ * this will cause unfairness. The correct approach
11025 ++ * would have required additional complexity to defer
11026 ++ * weight changes to the proper time instants (i.e.,
11027 ++ * when entity->finish <= old_st->vtime).
11028 ++ */
11029 ++ new_st = bfq_entity_service_tree(entity);
11030 ++ entity->weight = entity->orig_weight *
11031 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
11032 ++ new_st->wsum += entity->weight;
11033 ++
11034 ++ if (new_st != old_st)
11035 ++ entity->start = new_st->vtime;
11036 ++ }
11037 ++
11038 ++ return new_st;
11039 ++}
11040 ++
11041 ++/**
11042 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
11043 ++ * @bfqq: the queue being served.
11044 ++ * @served: bytes to transfer.
11045 ++ *
11046 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
11047 ++ * are synchronized every time a new bfqq is selected for service. For now,
11048 ++ * we keep it to better check consistency.
11049 ++ */
11050 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
11051 ++{
11052 ++ struct bfq_entity *entity = &bfqq->entity;
11053 ++ struct bfq_service_tree *st;
11054 ++
11055 ++ for_each_entity(entity) {
11056 ++ st = bfq_entity_service_tree(entity);
11057 ++
11058 ++ entity->service += served;
11059 ++ BUG_ON(entity->service > entity->budget);
11060 ++ BUG_ON(st->wsum == 0);
11061 ++
11062 ++ st->vtime += bfq_delta(served, st->wsum);
11063 ++ bfq_forget_idle(st);
11064 ++ }
11065 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
11066 ++}
11067 ++
11068 ++/**
11069 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
11070 ++ * @bfqq: the queue that needs a service update.
11071 ++ *
11072 ++ * When it's not possible to be fair in the service domain, because
11073 ++ * a queue is not consuming its budget fast enough (the meaning of
11074 ++ * fast depends on the timeout parameter), we charge it a full
11075 ++ * budget. In this way we should obtain a sort of time-domain
11076 ++ * fairness among all the seeky/slow queues.
11077 ++ */
11078 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
11079 ++{
11080 ++ struct bfq_entity *entity = &bfqq->entity;
11081 ++
11082 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
11083 ++
11084 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
11085 ++}
11086 ++
11087 ++/**
11088 ++ * __bfq_activate_entity - activate an entity.
11089 ++ * @entity: the entity being activated.
11090 ++ *
11091 ++ * Called whenever an entity is activated, i.e., it is not active and one
11092 ++ * of its children receives a new request, or has to be reactivated due to
11093 ++ * budget exhaustion. It uses the current budget of the entity (and the
11094 ++ * service received if @entity is active) of the queue to calculate its
11095 ++ * timestamps.
11096 ++ */
11097 ++static void __bfq_activate_entity(struct bfq_entity *entity)
11098 ++{
11099 ++ struct bfq_sched_data *sd = entity->sched_data;
11100 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11101 ++
11102 ++ if (entity == sd->in_service_entity) {
11103 ++ BUG_ON(entity->tree != NULL);
11104 ++ /*
11105 ++ * If we are requeueing the current entity, we have
11106 ++ * to take care not to charge it for service it has
11107 ++ * not received.
11108 ++ */
11109 ++ bfq_calc_finish(entity, entity->service);
11110 ++ entity->start = entity->finish;
11111 ++ sd->in_service_entity = NULL;
11112 ++ } else if (entity->tree == &st->active) {
11113 ++ /*
11114 ++ * Requeueing an entity due to a change of some
11115 ++ * next_in_service entity below it. We reuse the
11116 ++ * old start time.
11117 ++ */
11118 ++ bfq_active_extract(st, entity);
11119 ++ } else if (entity->tree == &st->idle) {
11120 ++ /*
11121 ++ * Must be on the idle tree, bfq_idle_extract() will
11122 ++ * check for that.
11123 ++ */
11124 ++ bfq_idle_extract(st, entity);
11125 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
11126 ++ st->vtime : entity->finish;
11127 ++ } else {
11128 ++ /*
11129 ++ * The finish time of the entity may be invalid, and
11130 ++ * it is certainly in the past; otherwise the queue
11131 ++ * would have been on the idle tree.
11132 ++ */
11133 ++ entity->start = st->vtime;
11134 ++ st->wsum += entity->weight;
11135 ++ bfq_get_entity(entity);
11136 ++
11137 ++ BUG_ON(entity->on_st);
11138 ++ entity->on_st = 1;
11139 ++ }
11140 ++
11141 ++ st = __bfq_entity_update_weight_prio(st, entity);
11142 ++ bfq_calc_finish(entity, entity->budget);
11143 ++ bfq_active_insert(st, entity);
11144 ++}
11145 ++
11146 ++/**
11147 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11148 ++ * @entity: the entity to activate.
11149 ++ *
11150 ++ * Activate @entity and all the entities on the path from it to the root.
11151 ++ */
11152 ++static void bfq_activate_entity(struct bfq_entity *entity)
11153 ++{
11154 ++ struct bfq_sched_data *sd;
11155 ++
11156 ++ for_each_entity(entity) {
11157 ++ __bfq_activate_entity(entity);
11158 ++
11159 ++ sd = entity->sched_data;
11160 ++ if (!bfq_update_next_in_service(sd))
11161 ++ /*
11162 ++ * No need to propagate the activation to the
11163 ++ * upper entities, as they will be updated when
11164 ++ * the in-service entity is rescheduled.
11165 ++ */
11166 ++ break;
11167 ++ }
11168 ++}
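/*
 * Minimal sketch of the upward-propagation cut-off in bfq_activate_entity()
 * above, using made-up structures: climb the parent chain, but stop as soon
 * as one level reports that its cached next-in-service choice did not
 * change, since the levels above it then need no update.
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_entity {
	struct toy_entity *parent;
	bool choice_changed;	/* stands in for bfq_update_next_in_service() */
	int level;
};

static int toy_activate(struct toy_entity *e)
{
	int highest = -1;

	for (; e != NULL; e = e->parent) {
		highest = e->level;		/* this level was touched */
		if (!e->choice_changed)
			break;			/* upper levels unaffected */
	}
	return highest;
}

int main(void)
{
	struct toy_entity root  = { NULL,   false, 2 };
	struct toy_entity group = { &root,  true,  1 };
	struct toy_entity queue = { &group, true,  0 };

	/* propagation reaches the root because every lower choice changed */
	return toy_activate(&queue) == 2 ? 0 : 1;
}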
11169 ++
11170 ++/**
11171 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11172 ++ * @entity: the entity to deactivate.
11173 ++ * @requeue: if false, the entity will not be put into the idle tree.
11174 ++ *
11175 ++ * Deactivate an entity, independently from its previous state. If the
11176 ++ * entity was not on a service tree just return, otherwise if it is on
11177 ++ * any scheduler tree, extract it from that tree, and if necessary
11178 ++ * and if the caller specified @requeue, put it on the idle tree.
11179 ++ *
11180 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
11181 ++ * if the entity was under service or if it was the next_in_service for
11182 ++ * its sched_data; return %0 otherwise.
11183 ++ */
11184 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11185 ++{
11186 ++ struct bfq_sched_data *sd = entity->sched_data;
11187 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11188 ++ int was_in_service = entity == sd->in_service_entity;
11189 ++ int ret = 0;
11190 ++
11191 ++ if (!entity->on_st)
11192 ++ return 0;
11193 ++
11194 ++ BUG_ON(was_in_service && entity->tree != NULL);
11195 ++
11196 ++ if (was_in_service) {
11197 ++ bfq_calc_finish(entity, entity->service);
11198 ++ sd->in_service_entity = NULL;
11199 ++ } else if (entity->tree == &st->active)
11200 ++ bfq_active_extract(st, entity);
11201 ++ else if (entity->tree == &st->idle)
11202 ++ bfq_idle_extract(st, entity);
11203 ++ else if (entity->tree != NULL)
11204 ++ BUG();
11205 ++
11206 ++ if (was_in_service || sd->next_in_service == entity)
11207 ++ ret = bfq_update_next_in_service(sd);
11208 ++
11209 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
11210 ++ bfq_forget_entity(st, entity);
11211 ++ else
11212 ++ bfq_idle_insert(st, entity);
11213 ++
11214 ++ BUG_ON(sd->in_service_entity == entity);
11215 ++ BUG_ON(sd->next_in_service == entity);
11216 ++
11217 ++ return ret;
11218 ++}
11219 ++
11220 ++/**
11221 ++ * bfq_deactivate_entity - deactivate an entity.
11222 ++ * @entity: the entity to deactivate.
11223 ++ * @requeue: true if the entity can be put on the idle tree
11224 ++ */
11225 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11226 ++{
11227 ++ struct bfq_sched_data *sd;
11228 ++ struct bfq_entity *parent;
11229 ++
11230 ++ for_each_entity_safe(entity, parent) {
11231 ++ sd = entity->sched_data;
11232 ++
11233 ++ if (!__bfq_deactivate_entity(entity, requeue))
11234 ++ /*
11235 ++ * The parent entity is still backlogged, and
11236 ++ * we don't need to update it as it is still
11237 ++ * under service.
11238 ++ */
11239 ++ break;
11240 ++
11241 ++ if (sd->next_in_service != NULL)
11242 ++ /*
11243 ++ * The parent entity is still backlogged and
11244 ++ * the budgets on the path towards the root
11245 ++ * need to be updated.
11246 ++ */
11247 ++ goto update;
11248 ++
11249 ++ /*
11250 ++		 * If we get here, the parent is no longer backlogged and
11251 ++ * we want to propagate the dequeue upwards.
11252 ++ */
11253 ++ requeue = 1;
11254 ++ }
11255 ++
11256 ++ return;
11257 ++
11258 ++update:
11259 ++ entity = parent;
11260 ++ for_each_entity(entity) {
11261 ++ __bfq_activate_entity(entity);
11262 ++
11263 ++ sd = entity->sched_data;
11264 ++ if (!bfq_update_next_in_service(sd))
11265 ++ break;
11266 ++ }
11267 ++}
11268 ++
11269 ++/**
11270 ++ * bfq_update_vtime - update vtime if necessary.
11271 ++ * @st: the service tree to act upon.
11272 ++ *
11273 ++ * If necessary update the service tree vtime to have at least one
11274 ++ * eligible entity, skipping to its start time. Assumes that the
11275 ++ * active tree of the device is not empty.
11276 ++ *
11277 ++ * NOTE: this hierarchical implementation updates vtimes quite often, so
11278 ++ * we may end up with reactivated tasks getting timestamps after a
11279 ++ * vtime skip done because we needed a ->first_active entity on some
11280 ++ * intermediate node.
11281 ++ */
11282 ++static void bfq_update_vtime(struct bfq_service_tree *st)
11283 ++{
11284 ++ struct bfq_entity *entry;
11285 ++ struct rb_node *node = st->active.rb_node;
11286 ++
11287 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
11288 ++ if (bfq_gt(entry->min_start, st->vtime)) {
11289 ++ st->vtime = entry->min_start;
11290 ++ bfq_forget_idle(st);
11291 ++ }
11292 ++}
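/*
 * Small sketch of the vtime skip performed above (made-up types): if even
 * the smallest start time in the active tree lies ahead of the scheduler's
 * virtual time, jump vtime forward so that at least one entity becomes
 * eligible (start <= vtime); vtime never moves backwards.
 */
#include <stdint.h>

struct toy_tree {
	uint64_t vtime;		/* scheduler virtual time */
	uint64_t min_start;	/* smallest start among active entities */
};

static void toy_update_vtime(struct toy_tree *st)
{
	if (st->min_start > st->vtime)
		st->vtime = st->min_start;
}

int main(void)
{
	struct toy_tree st = { .vtime = 100, .min_start = 140 };

	toy_update_vtime(&st);
	return st.vtime == 140 ? 0 : 1;	/* vtime skipped to 140 */
}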
11293 ++
11294 ++/**
11295 ++ * bfq_first_active_entity - find the eligible entity with
11296 ++ * the smallest finish time
11297 ++ * @st: the service tree to select from.
11298 ++ *
11299 ++ * This function searches for the first schedulable entity, starting from
11300 ++ * the root of the tree and going left whenever the left subtree contains
11301 ++ * at least one eligible (start <= vtime) entity. The path
11302 ++ * on the right is followed only if a) the left subtree contains no eligible
11303 ++ * entities and b) no eligible entity has been found yet.
11304 ++ */
11305 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
11306 ++{
11307 ++ struct bfq_entity *entry, *first = NULL;
11308 ++ struct rb_node *node = st->active.rb_node;
11309 ++
11310 ++ while (node != NULL) {
11311 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
11312 ++left:
11313 ++ if (!bfq_gt(entry->start, st->vtime))
11314 ++ first = entry;
11315 ++
11316 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
11317 ++
11318 ++ if (node->rb_left != NULL) {
11319 ++ entry = rb_entry(node->rb_left,
11320 ++ struct bfq_entity, rb_node);
11321 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
11322 ++ node = node->rb_left;
11323 ++ goto left;
11324 ++ }
11325 ++ }
11326 ++ if (first != NULL)
11327 ++ break;
11328 ++ node = node->rb_right;
11329 ++ }
11330 ++
11331 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
11332 ++ return first;
11333 ++}
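/*
 * Sketch of the augmented-tree lookup above, on made-up structures rather
 * than the kernel rb-tree: each node caches min_start, the smallest start
 * time in its subtree.  The search keeps descending left while the left
 * subtree may still hold an eligible entity (min_start <= vtime); since the
 * active tree is ordered by finish time, the eligible node found this way
 * is the one with the smallest finish time.
 */
#include <stddef.h>
#include <stdint.h>

struct toy_node {
	uint64_t start, finish, min_start;
	struct toy_node *left, *right;
};

static struct toy_node *toy_first_eligible(struct toy_node *node, uint64_t vtime)
{
	struct toy_node *first = NULL;

	while (node != NULL) {
		if (node->start <= vtime)
			first = node;		/* eligible: remember it */
		if (node->left && node->left->min_start <= vtime) {
			node = node->left;	/* a smaller finish may be eligible */
			continue;
		}
		if (first)
			break;			/* nothing better to the right */
		node = node->right;		/* keep hunting for any eligible node */
	}
	return first;
}

int main(void)
{
	struct toy_node l = { .start = 5,  .finish = 20, .min_start = 5 };
	struct toy_node r = { .start = 10, .finish = 80, .min_start = 10 };
	struct toy_node root = { .start = 30, .finish = 50, .min_start = 5,
				 .left = &l, .right = &r };

	/* l and r are eligible at vtime 10; l has the smaller finish time */
	return toy_first_eligible(&root, 10) == &l ? 0 : 1;
}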
11334 ++
11335 ++/**
11336 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
11337 ++ * @st: the service tree.
11338 ++ *
11339 ++ * Update the virtual time in @st and return the first eligible entity
11340 ++ * it contains.
11341 ++ */
11342 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
11343 ++ bool force)
11344 ++{
11345 ++ struct bfq_entity *entity, *new_next_in_service = NULL;
11346 ++
11347 ++ if (RB_EMPTY_ROOT(&st->active))
11348 ++ return NULL;
11349 ++
11350 ++ bfq_update_vtime(st);
11351 ++ entity = bfq_first_active_entity(st);
11352 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
11353 ++
11354 ++ /*
11355 ++	 * If the chosen entity does not match the sched_data's
11356 ++	 * next_in_service and we are forcibly serving the IDLE priority
11357 ++ * class tree, bubble up budget update.
11358 ++ */
11359 ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
11360 ++ new_next_in_service = entity;
11361 ++ for_each_entity(new_next_in_service)
11362 ++ bfq_update_budget(new_next_in_service);
11363 ++ }
11364 ++
11365 ++ return entity;
11366 ++}
11367 ++
11368 ++/**
11369 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
11370 ++ * @sd: the sched_data.
11371 ++ * @extract: if true the returned entity will be also extracted from @sd.
11372 ++ *
11373 ++ * NOTE: since we cache the next_in_service entity at each level of the
11374 ++ * hierarchy, the complexity of the lookup can be decreased with
11375 ++ * absolutely no effort just returning the cached next_in_service value;
11376 ++ * we prefer to do full lookups to test the consistency of the data
11377 ++ * structures.
11378 ++ */
11379 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11380 ++ int extract,
11381 ++ struct bfq_data *bfqd)
11382 ++{
11383 ++ struct bfq_service_tree *st = sd->service_tree;
11384 ++ struct bfq_entity *entity;
11385 ++ int i = 0;
11386 ++
11387 ++ BUG_ON(sd->in_service_entity != NULL);
11388 ++
11389 ++ if (bfqd != NULL &&
11390 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
11391 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
11392 ++ true);
11393 ++ if (entity != NULL) {
11394 ++ i = BFQ_IOPRIO_CLASSES - 1;
11395 ++ bfqd->bfq_class_idle_last_service = jiffies;
11396 ++ sd->next_in_service = entity;
11397 ++ }
11398 ++ }
11399 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
11400 ++ entity = __bfq_lookup_next_entity(st + i, false);
11401 ++ if (entity != NULL) {
11402 ++ if (extract) {
11403 ++ bfq_check_next_in_service(sd, entity);
11404 ++ bfq_active_extract(st + i, entity);
11405 ++ sd->in_service_entity = entity;
11406 ++ sd->next_in_service = NULL;
11407 ++ }
11408 ++ break;
11409 ++ }
11410 ++ }
11411 ++
11412 ++ return entity;
11413 ++}
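/*
 * Toy sketch of the class scan above (made-up arrays): service trees are
 * visited in strict priority order, RT first, then BE, then IDLE, and the
 * first class with pending work wins.  The real lookup additionally forces
 * an occasional IDLE-class service once that class has waited longer than
 * BFQ_CL_IDLE_TIMEOUT, so it cannot be starved forever.
 */
#include <stdio.h>

#define TOY_CLASSES 3

static const char *toy_names[TOY_CLASSES] = { "RT", "BE", "IDLE" };

static int toy_pick_class(const int pending[TOY_CLASSES])
{
	int i;

	for (i = 0; i < TOY_CLASSES; i++)
		if (pending[i] > 0)
			return i;	/* highest-priority class with work */
	return -1;
}

int main(void)
{
	int pending[TOY_CLASSES] = { 0, 4, 2 };	/* no RT work, some BE and IDLE */
	int c = toy_pick_class(pending);

	printf("next class: %s\n", c >= 0 ? toy_names[c] : "none");
	return 0;
}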
11414 ++
11415 ++/*
11416 ++ * Get next queue for service.
11417 ++ */
11418 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
11419 ++{
11420 ++ struct bfq_entity *entity = NULL;
11421 ++ struct bfq_sched_data *sd;
11422 ++ struct bfq_queue *bfqq;
11423 ++
11424 ++ BUG_ON(bfqd->in_service_queue != NULL);
11425 ++
11426 ++ if (bfqd->busy_queues == 0)
11427 ++ return NULL;
11428 ++
11429 ++ sd = &bfqd->root_group->sched_data;
11430 ++ for (; sd != NULL; sd = entity->my_sched_data) {
11431 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
11432 ++ BUG_ON(entity == NULL);
11433 ++ entity->service = 0;
11434 ++ }
11435 ++
11436 ++ bfqq = bfq_entity_to_bfqq(entity);
11437 ++ BUG_ON(bfqq == NULL);
11438 ++
11439 ++ return bfqq;
11440 ++}
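/*
 * Made-up sketch of the top-down walk in bfq_get_next_queue(): starting at
 * the root scheduler data, look up the next entity at each level and follow
 * its private sched_data until a leaf is reached; the leaf entity is the
 * queue to serve.
 */
#include <stddef.h>
#include <stdio.h>

struct toy_entity;

struct toy_sched_data {
	struct toy_entity *next;		/* entity chosen at this level */
};

struct toy_entity {
	struct toy_sched_data *my_sched_data;	/* NULL on leaf entities */
	int queue_id;				/* meaningful only for leaves */
};

static struct toy_entity *toy_descend(struct toy_sched_data *sd)
{
	struct toy_entity *entity = NULL;

	for (; sd != NULL; sd = entity->my_sched_data)
		entity = sd->next;		/* per-level lookup */
	return entity;				/* leaf, i.e. the queue */
}

int main(void)
{
	struct toy_entity leaf = { NULL, 7 };
	struct toy_sched_data group_sd = { &leaf };
	struct toy_entity group = { &group_sd, -1 };
	struct toy_sched_data root_sd = { &group };

	printf("serve queue %d\n", toy_descend(&root_sd)->queue_id);
	return 0;
}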
11441 ++
11442 ++/*
11443 ++ * Forced extraction of the given queue.
11444 ++ */
11445 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
11446 ++ struct bfq_queue *bfqq)
11447 ++{
11448 ++ struct bfq_entity *entity;
11449 ++ struct bfq_sched_data *sd;
11450 ++
11451 ++ BUG_ON(bfqd->in_service_queue != NULL);
11452 ++
11453 ++ entity = &bfqq->entity;
11454 ++ /*
11455 ++ * Bubble up extraction/update from the leaf to the root.
11456 ++ */
11457 ++ for_each_entity(entity) {
11458 ++ sd = entity->sched_data;
11459 ++ bfq_update_budget(entity);
11460 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
11461 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
11462 ++		sd->in_service_entity = entity;
11463 ++		sd->next_in_service = NULL;
11464 ++ entity->service = 0;
11465 ++ }
11466 ++
11467 ++ return;
11468 ++}
11469 ++
11470 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
11471 ++{
11472 ++ if (bfqd->in_service_bic != NULL) {
11473 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
11474 ++ bfqd->in_service_bic = NULL;
11475 ++ }
11476 ++
11477 ++ bfqd->in_service_queue = NULL;
11478 ++ del_timer(&bfqd->idle_slice_timer);
11479 ++}
11480 ++
11481 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11482 ++ int requeue)
11483 ++{
11484 ++ struct bfq_entity *entity = &bfqq->entity;
11485 ++
11486 ++ if (bfqq == bfqd->in_service_queue)
11487 ++ __bfq_bfqd_reset_in_service(bfqd);
11488 ++
11489 ++ bfq_deactivate_entity(entity, requeue);
11490 ++}
11491 ++
11492 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11493 ++{
11494 ++ struct bfq_entity *entity = &bfqq->entity;
11495 ++
11496 ++ bfq_activate_entity(entity);
11497 ++}
11498 ++
11499 ++/*
11500 ++ * Called when the bfqq no longer has requests pending, remove it from
11501 ++ * the service tree.
11502 ++ */
11503 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11504 ++ int requeue)
11505 ++{
11506 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
11507 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
11508 ++
11509 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
11510 ++
11511 ++ bfq_clear_bfqq_busy(bfqq);
11512 ++
11513 ++ BUG_ON(bfqd->busy_queues == 0);
11514 ++ bfqd->busy_queues--;
11515 ++ if (bfqq->raising_coeff > 1)
11516 ++ bfqd->raised_busy_queues--;
11517 ++
11518 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
11519 ++}
11520 ++
11521 ++/*
11522 ++ * Called when an inactive queue receives a new request.
11523 ++ */
11524 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11525 ++{
11526 ++ BUG_ON(bfq_bfqq_busy(bfqq));
11527 ++ BUG_ON(bfqq == bfqd->in_service_queue);
11528 ++
11529 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
11530 ++
11531 ++ bfq_activate_bfqq(bfqd, bfqq);
11532 ++
11533 ++ bfq_mark_bfqq_busy(bfqq);
11534 ++ bfqd->busy_queues++;
11535 ++ if (bfqq->raising_coeff > 1)
11536 ++ bfqd->raised_busy_queues++;
11537 ++}
11538 +diff --git a/block/bfq.h b/block/bfq.h
11539 +new file mode 100644
11540 +index 0000000..3ca8482
11541 +--- /dev/null
11542 ++++ b/block/bfq.h
11543 +@@ -0,0 +1,622 @@
11544 ++/*
11545 ++ * BFQ-v7r2 for 3.14.0: data structures and common functions prototypes.
11546 ++ *
11547 ++ * Based on ideas and code from CFQ:
11548 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11549 ++ *
11550 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11551 ++ * Paolo Valente <paolo.valente@×××××××.it>
11552 ++ *
11553 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11554 ++ */
11555 ++
11556 ++#ifndef _BFQ_H
11557 ++#define _BFQ_H
11558 ++
11559 ++#include <linux/blktrace_api.h>
11560 ++#include <linux/hrtimer.h>
11561 ++#include <linux/ioprio.h>
11562 ++#include <linux/rbtree.h>
11563 ++
11564 ++#define BFQ_IOPRIO_CLASSES 3
11565 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
11566 ++
11567 ++#define BFQ_MIN_WEIGHT 1
11568 ++#define BFQ_MAX_WEIGHT 1000
11569 ++
11570 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
11571 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
11572 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
11573 ++
11574 ++struct bfq_entity;
11575 ++
11576 ++/**
11577 ++ * struct bfq_service_tree - per ioprio_class service tree.
11578 ++ * @active: tree for active entities (i.e., those backlogged).
11579 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
11580 ++ * @first_idle: idle entity with minimum F_i.
11581 ++ * @last_idle: idle entity with maximum F_i.
11582 ++ * @vtime: scheduler virtual time.
11583 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
11584 ++ *
11585 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
11586 ++ * ioprio_class has its own independent scheduler, and so its own
11587 ++ * bfq_service_tree. All the fields are protected by the queue lock
11588 ++ * of the containing bfqd.
11589 ++ */
11590 ++struct bfq_service_tree {
11591 ++ struct rb_root active;
11592 ++ struct rb_root idle;
11593 ++
11594 ++ struct bfq_entity *first_idle;
11595 ++ struct bfq_entity *last_idle;
11596 ++
11597 ++ u64 vtime;
11598 ++ unsigned long wsum;
11599 ++};
11600 ++
11601 ++/**
11602 ++ * struct bfq_sched_data - multi-class scheduler.
11603 ++ * @in_service_entity: entity under service.
11604 ++ * @next_in_service: head-of-the-line entity in the scheduler.
11605 ++ * @service_tree: array of service trees, one per ioprio_class.
11606 ++ *
11607 ++ * bfq_sched_data is the basic scheduler queue. It supports three
11608 ++ * ioprio_classes, and can be used either as a toplevel queue or as
11609 ++ * an intermediate queue on a hierarchical setup.
11610 ++ * @next_in_service points to the active entity of the sched_data
11611 ++ * service trees that will be scheduled next.
11612 ++ *
11613 ++ * The supported ioprio_classes are the same as in CFQ, in descending
11614 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
11615 ++ * Requests from higher priority queues are served before all the
11616 ++ * requests from lower priority queues; within the same class, entities
11617 ++ * are served according to B-WF2Q+.
11618 ++ * All the fields are protected by the queue lock of the containing bfqd.
11619 ++ */
11620 ++struct bfq_sched_data {
11621 ++ struct bfq_entity *in_service_entity;
11622 ++ struct bfq_entity *next_in_service;
11623 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
11624 ++};
11625 ++
11626 ++/**
11627 ++ * struct bfq_entity - schedulable entity.
11628 ++ * @rb_node: service_tree member.
11629 ++ * @on_st: flag, true if the entity is on a tree (either the active or
11630 ++ * the idle one of its service_tree).
11631 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
11632 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
11633 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
11634 ++ * @min_start: minimum start time of the (active) subtree rooted at
11635 ++ * this entity; used for O(log N) lookups into active trees.
11636 ++ * @service: service received during the last round of service.
11637 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
11638 ++ * @weight: weight of the queue
11639 ++ * @parent: parent entity, for hierarchical scheduling.
11640 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
11641 ++ * associated scheduler queue, %NULL on leaf nodes.
11642 ++ * @sched_data: the scheduler queue this entity belongs to.
11643 ++ * @ioprio: the ioprio in use.
11644 ++ * @new_weight: when a weight change is requested, the new weight value.
11645 ++ * @orig_weight: original weight, used to implement weight boosting
11646 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
11647 ++ * @ioprio_class: the ioprio_class in use.
11648 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
11649 ++ * ioprio_class value.
11650 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
11651 ++ * ioprio_class change.
11652 ++ *
11653 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
11654 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
11655 ++ * entity belongs to the sched_data of the parent group in the cgroup
11656 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
11657 ++ * in @my_sched_data.
11658 ++ *
11659 ++ * Each entity stores independently its priority values; this would
11660 ++ * allow different weights on different devices, but this
11661 ++ * functionality is not exported to userspace by now. Priorities and
11662 ++ * weights are updated lazily, first storing the new values into the
11663 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
11664 ++ * there is a transition in the entity state that allows the priority
11665 ++ * update to take place the effective and the requested priority
11666 ++ * values are synchronized.
11667 ++ *
11668 ++ * Unless cgroups are used, the weight value is calculated from the
11669 ++ * ioprio to export the same interface as CFQ. When dealing with
11670 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
11671 ++ * time to consume their budget and have true sequential behavior, and
11672 ++ * when there are no external factors breaking anticipation) the
11673 ++ * relative weights at each level of the cgroups hierarchy should be
11674 ++ * guaranteed. All the fields are protected by the queue lock of the
11675 ++ * containing bfqd.
11676 ++ */
11677 ++struct bfq_entity {
11678 ++ struct rb_node rb_node;
11679 ++
11680 ++ int on_st;
11681 ++
11682 ++ u64 finish;
11683 ++ u64 start;
11684 ++
11685 ++ struct rb_root *tree;
11686 ++
11687 ++ u64 min_start;
11688 ++
11689 ++ unsigned long service, budget;
11690 ++ unsigned short weight, new_weight;
11691 ++ unsigned short orig_weight;
11692 ++
11693 ++ struct bfq_entity *parent;
11694 ++
11695 ++ struct bfq_sched_data *my_sched_data;
11696 ++ struct bfq_sched_data *sched_data;
11697 ++
11698 ++ unsigned short ioprio, new_ioprio;
11699 ++ unsigned short ioprio_class, new_ioprio_class;
11700 ++
11701 ++ int ioprio_changed;
11702 ++};
11703 ++
11704 ++struct bfq_group;
11705 ++
11706 ++/**
11707 ++ * struct bfq_queue - leaf schedulable entity.
11708 ++ * @ref: reference counter.
11709 ++ * @bfqd: parent bfq_data.
11710 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
11711 ++ * one or more other queues.
11712 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
11713 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
11714 ++ * @sort_list: sorted list of pending requests.
11715 ++ * @next_rq: if fifo isn't expired, next request to serve.
11716 ++ * @queued: nr of requests queued in @sort_list.
11717 ++ * @allocated: currently allocated requests.
11718 ++ * @meta_pending: pending metadata requests.
11719 ++ * @fifo: fifo list of requests in sort_list.
11720 ++ * @entity: entity representing this queue in the scheduler.
11721 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
11722 ++ * @budget_timeout: budget expiration (in jiffies).
11723 ++ * @dispatched: number of requests on the dispatch list or inside driver.
11724 ++ * @org_ioprio: saved ioprio during boosted periods.
11725 ++ * @flags: status flags.
11726 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
11727 ++ * @seek_samples: number of seeks sampled
11728 ++ * @seek_total: sum of the distances of the seeks sampled
11729 ++ * @seek_mean: mean seek distance
11730 ++ * @last_request_pos: position of the last request enqueued
11731 ++ * @pid: pid of the process owning the queue, used for logging purposes.
11732 ++ * @last_rais_start_finish: start time of the current weight-raising period if
11733 ++ * the @bfq-queue is being weight-raised, otherwise
11734 ++ * finish time of the last weight-raising period
11735 ++ * @raising_cur_max_time: current max raising time for this queue
11736 ++ * @soft_rt_next_start: minimum time instant such that, only if a new request
11737 ++ * is enqueued after this time instant in an idle
11738 ++ * @bfq_queue with no outstanding requests, then the
11739 ++ * task associated with the queue it is deemed as soft
11740 ++ *                     task associated with the queue is deemed as soft
11741 ++ * bfq_bfqq_softrt_next_start())
11742 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
11743 ++ * idle to backlogged
11744 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
11745 ++ * since the last transition from idle to backlogged
11746 ++ *
11747 ++ * A bfq_queue is a leaf request queue; it can be associated with an io_context
11748 ++ * or more, if it is async or shared between cooperating processes. @cgroup
11749 ++ * holds a reference to the cgroup, to be sure that it does not disappear while
11750 ++ * a bfqq still references it (mostly to avoid races between request issuing and
11751 ++ * task migration followed by cgroup destruction).
11752 ++ * All the fields are protected by the queue lock of the containing bfqd.
11753 ++ */
11754 ++struct bfq_queue {
11755 ++ atomic_t ref;
11756 ++ struct bfq_data *bfqd;
11757 ++
11758 ++ /* fields for cooperating queues handling */
11759 ++ struct bfq_queue *new_bfqq;
11760 ++ struct rb_node pos_node;
11761 ++ struct rb_root *pos_root;
11762 ++
11763 ++ struct rb_root sort_list;
11764 ++ struct request *next_rq;
11765 ++ int queued[2];
11766 ++ int allocated[2];
11767 ++ int meta_pending;
11768 ++ struct list_head fifo;
11769 ++
11770 ++ struct bfq_entity entity;
11771 ++
11772 ++ unsigned long max_budget;
11773 ++ unsigned long budget_timeout;
11774 ++
11775 ++ int dispatched;
11776 ++
11777 ++ unsigned short org_ioprio;
11778 ++
11779 ++ unsigned int flags;
11780 ++
11781 ++ struct list_head bfqq_list;
11782 ++
11783 ++ unsigned int seek_samples;
11784 ++ u64 seek_total;
11785 ++ sector_t seek_mean;
11786 ++ sector_t last_request_pos;
11787 ++
11788 ++ pid_t pid;
11789 ++
11790 ++ /* weight-raising fields */
11791 ++ unsigned long raising_cur_max_time;
11792 ++ unsigned long soft_rt_next_start;
11793 ++ unsigned long last_rais_start_finish;
11794 ++ unsigned int raising_coeff;
11795 ++ unsigned long last_idle_bklogged;
11796 ++ unsigned long service_from_backlogged;
11797 ++};
11798 ++
11799 ++/**
11800 ++ * struct bfq_ttime - per process thinktime stats.
11801 ++ * @ttime_total: total process thinktime
11802 ++ * @ttime_samples: number of thinktime samples
11803 ++ * @ttime_mean: average process thinktime
11804 ++ */
11805 ++struct bfq_ttime {
11806 ++ unsigned long last_end_request;
11807 ++
11808 ++ unsigned long ttime_total;
11809 ++ unsigned long ttime_samples;
11810 ++ unsigned long ttime_mean;
11811 ++};
11812 ++
11813 ++/**
11814 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
11815 ++ * @icq: associated io_cq structure
11816 ++ * @bfqq: array of two process queues, the sync and the async
11817 ++ * @ttime: associated @bfq_ttime struct
11818 ++ */
11819 ++struct bfq_io_cq {
11820 ++ struct io_cq icq; /* must be the first member */
11821 ++ struct bfq_queue *bfqq[2];
11822 ++ struct bfq_ttime ttime;
11823 ++ int ioprio;
11824 ++};
11825 ++
11826 ++/**
11827 ++ * struct bfq_data - per device data structure.
11828 ++ * @queue: request queue for the managed device.
11829 ++ * @root_group: root bfq_group for the device.
11830 ++ * @rq_pos_tree: rbtree sorted by next_request position,
11831 ++ * used when determining if two or more queues
11832 ++ * have interleaving requests (see bfq_close_cooperator).
11833 ++ * @busy_queues: number of bfq_queues containing requests (including the
11834 ++ * queue under service, even if it is idling).
11835 ++ * @raised_busy_queues: number of weight-raised busy bfq_queues.
11836 ++ * @queued: number of queued requests.
11837 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
11838 ++ * @sync_flight: number of sync requests in the driver.
11839 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
11840 ++ *                    completed requests.
11841 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
11842 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
11843 ++ * @budgets_assigned: number of budgets assigned.
11844 ++ * @idle_slice_timer: timer set when idling for the next sequential request
11845 ++ * from the queue under service.
11846 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
11847 ++ * @in_service_queue: bfq_queue under service.
11848 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
11849 ++ * @last_position: on-disk position of the last served request.
11850 ++ * @last_budget_start: beginning of the last budget.
11851 ++ * @last_idling_start: beginning of the last idle slice.
11852 ++ * @peak_rate: peak transfer rate observed for a budget.
11853 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
11854 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
11855 ++ * @group_list: list of all the bfq_groups active on the device.
11856 ++ * @active_list: list of all the bfq_queues active on the device.
11857 ++ * @idle_list: list of all the bfq_queues idle on the device.
11858 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
11859 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
11860 ++ * requests are served in fifo order.
11861 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
11862 ++ * @bfq_back_max: maximum allowed backward seek.
11863 ++ * @bfq_slice_idle: maximum idling time.
11864 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
11865 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
11866 ++ * async queues.
11867 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
11868 ++ *               prevent seeky queues from imposing long latencies on well-
11869 ++ * behaved ones (this also implies that seeky queues cannot
11870 ++ * receive guarantees in the service domain; after a timeout
11871 ++ * they are charged for the whole allocated budget, to try
11872 ++ * to preserve a behavior reasonably fair among them, but
11873 ++ * without service-domain guarantees).
11874 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
11875 ++ * queue is multiplied
11876 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
11877 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
11878 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
11879 ++ * may be reactivated for a queue (in jiffies)
11880 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
11881 ++ * after which weight-raising may be
11882 ++ * reactivated for an already busy queue
11883 ++ * (in jiffies)
11884 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
11885 ++ *                              sectors per second
11886 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
11887 ++ * duration of the weight raising automatically
11888 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
11889 ++ *
11890 ++ * All the fields are protected by the @queue lock.
11891 ++ */
11892 ++struct bfq_data {
11893 ++ struct request_queue *queue;
11894 ++
11895 ++ struct bfq_group *root_group;
11896 ++
11897 ++ struct rb_root rq_pos_tree;
11898 ++
11899 ++ int busy_queues;
11900 ++ int raised_busy_queues;
11901 ++ int queued;
11902 ++ int rq_in_driver;
11903 ++ int sync_flight;
11904 ++
11905 ++ int max_rq_in_driver;
11906 ++ int hw_tag_samples;
11907 ++ int hw_tag;
11908 ++
11909 ++ int budgets_assigned;
11910 ++
11911 ++ struct timer_list idle_slice_timer;
11912 ++ struct work_struct unplug_work;
11913 ++
11914 ++ struct bfq_queue *in_service_queue;
11915 ++ struct bfq_io_cq *in_service_bic;
11916 ++
11917 ++ sector_t last_position;
11918 ++
11919 ++ ktime_t last_budget_start;
11920 ++ ktime_t last_idling_start;
11921 ++ int peak_rate_samples;
11922 ++ u64 peak_rate;
11923 ++ unsigned long bfq_max_budget;
11924 ++
11925 ++ struct hlist_head group_list;
11926 ++ struct list_head active_list;
11927 ++ struct list_head idle_list;
11928 ++
11929 ++ unsigned int bfq_quantum;
11930 ++ unsigned int bfq_fifo_expire[2];
11931 ++ unsigned int bfq_back_penalty;
11932 ++ unsigned int bfq_back_max;
11933 ++ unsigned int bfq_slice_idle;
11934 ++ u64 bfq_class_idle_last_service;
11935 ++
11936 ++ unsigned int bfq_user_max_budget;
11937 ++ unsigned int bfq_max_budget_async_rq;
11938 ++ unsigned int bfq_timeout[2];
11939 ++
11940 ++ bool low_latency;
11941 ++
11942 ++ /* parameters of the low_latency heuristics */
11943 ++ unsigned int bfq_raising_coeff;
11944 ++ unsigned int bfq_raising_max_time;
11945 ++ unsigned int bfq_raising_rt_max_time;
11946 ++ unsigned int bfq_raising_min_idle_time;
11947 ++ unsigned long bfq_raising_min_inter_arr_async;
11948 ++ unsigned int bfq_raising_max_softrt_rate;
11949 ++ u64 RT_prod;
11950 ++
11951 ++ struct bfq_queue oom_bfqq;
11952 ++};
11953 ++
11954 ++enum bfqq_state_flags {
11955 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
11956 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
11957 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
11958 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
11959 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
11960 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
11961 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
11962 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
11963 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
11964 ++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
11965 ++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
11966 ++};
11967 ++
11968 ++#define BFQ_BFQQ_FNS(name) \
11969 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
11970 ++{ \
11971 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
11972 ++} \
11973 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
11974 ++{ \
11975 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
11976 ++} \
11977 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
11978 ++{ \
11979 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
11980 ++}
11981 ++
11982 ++BFQ_BFQQ_FNS(busy);
11983 ++BFQ_BFQQ_FNS(wait_request);
11984 ++BFQ_BFQQ_FNS(must_alloc);
11985 ++BFQ_BFQQ_FNS(fifo_expire);
11986 ++BFQ_BFQQ_FNS(idle_window);
11987 ++BFQ_BFQQ_FNS(prio_changed);
11988 ++BFQ_BFQQ_FNS(sync);
11989 ++BFQ_BFQQ_FNS(budget_new);
11990 ++BFQ_BFQQ_FNS(coop);
11991 ++BFQ_BFQQ_FNS(split_coop);
11992 ++BFQ_BFQQ_FNS(softrt_update);
11993 ++#undef BFQ_BFQQ_FNS
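/*
 * Stand-alone demonstration of the accessor pattern that BFQ_BFQQ_FNS
 * generates, on a made-up struct: one macro expansion yields a
 * mark/clear/test triplet that manipulates a single bit of a flags word.
 */
#include <stdio.h>

struct toy_queue { unsigned int flags; };

#define TOY_FLAG_busy 0

#define TOY_FNS(name)							\
static inline void toy_mark_##name(struct toy_queue *q)		\
{									\
	q->flags |= (1U << TOY_FLAG_##name);				\
}									\
static inline void toy_clear_##name(struct toy_queue *q)		\
{									\
	q->flags &= ~(1U << TOY_FLAG_##name);				\
}									\
static inline int toy_##name(const struct toy_queue *q)		\
{									\
	return (q->flags & (1U << TOY_FLAG_##name)) != 0;		\
}

TOY_FNS(busy)
#undef TOY_FNS

int main(void)
{
	struct toy_queue q = { 0 };

	toy_mark_busy(&q);
	printf("busy=%d\n", toy_busy(&q));	/* 1 */
	toy_clear_busy(&q);
	printf("busy=%d\n", toy_busy(&q));	/* 0 */
	return 0;
}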
11994 ++
11995 ++/* Logging facilities. */
11996 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
11997 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
11998 ++
11999 ++#define bfq_log(bfqd, fmt, args...) \
12000 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
12001 ++
12002 ++/* Expiration reasons. */
12003 ++enum bfqq_expiration {
12004 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
12005 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
12006 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
12007 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
12008 ++};
12009 ++
12010 ++#ifdef CONFIG_CGROUP_BFQIO
12011 ++/**
12012 ++ * struct bfq_group - per (device, cgroup) data structure.
12013 ++ * @entity: schedulable entity to insert into the parent group sched_data.
12014 ++ * @sched_data: own sched_data, to contain child entities (they may be
12015 ++ * both bfq_queues and bfq_groups).
12016 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
12017 ++ * list of the containing cgroup's bfqio_cgroup.
12018 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
12019 ++ * of the groups active on the same device; used for cleanup.
12020 ++ * @bfqd: the bfq_data for the device this group acts upon.
12021 ++ * @async_bfqq: array of async queues for all the tasks belonging to
12022 ++ * the group, one queue per ioprio value per ioprio_class,
12023 ++ * except for the idle class that has only one queue.
12024 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
12025 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
12026 ++ * to avoid too many special cases during group creation/migration.
12027 ++ *
12028 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
12029 ++ * there is a set of bfq_groups, each one collecting the lower-level
12030 ++ * entities belonging to the group that are acting on the same device.
12031 ++ *
12032 ++ * Locking works as follows:
12033 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
12034 ++ * via RCU from its readers.
12035 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
12036 ++ * from the readers.
12037 ++ * o All the other fields are protected by the @bfqd queue lock.
12038 ++ */
12039 ++struct bfq_group {
12040 ++ struct bfq_entity entity;
12041 ++ struct bfq_sched_data sched_data;
12042 ++
12043 ++ struct hlist_node group_node;
12044 ++ struct hlist_node bfqd_node;
12045 ++
12046 ++ void *bfqd;
12047 ++
12048 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12049 ++ struct bfq_queue *async_idle_bfqq;
12050 ++
12051 ++ struct bfq_entity *my_entity;
12052 ++};
12053 ++
12054 ++/**
12055 ++ * struct bfqio_cgroup - bfq cgroup data structure.
12056 ++ * @css: subsystem state for bfq in the containing cgroup.
12057 ++ * @online: flag marked when the subsystem is inserted.
12058 ++ * @weight: cgroup weight.
12059 ++ * @ioprio: cgroup ioprio.
12060 ++ * @ioprio_class: cgroup ioprio_class.
12061 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
12062 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
12063 ++ *
12064 ++ * @group_data is accessed using RCU, with @lock protecting the updates,
12065 ++ * @ioprio and @ioprio_class are protected by @lock.
12066 ++ */
12067 ++struct bfqio_cgroup {
12068 ++ struct cgroup_subsys_state css;
12069 ++ bool online;
12070 ++
12071 ++ unsigned short weight, ioprio, ioprio_class;
12072 ++
12073 ++ spinlock_t lock;
12074 ++ struct hlist_head group_data;
12075 ++};
12076 ++#else
12077 ++struct bfq_group {
12078 ++ struct bfq_sched_data sched_data;
12079 ++
12080 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12081 ++ struct bfq_queue *async_idle_bfqq;
12082 ++};
12083 ++#endif
12084 ++
12085 ++static inline struct bfq_service_tree *
12086 ++bfq_entity_service_tree(struct bfq_entity *entity)
12087 ++{
12088 ++ struct bfq_sched_data *sched_data = entity->sched_data;
12089 ++ unsigned int idx = entity->ioprio_class - 1;
12090 ++
12091 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
12092 ++ BUG_ON(sched_data == NULL);
12093 ++
12094 ++ return sched_data->service_tree + idx;
12095 ++}
12096 ++
12097 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
12098 ++ int is_sync)
12099 ++{
12100 ++ return bic->bfqq[!!is_sync];
12101 ++}
12102 ++
12103 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
12104 ++ struct bfq_queue *bfqq, int is_sync)
12105 ++{
12106 ++ bic->bfqq[!!is_sync] = bfqq;
12107 ++}
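/*
 * Tiny sketch of the two-slot lookup used by bic_to_bfqq()/bic_set_bfqq(),
 * with made-up types: "!!is_sync" collapses any non-zero value to 1, so the
 * sync and async queues of a process can live in a two-entry array indexed
 * 0 = async, 1 = sync.
 */
#include <assert.h>
#include <stddef.h>

struct toy_queue { int id; };
struct toy_cq { struct toy_queue *bfqq[2]; };

static void toy_set(struct toy_cq *cq, struct toy_queue *q, int is_sync)
{
	cq->bfqq[!!is_sync] = q;
}

static struct toy_queue *toy_get(struct toy_cq *cq, int is_sync)
{
	return cq->bfqq[!!is_sync];
}

int main(void)
{
	struct toy_queue sync_q = { 1 }, async_q = { 2 };
	struct toy_cq cq = { { NULL, NULL } };

	toy_set(&cq, &async_q, 0);
	toy_set(&cq, &sync_q, 42);	/* any non-zero flag means "sync" */
	assert(toy_get(&cq, 1) == &sync_q);
	assert(toy_get(&cq, 0) == &async_q);
	return 0;
}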
12108 ++
12109 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
12110 ++{
12111 ++ return bic->icq.q->elevator->elevator_data;
12112 ++}
12113 ++
12114 ++/**
12115 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
12116 ++ * @ptr: a pointer to a bfqd.
12117 ++ * @flags: storage for the flags to be saved.
12118 ++ *
12119 ++ * This function allows bfqg->bfqd to be protected by the
12120 ++ * queue lock of the bfqd they reference; the pointer is dereferenced
12121 ++ * under RCU, so the storage for bfqd is assured to be safe as long
12122 ++ * as the RCU read side critical section does not end. After the
12123 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12124 ++ * sure that no other writer accessed it. If we raced with a writer,
12125 ++ * the function returns NULL, with the queue unlocked, otherwise it
12126 ++ * returns the dereferenced pointer, with the queue locked.
12127 ++ */
12128 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12129 ++ unsigned long *flags)
12130 ++{
12131 ++ struct bfq_data *bfqd;
12132 ++
12133 ++ rcu_read_lock();
12134 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12135 ++
12136 ++ if (bfqd != NULL) {
12137 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12138 ++ if (*ptr == bfqd)
12139 ++ goto out;
12140 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12141 ++ }
12142 ++
12143 ++ bfqd = NULL;
12144 ++out:
12145 ++ rcu_read_unlock();
12146 ++ return bfqd;
12147 ++}
12148 ++
12149 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12150 ++ unsigned long *flags)
12151 ++{
12152 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12153 ++}
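/*
 * User-space sketch (pthreads, made-up names) of the lock-and-recheck idea
 * behind bfq_get_bfqd_locked(): read a shared pointer, take the lock of the
 * object it points to, then re-read the pointer to be sure no writer swapped
 * it in the meantime; on a race, drop the lock and report NULL.  The kernel
 * version relies on RCU to keep the object alive across this window, which
 * a plain mutex cannot reproduce and is omitted here.
 */
#include <pthread.h>
#include <stddef.h>

struct toy_bfqd {
	pthread_mutex_t lock;
	int data;
};

static struct toy_bfqd *shared_ptr;	/* updated elsewhere by a writer */

static struct toy_bfqd *toy_get_locked(struct toy_bfqd **pptr)
{
	struct toy_bfqd *d = *pptr;

	if (d != NULL) {
		pthread_mutex_lock(&d->lock);
		if (*pptr == d)
			return d;	/* still current: return it locked */
		pthread_mutex_unlock(&d->lock);	/* raced with a writer */
	}
	return NULL;
}

int main(void)
{
	static struct toy_bfqd d = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct toy_bfqd *got;

	shared_ptr = &d;
	got = toy_get_locked(&shared_ptr);
	if (got)
		pthread_mutex_unlock(&got->lock);
	return got ? 0 : 1;
}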
12154 ++
12155 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12156 ++static void bfq_put_queue(struct bfq_queue *bfqq);
12157 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12158 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12159 ++ struct bfq_group *bfqg, int is_sync,
12160 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
12161 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12162 ++ struct bfq_group *bfqg);
12163 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12164 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12165 ++#endif
12166 +--
12167 +1.9.0
12168 +