1 |
commit: c1e582586d398b4452f568240985247294f645ef |
2 |
Author: William Hubbs <w.d.hubbs <AT> gmail <DOT> com> |
3 |
AuthorDate: Tue Oct 9 22:49:02 2018 +0000 |
4 |
Commit: William Hubbs <williamh <AT> gentoo <DOT> org> |
5 |
CommitDate: Tue Oct 23 18:38:14 2018 +0000 |
6 |
URL: https://gitweb.gentoo.org/proj/openrc.git/commit/?id=c1e58258 |
7 |
|
8 |
supervise-daemon: add health checks |
9 |
|
10 |
Health checks are a way to monitor a service and make sure it stays |
11 |
healthy. |
12 |
|
13 |
If a service is not healthy, it will be automatically restarted after |
14 |
running the unhealthy() function to clean up. |
15 |
|
16 |
NEWS.md | 4 ++ |
17 |
man/supervise-daemon.8 | 9 +++ |
18 |
sh/supervise-daemon.sh | 14 +++++ |
19 |
src/rc/Makefile | 2 +- |
20 |
src/rc/supervise-daemon.c | 136 +++++++++++++++++++++++++++++++++++----------- |
21 |
supervise-daemon-guide.md | 36 ++++++++++++ |
22 |
6 files changed, 169 insertions(+), 32 deletions(-) |
23 |
|
24 |
diff --git a/NEWS.md b/NEWS.md |
25 |
index d4d96577..f1400197 100644 |
26 |
--- a/NEWS.md |
27 |
+++ b/NEWS.md |
28 |
@@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to |
29 |
openrc-shutdown. Shutdowns can now be delayed for a certain amount of |
30 |
time or scheduled for an exact time. |
31 |
|
32 |
+supervise-daemon supports health checks, which are a periodic way to make sure a |
33 |
+service is healthy. For more information on setting this up, please see |
34 |
+supervise-daemon-guide.md. |
35 |
+ |
36 |
## OpenRC 0.37 |
37 |
|
38 |
start-stop-daemon now supports logging stdout and stderr of daemons to |
39 |
|
40 |
diff --git a/man/supervise-daemon.8 b/man/supervise-daemon.8 |
41 |
index af06ee31..8bcd8b5c 100644 |
42 |
--- a/man/supervise-daemon.8 |
43 |
+++ b/man/supervise-daemon.8 |
44 |
@@ -16,6 +16,10 @@ |
45 |
.Nd starts a daemon and restarts it if it crashes |
46 |
.Sh SYNOPSIS |
47 |
.Nm |
48 |
+.Fl a , -healthcheck-timer |
49 |
+.Ar seconds |
50 |
+.Fl A , -healthcheck-delay |
51 |
+.Ar seconds |
52 |
.Fl D , -respawn-delay |
53 |
.Ar seconds |
54 |
.Fl d , -chdir |
55 |
@@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them. |
56 |
.Pp |
57 |
The options are as follows: |
58 |
.Bl -tag -width indent |
59 |
+.It Fl a , -healthcheck-timer Ar seconds |
60 |
+Run the healthcheck() command, possibly followed by the unhealthy() |
61 |
+command, every time this number of seconds passes. |
62 |
+.It Fl A , -healthcheck-delay Ar seconds |
63 |
+Wait this long before the first health check. |
64 |
.It Fl D , -respawn-delay Ar seconds |
65 |
wait this number of seconds before restarting a daemon after it crashes. |
66 |
The default is 0. |
67 |
|
68 |
diff --git a/sh/supervise-daemon.sh b/sh/supervise-daemon.sh |
69 |
index 80e0260c..73a70140 100644 |
70 |
--- a/sh/supervise-daemon.sh |
71 |
+++ b/sh/supervise-daemon.sh |
72 |
@@ -10,6 +10,8 @@ |
73 |
# This file may not be copied, modified, propagated, or distributed |
74 |
# except according to the terms contained in the LICENSE file. |
75 |
|
76 |
+extra_commands="healthcheck unhealthy ${extra_commands}" |
77 |
+ |
78 |
supervise_start() |
79 |
{ |
80 |
if [ -z "$command" ]; then |
81 |
@@ -32,6 +34,8 @@ supervise_start() |
82 |
${respawn_delay:+--respawn-delay} $respawn_delay \ |
83 |
${respawn_max:+--respawn-max} $respawn_max \ |
84 |
${respawn_period:+--respawn-period} $respawn_period \ |
85 |
+ ${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \ |
86 |
+ ${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \ |
87 |
${command_user+--user} $command_user \ |
88 |
${umask+--umask} $umask \ |
89 |
${supervise_daemon_args:-${start_stop_daemon_args}} \ |
90 |
@@ -98,3 +102,13 @@ supervise_status() |
91 |
return 3 |
92 |
fi |
93 |
} |
94 |
+ |
95 |
+healthcheck() |
96 |
+{ |
97 |
+ return 0 |
98 |
+} |
99 |
+ |
100 |
+unhealthy() |
101 |
+{ |
102 |
+ return 0 |
103 |
+} |
104 |
|
105 |
diff --git a/src/rc/Makefile b/src/rc/Makefile |
106 |
index 9ba240fa..ea4a8c81 100644 |
107 |
--- a/src/rc/Makefile |
108 |
+++ b/src/rc/Makefile |
109 |
@@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o |
110 |
start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o |
111 |
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD} |
112 |
|
113 |
-supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o |
114 |
+supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o |
115 |
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD} |
116 |
|
117 |
service_get_value service_set_value get_options save_options: do_value.o rc-misc.o |
118 |
|
119 |
diff --git a/src/rc/supervise-daemon.c b/src/rc/supervise-daemon.c |
120 |
index 27089152..883c738d 100644 |
121 |
--- a/src/rc/supervise-daemon.c |
122 |
+++ b/src/rc/supervise-daemon.c |
123 |
@@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL}; |
124 |
#include "queue.h" |
125 |
#include "rc.h" |
126 |
#include "rc-misc.h" |
127 |
+#include "rc-plugin.h" |
128 |
#include "rc-schedules.h" |
129 |
#include "_usage.h" |
130 |
#include "helpers.h" |
131 |
|
132 |
const char *applet = NULL; |
133 |
const char *extraopts = NULL; |
134 |
-const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \ |
135 |
+const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \ |
136 |
getoptstring_COMMON; |
137 |
const struct option longopts[] = { |
138 |
+ { "healthcheck-timer", 1, NULL, 'a'}, |
139 |
+ { "healthcheck-delay", 1, NULL, 'A'}, |
140 |
{ "respawn-delay", 1, NULL, 'D'}, |
141 |
{ "chdir", 1, NULL, 'd'}, |
142 |
{ "env", 1, NULL, 'e'}, |
143 |
@@ -91,6 +94,8 @@ const struct option longopts[] = { |
144 |
longopts_COMMON |
145 |
}; |
146 |
const char * const longopts_help[] = { |
147 |
+ "set an initial health check delay", |
148 |
+ "set a health check timer", |
149 |
"Set a respawn delay", |
150 |
"Change the PWD", |
151 |
"Set an environment string", |
152 |
@@ -113,6 +118,9 @@ const char * const longopts_help[] = { |
153 |
}; |
154 |
const char *usagestring = NULL; |
155 |
|
156 |
+static int healthcheckdelay = 0; |
157 |
+static int healthchecktimer = 0; |
158 |
+static volatile sig_atomic_t do_healthcheck = 0; |
159 |
static int nicelevel = 0; |
160 |
static int ionicec = -1; |
161 |
static int ioniced = 0; |
162 |
@@ -183,6 +191,12 @@ static void handle_signal(int sig) |
163 |
re_exec_supervisor(); |
164 |
} |
165 |
|
166 |
+static void healthcheck(int sig) |
167 |
+{ |
168 |
+ if (sig == SIGALRM) |
169 |
+ do_healthcheck = 1; |
170 |
+} |
171 |
+ |
172 |
static char * expand_home(const char *home, const char *path) |
173 |
{ |
174 |
char *opath, *ppath, *p, *nh; |
175 |
@@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv) |
176 |
static void supervisor(char *exec, char **argv) |
177 |
{ |
178 |
FILE *fp; |
179 |
+ pid_t wait_pid; |
180 |
int i; |
181 |
int nkilled; |
182 |
struct timespec ts; |
183 |
time_t respawn_now= 0; |
184 |
time_t first_spawn= 0; |
185 |
+ pid_t health_pid; |
186 |
+ int health_status; |
187 |
|
188 |
#ifndef RC_DEBUG |
189 |
signal_setup_restart(SIGHUP, handle_signal); |
190 |
@@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv) |
191 |
* Supervisor main loop |
192 |
*/ |
193 |
i = 0; |
194 |
+ if (healthcheckdelay) { |
195 |
+ signal_setup(SIGALRM, healthcheck); |
196 |
+ alarm(healthcheckdelay); |
197 |
+ } else if (healthchecktimer) { |
198 |
+ signal_setup(SIGALRM, healthcheck); |
199 |
+ alarm(healthchecktimer); |
200 |
+ } |
201 |
while (!exiting) { |
202 |
- wait(&i); |
203 |
- if (exiting) { |
204 |
- signal_setup(SIGCHLD, SIG_IGN); |
205 |
- syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); |
206 |
- nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0, |
207 |
- false, false, true); |
208 |
- if (nkilled > 0) |
209 |
- syslog(LOG_INFO, "killed %d processes", nkilled); |
210 |
- } else { |
211 |
- ts.tv_sec = respawn_delay; |
212 |
- ts.tv_nsec = 0; |
213 |
- nanosleep(&ts, NULL); |
214 |
- if (respawn_max > 0 && respawn_period > 0) { |
215 |
- respawn_now = time(NULL); |
216 |
- if (first_spawn == 0) |
217 |
- first_spawn = respawn_now; |
218 |
- if (respawn_now - first_spawn > respawn_period) { |
219 |
- respawn_count = 0; |
220 |
- first_spawn = 0; |
221 |
- } else |
222 |
- respawn_count++; |
223 |
- if (respawn_count > respawn_max) { |
224 |
- syslog(LOG_WARNING, |
225 |
- "respawned \"%s\" too many times, exiting", exec); |
226 |
- exiting = true; |
227 |
+ wait_pid = wait(&i); |
228 |
+ if (wait_pid == -1) { |
229 |
+ if (do_healthcheck) { |
230 |
+ do_healthcheck = 0; |
231 |
+ alarm(0); |
232 |
+ syslog(LOG_DEBUG, "running health check for %s", svcname); |
233 |
+ health_pid = exec_service(svcname, "healthcheck"); |
234 |
+ health_status = rc_waitpid(health_pid); |
235 |
+ if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) { |
236 |
+ alarm(healthchecktimer); |
237 |
continue; |
238 |
+ } else { |
239 |
+ syslog(LOG_WARNING, "health check for %s failed", svcname); |
240 |
+ health_pid = exec_service(svcname, "unhealthy"); |
241 |
+ rc_waitpid(health_pid); |
242 |
+ syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); |
243 |
+ nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0, |
244 |
+ false, false, true); |
245 |
+ if (nkilled > 0) |
246 |
+ syslog(LOG_INFO, "killed %d processes", nkilled); |
247 |
+ else if (errno != 0) |
248 |
+ syslog(LOG_INFO, "Unable to kill %d: %s", |
249 |
+ child_pid, strerror(errno)); |
250 |
} |
251 |
+ } else if (exiting ) { |
252 |
+ alarm(0); |
253 |
+ syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); |
254 |
+ nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0, |
255 |
+ false, false, true); |
256 |
+ if (nkilled > 0) |
257 |
+ syslog(LOG_INFO, "killed %d processes", nkilled); |
258 |
+ continue; |
259 |
} |
260 |
+ } else if (wait_pid == child_pid) { |
261 |
if (WIFEXITED(i)) |
262 |
syslog(LOG_WARNING, "%s, pid %d, exited with return code %d", |
263 |
exec, child_pid, WEXITSTATUS(i)); |
264 |
else if (WIFSIGNALED(i)) |
265 |
syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d", |
266 |
exec, child_pid, WTERMSIG(i)); |
267 |
- child_pid = fork(); |
268 |
- if (child_pid == -1) |
269 |
- eerrorx("%s: fork: %s", applet, strerror(errno)); |
270 |
- if (child_pid == 0) |
271 |
- child_process(exec, argv); |
272 |
+ } else |
273 |
+ continue; |
274 |
+ |
275 |
+ ts.tv_sec = respawn_delay; |
276 |
+ ts.tv_nsec = 0; |
277 |
+ nanosleep(&ts, NULL); |
278 |
+ if (respawn_max > 0 && respawn_period > 0) { |
279 |
+ respawn_now = time(NULL); |
280 |
+ if (first_spawn == 0) |
281 |
+ first_spawn = respawn_now; |
282 |
+ if (respawn_now - first_spawn > respawn_period) { |
283 |
+ respawn_count = 0; |
284 |
+ first_spawn = 0; |
285 |
+ } else |
286 |
+ respawn_count++; |
287 |
+ if (respawn_count > respawn_max) { |
288 |
+ syslog(LOG_WARNING, |
289 |
+ "respawned \"%s\" too many times, exiting", exec); |
290 |
+ exiting = true; |
291 |
+ continue; |
292 |
+ } |
293 |
+ } |
294 |
+ alarm(0); |
295 |
+ child_pid = fork(); |
296 |
+ if (child_pid == -1) |
297 |
+ eerrorx("%s: fork: %s", applet, strerror(errno)); |
298 |
+ if (child_pid == 0) |
299 |
+ child_process(exec, argv); |
300 |
+ if (healthcheckdelay) { |
301 |
+ signal_setup(SIGALRM, healthcheck); |
302 |
+ alarm(healthcheckdelay); |
303 |
+ } else if (healthchecktimer) { |
304 |
+ signal_setup(SIGALRM, healthcheck); |
305 |
+ alarm(healthchecktimer); |
306 |
} |
307 |
} |
308 |
|
309 |
@@ -612,6 +671,16 @@ int main(int argc, char **argv) |
310 |
while ((opt = getopt_long(argc, argv, getoptstring, longopts, |
311 |
(int *) 0)) != -1) |
312 |
switch (opt) { |
313 |
+ case 'a': /* --healthcheck-timer <time> */ |
314 |
+ if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1) |
315 |
+ eerrorx("%s: invalid health check timer %s", applet, optarg); |
316 |
+ break; |
317 |
+ |
318 |
+ case 'A': /* --healthcheck-delay <time> */ |
319 |
+ if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1) |
320 |
+ eerrorx("%s: invalid health check delay %s", applet, optarg); |
321 |
+ break; |
322 |
+ |
323 |
case 'D': /* --respawn-delay time */ |
324 |
n = sscanf(optarg, "%d", &respawn_delay); |
325 |
if (n != 1 || respawn_delay < 1) |
326 |
@@ -668,6 +737,11 @@ int main(int argc, char **argv) |
327 |
gid = gr->gr_gid; |
328 |
break; |
329 |
|
330 |
+ case 'H': /* --healthcheck-timer <minutes> */ |
331 |
+ if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1) |
332 |
+ eerrorx("%s: invalid health check timer %s", applet, optarg); |
333 |
+ break; |
334 |
+ |
335 |
case 'k': |
336 |
if (parse_mode(&numask, optarg)) |
337 |
eerrorx("%s: invalid mode `%s'", |
338 |
|
339 |
diff --git a/supervise-daemon-guide.md b/supervise-daemon-guide.md |
340 |
index 0b15a858..07ab55cf 100644 |
341 |
--- a/supervise-daemon-guide.md |
342 |
+++ b/supervise-daemon-guide.md |
343 |
@@ -22,6 +22,28 @@ The following is a brief guide on using this capability. |
344 |
instructs it not to fork to the command_args_foreground variable shown |
345 |
below. |
346 |
|
347 |
+# Health Checks |
348 |
+ |
349 |
+Health checks are a way to make sure a service monitored by |
350 |
+supervise-daemon stays healthy. To configure a health check for a |
351 |
+service, you need to write a healthcheck() function, and optionally an |
352 |
+unhealthy() function in the service script. Also, you will need to set |
353 |
+the healthcheck_timer and optionally healthcheck_delay variables. |
354 |
+ |
355 |
+## healthcheck() function |
356 |
+ |
357 |
+The healthcheck() function is run repeatedly based on the settings of |
358 |
+the healthcheck_* variables. This function should return zero if the |
359 |
+service is currently healthy or non-zero otherwise. |
360 |
+ |
361 |
+## unhealthy() function |
362 |
+ |
363 |
+If the healthcheck() function returns non-zero, the unhealthy() function |
364 |
+is run, then the service is restarted. Since the service will be |
365 |
+restarted by the supervisor, the unhealthy function should not try to |
366 |
+restart it; the purpose of the function is to allow any cleanup tasks |
367 |
+other than restarting the service to be run. |
368 |
+ |
369 |
# Variable Settings |
370 |
|
371 |
The most important setting is the supervisor variable. At the top of |
372 |
@@ -52,6 +74,20 @@ This should be used if the daemon you want to monitor |
373 |
forks and goes to the background by default. This should be set to the |
374 |
command line option that instructs the daemon to stay in the foreground. |
375 |
|
376 |
+``` sh |
377 |
+healthcheck_delay=seconds |
378 |
+``` |
379 |
+ |
380 |
+This is the delay, in seconds, before the first health check is run. |
381 |
+If it is not set, we use the value of healthcheck_timer. |
382 |
+ |
383 |
+``` sh |
384 |
+healthcheck_timer=seconds |
385 |
+``` |
386 |
+ |
387 |
+This is the number of seconds between health checks. If it is not set, |
388 |
+no health checks will be run. |
389 |
+ |
390 |
``` sh |
391 |
respawn_delay |
392 |
``` |