[gentoo-commits] proj/openrc:master commit in: /, src/rc/, man/, sh/ - gentoo-commits

From:	William Hubbs <williamh@g.o>
To:	gentoo-commits@l.g.o
Subject:	[gentoo-commits] proj/openrc:master commit in: /, src/rc/, man/, sh/
Date:	Tue, 23 Oct 2018 22:00:25
Message-Id:	`1540319894.c1e582586d398b4452f568240985247294f645ef.williamh@OpenRC`

1

commit:     c1e582586d398b4452f568240985247294f645ef

2

Author:     William Hubbs <w.d.hubbs <AT> gmail <DOT> com>

3

AuthorDate: Tue Oct  9 22:49:02 2018 +0000

4

Commit:     William Hubbs <williamh <AT> gentoo <DOT> org>

5

CommitDate: Tue Oct 23 18:38:14 2018 +0000

6

URL:        https://gitweb.gentoo.org/proj/openrc.git/commit/?id=c1e58258

7

8

supervise-daemon: add health checks

9

10

Health checks are a way to monitor a service and make sure it stays

11

healthy.

12

13

If a service is not healthy, it will be automatically restarted after

14

running the unhealthy() function to clean up.

15

16

 NEWS.md                   |   4 ++

17

 man/supervise-daemon.8    |   9 +++

18

 sh/supervise-daemon.sh    |  14 +++++

19

 src/rc/Makefile           |   2 +-

20

 src/rc/supervise-daemon.c | 136 +++++++++++++++++++++++++++++++++++-----------

21

 supervise-daemon-guide.md |  36 ++++++++++++

22

 6 files changed, 169 insertions(+), 32 deletions(-)

23

24

diff --git a/NEWS.md b/NEWS.md

25

index d4d96577..f1400197 100644

26

--- a/NEWS.md

27

+++ b/NEWS.md

28

@@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to

29

 openrc-shutdown. Shutdowns can now be delayed for a certain amount of

30

 time or scheduled for an exact time.

31

32

+supervise-daemon supports health checks, which are a periodic way to make sure a

33

+service is healthy. For more information on setting this up, please see

34

+supervise-daemon-guide.md.

35

+

36

 ## OpenRC 0.37

37

38

 start-stop-daemon now supports logging stdout and stderr of daemons to

39

40

diff --git a/man/supervise-daemon.8 b/man/supervise-daemon.8

41

index af06ee31..8bcd8b5c 100644

42

--- a/man/supervise-daemon.8

43

+++ b/man/supervise-daemon.8

44

@@ -16,6 +16,10 @@

45

 .Nd starts a daemon and restarts it if it crashes

46

 .Sh SYNOPSIS

47

.Nm

48

+.Fl a , -healthcheck-timer

49

+.Ar seconds

50

+.Fl A , -healthcheck-delay

51

+.Ar seconds

52

 .Fl D , -respawn-delay

53

 .Ar seconds

54

 .Fl d , -chdir

55

@@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them.

56

.Pp

57

 The options are as follows:

58

 .Bl -tag -width indent

59

+.Fl a , -healthcheck-timer Ar seconds

60

+Run the healthcheck() command, possibly followed by the unhealthy()

61

+command every time this number of seconds passes.

62

+.Fl A , -healthcheck-delay Ar seconds

63

+Wait this long before the first health check.

64

 .It Fl D , -respawn-delay Ar seconds

65

 wait this number of seconds before restarting a daemon after it crashes.

66

 The default is 0.

67

68

diff --git a/sh/supervise-daemon.sh b/sh/supervise-daemon.sh

69

index 80e0260c..73a70140 100644

70

--- a/sh/supervise-daemon.sh

71

+++ b/sh/supervise-daemon.sh

72

@@ -10,6 +10,8 @@

73

 # This file may not be copied, modified, propagated, or distributed

74

 #    except according to the terms contained in the LICENSE file.

75

76

+extra_commands="healthcheck unhealthy ${extra_commands}"

77

+

78

 supervise_start()

79

{

80

 	if [ -z "$command" ]; then

81

@@ -32,6 +34,8 @@ supervise_start()

82

 		${respawn_delay:+--respawn-delay} $respawn_delay \

83

 		${respawn_max:+--respawn-max} $respawn_max \

84

 		${respawn_period:+--respawn-period} $respawn_period \

85

+		${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \

86

+		${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \

87

 		${command_user+--user} $command_user \

88

 		${umask+--umask} $umask \

89

 		${supervise_daemon_args:-${start_stop_daemon_args}} \

90

@@ -98,3 +102,13 @@ supervise_status()

91

 		return 3

92

fi

93

}

94

+

95

+healthcheck()

96

+{

97

+	return 0

98

+}

99

+

100

+unhealthy()

101

+{

102

+	return 0

103

+}

104

105

diff --git a/src/rc/Makefile b/src/rc/Makefile

106

index 9ba240fa..ea4a8c81 100644

107

--- a/src/rc/Makefile

108

+++ b/src/rc/Makefile

109

@@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o

110

 start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o

111

 	${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}

112

113

-supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o

114

+supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o

115

 	${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}

116

117

 service_get_value service_set_value get_options save_options: do_value.o rc-misc.o

118

119

diff --git a/src/rc/supervise-daemon.c b/src/rc/supervise-daemon.c

120

index 27089152..883c738d 100644

121

--- a/src/rc/supervise-daemon.c

122

+++ b/src/rc/supervise-daemon.c

123

@@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL};

124

 #include "queue.h"

125

 #include "rc.h"

126

 #include "rc-misc.h"

127

+#include "rc-plugin.h"

128

 #include "rc-schedules.h"

129

 #include "_usage.h"

130

 #include "helpers.h"

131

132

 const char *applet = NULL;

133

 const char *extraopts = NULL;

134

-const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \

135

+const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \

136

 	getoptstring_COMMON;

137

 const struct option longopts[] = {

138

+	{ "healthcheck-timer",        1, NULL, 'a'},

139

+	{ "healthcheck-delay",        1, NULL, 'A'},

140

 	{ "respawn-delay",        1, NULL, 'D'},

141

 	{ "chdir",        1, NULL, 'd'},

142

 	{ "env",          1, NULL, 'e'},

143

@@ -91,6 +94,8 @@ const struct option longopts[] = {

144

 	longopts_COMMON

145

};

146

 const char * const longopts_help[] = {

147

+	"set an initial health check delay",

148

+	"set a health check timer",

149

 	"Set a respawn delay",

150

 	"Change the PWD",

151

 	"Set an environment string",

152

@@ -113,6 +118,9 @@ const char * const longopts_help[] = {

153

};

154

 const char *usagestring = NULL;

155

156

+static int healthcheckdelay = 0;

157

+static int healthchecktimer = 0;

158

+static volatile sig_atomic_t do_healthcheck = 0;

159

 static int nicelevel = 0;

160

 static int ionicec = -1;

161

 static int ioniced = 0;

162

@@ -183,6 +191,12 @@ static void handle_signal(int sig)

163

 		re_exec_supervisor();

164

}

165

166

+static void healthcheck(int sig)

167

+{

168

+	if (sig == SIGALRM)

169

+		do_healthcheck = 1;

170

+}

171

+

172

 static char * expand_home(const char *home, const char *path)

173

{

174

 	char *opath, *ppath, *p, *nh;

175

@@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv)

176

 static void supervisor(char *exec, char **argv)

177

{

178

 	FILE *fp;

179

+	pid_t wait_pid;

180

 	int i;

181

 	int nkilled;

182

 	struct timespec ts;

183

 	time_t respawn_now= 0;

184

 	time_t first_spawn= 0;

185

+	pid_t health_pid;

186

+	int health_status;

187

188

 #ifndef RC_DEBUG

189

 	signal_setup_restart(SIGHUP, handle_signal);

190

@@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv)

191

 	 * Supervisor main loop

192

*/

193

 	i = 0;

194

+	if (healthcheckdelay) {

195

+		signal_setup(SIGALRM, healthcheck);

196

+		alarm(healthcheckdelay);

197

+	} else if (healthchecktimer) {

198

+		signal_setup(SIGALRM, healthcheck);

199

+		alarm(healthchecktimer);

200

+	}

201

 	while (!exiting) {

202

-		wait(&i);

203

-		if (exiting) {

204

-			signal_setup(SIGCHLD, SIG_IGN);

205

-			syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);

206

-			nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,

207

-					false, false, true);

208

-			if (nkilled > 0)

209

-				syslog(LOG_INFO, "killed %d processes", nkilled);

210

-		} else {

211

-			ts.tv_sec = respawn_delay;

212

-			ts.tv_nsec = 0;

213

-			nanosleep(&ts, NULL);

214

-			if (respawn_max > 0 && respawn_period > 0) {

215

-				respawn_now = time(NULL);

216

-				if (first_spawn == 0)

217

-					first_spawn = respawn_now;

218

-				if (respawn_now - first_spawn > respawn_period) {

219

-					respawn_count = 0;

220

-					first_spawn = 0;

221

-				} else

222

-					respawn_count++;

223

-				if (respawn_count > respawn_max) {

224

-					syslog(LOG_WARNING,

225

-							"respawned \"%s\" too many times, exiting", exec);

226

-					exiting = true;

227

+		wait_pid = wait(&i);

228

+		if (wait_pid == -1) {

229

+			if (do_healthcheck) {

230

+				do_healthcheck = 0;

231

+				alarm(0);

232

+				syslog(LOG_DEBUG, "running health check for %s", svcname);

233

+				health_pid = exec_service(svcname, "healthcheck");

234

+				health_status = rc_waitpid(health_pid);

235

+				if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) {

236

+					alarm(healthchecktimer);

237

 					continue;

238

+				} else {

239

+					syslog(LOG_WARNING, "health check for %s failed", svcname);

240

+					health_pid = exec_service(svcname, "unhealthy");

241

+					rc_waitpid(health_pid);

242

+					syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);

243

+					nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0,

244

+							false, false, true);

245

+					if (nkilled > 0)

246

+						syslog(LOG_INFO, "killed %d processes", nkilled);

247

+					else if (errno != 0)

248

+						syslog(LOG_INFO, "Unable to kill %d: %s",

249

+								child_pid, strerror(errno));

250

}

251

+			} else if (exiting ) {

252

+				alarm(0);

253

+				syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);

254

+				nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,

255

+						false, false, true);

256

+				if (nkilled > 0)

257

+					syslog(LOG_INFO, "killed %d processes", nkilled);

258

+				continue;

259

}

260

+		} else if (wait_pid == child_pid) {

261

 			if (WIFEXITED(i))

262

 				syslog(LOG_WARNING, "%s, pid %d, exited with return code %d",

263

 						exec, child_pid, WEXITSTATUS(i));

264

 			else if (WIFSIGNALED(i))

265

 				syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d",

266

 						exec, child_pid, WTERMSIG(i));

267

-			child_pid = fork();

268

-			if (child_pid == -1)

269

-				eerrorx("%s: fork: %s", applet, strerror(errno));

270

-			if (child_pid == 0)

271

-				child_process(exec, argv);

272

+		} else

273

+			continue;

274

+

275

+		ts.tv_sec = respawn_delay;

276

+		ts.tv_nsec = 0;

277

+		nanosleep(&ts, NULL);

278

+		if (respawn_max > 0 && respawn_period > 0) {

279

+			respawn_now = time(NULL);

280

+			if (first_spawn == 0)

281

+				first_spawn = respawn_now;

282

+			if (respawn_now - first_spawn > respawn_period) {

283

+				respawn_count = 0;

284

+				first_spawn = 0;

285

+			} else

286

+				respawn_count++;

287

+			if (respawn_count > respawn_max) {

288

+				syslog(LOG_WARNING,

289

+						"respawned \"%s\" too many times, exiting", exec);

290

+				exiting = true;

291

+				continue;

292

+			}

293

+		}

294

+		alarm(0);

295

+		child_pid = fork();

296

+		if (child_pid == -1)

297

+			eerrorx("%s: fork: %s", applet, strerror(errno));

298

+		if (child_pid == 0)

299

+			child_process(exec, argv);

300

+		if (healthcheckdelay) {

301

+			signal_setup(SIGALRM, healthcheck);

302

+			alarm(healthcheckdelay);

303

+		} else if (healthchecktimer) {

304

+			signal_setup(SIGALRM, healthcheck);

305

+			alarm(healthchecktimer);

306

}

307

}

308

309

@@ -612,6 +671,16 @@ int main(int argc, char **argv)

310

 	while ((opt = getopt_long(argc, argv, getoptstring, longopts,

311

 		    (int *) 0)) != -1)

312

 		switch (opt) {

313

+		case 'a':  /* --healthcheck-timer <time> */

314

+			if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)

315

+				eerrorx("%s: invalid health check timer %s", applet, optarg);

316

+			break;

317

+

318

+		case 'A':  /* --healthcheck-delay <time> */

319

+			if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1)

320

+				eerrorx("%s: invalid health check delay %s", applet, optarg);

321

+			break;

322

+

323

 		case 'D':  /* --respawn-delay time */

324

 			n = sscanf(optarg, "%d", &respawn_delay);

325

 			if (n	!= 1 || respawn_delay < 1)

326

@@ -668,6 +737,11 @@ int main(int argc, char **argv)

327

 			gid = gr->gr_gid;

328

 			break;

329

330

+		case 'H':  /* --healthcheck-timer <minutes> */

331

+			if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)

332

+				eerrorx("%s: invalid health check timer %s", applet, optarg);

333

+			break;

334

+

335

 		case 'k':

336

 			if (parse_mode(&numask, optarg))

337

 				eerrorx("%s: invalid mode `%s'",

338

339

diff --git a/supervise-daemon-guide.md b/supervise-daemon-guide.md

340

index 0b15a858..07ab55cf 100644

341

--- a/supervise-daemon-guide.md

342

+++ b/supervise-daemon-guide.md

343

@@ -22,6 +22,28 @@ The following is a brief guide on using this capability.

344

   instructs it not to fork to the command_args_foreground variable shown

345

   below.

346

347

+# Health Checks

348

+

349

+Health checks are a way to make sure a service monitored by

350

+supervise-daemon stays healthy. To configure a health check for a

351

+service, you need to write a healthcheck() function, and optionally an

352

+unhealthy() function in the service script. Also, you will need to set

353

+the healthcheck_timer and optionally healthcheck_delay variables.

354

+

355

+## healthcheck() function

356

+

357

+The healthcheck() function is run repeatedly based on the settings of

358

+the healthcheck_* variables. This function should return zero if the

359

+service is currently healthy or non-zero otherwise.

360

+

361

+## unhealthy() function

362

+

363

+If the healthcheck() function returns non-zero, the unhealthy() function

364

+is run, then the service is restarted. Since the service will be

365

+restarted by the supervisor, the unhealthy function should not try to

366

+restart it; the purpose of the function is to allow any cleanup tasks

367

+other than restarting the service to be run.

368

+

369

 # Variable Settings

370

371

 The most important setting is the supervisor variable. At the top of

372

@@ -52,6 +74,20 @@ This 	should be used if the daemon you want to monitor

373

 forks and goes to the background by default. This should be set to the

374

 command line option that instructs the daemon to stay in the foreground.

375

376

+``` sh

377

+healthcheck_delay=seconds

378

+```

379

+

380

+This is the delay, in seconds, before the first health check is run.

381

+If it is not set, we use the value of healthcheck_timer.

382

+

383

+``` sh

384

+healthcheck_timer=seconds

385

+```

386

+

387

+This is the  number of seconds between health checks. If it is not set,

388

+no health checks will be run.

389

+

390

 ``` sh

391

 respawn_delay

392

```

1	commit: c1e582586d398b4452f568240985247294f645ef
2	Author: William Hubbs <w.d.hubbs <AT> gmail <DOT> com>
3	AuthorDate: Tue Oct 9 22:49:02 2018 +0000
4	Commit: William Hubbs <williamh <AT> gentoo <DOT> org>
5	CommitDate: Tue Oct 23 18:38:14 2018 +0000
6	URL: https://gitweb.gentoo.org/proj/openrc.git/commit/?id=c1e58258
7
8	supervise-daemon: add health checks
9
10	Health checks are a way to monitor a service and make sure it stays
11	healthy.
12
13	If a service is not healthy, it will be automatically restarted after
14	running the unhealthy() function to clean up.
15
16	NEWS.md | 4 ++
17	man/supervise-daemon.8 | 9 +++
18	sh/supervise-daemon.sh | 14 +++++
19	src/rc/Makefile | 2 +-
20	src/rc/supervise-daemon.c | 136 +++++++++++++++++++++++++++++++++++-----------
21	supervise-daemon-guide.md | 36 ++++++++++++
22	6 files changed, 169 insertions(+), 32 deletions(-)
23
24	diff --git a/NEWS.md b/NEWS.md
25	index d4d96577..f1400197 100644
26	--- a/NEWS.md
27	+++ b/NEWS.md
28	@@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to
29	openrc-shutdown. Shutdowns can now be delayed for a certain amount of
30	time or scheduled for an exact time.
31
32	+supervise-daemon supports health checks, which are a periodic way to make sure a
33	+service is healthy. For more information on setting this up, please see
34	+supervise-daemon-guide.md.
35	+
36	## OpenRC 0.37
37
38	start-stop-daemon now supports logging stdout and stderr of daemons to
39
40	diff --git a/man/supervise-daemon.8 b/man/supervise-daemon.8
41	index af06ee31..8bcd8b5c 100644
42	--- a/man/supervise-daemon.8
43	+++ b/man/supervise-daemon.8
44	@@ -16,6 +16,10 @@
45	.Nd starts a daemon and restarts it if it crashes
46	.Sh SYNOPSIS
47	.Nm
48	+.Fl a , -healthcheck-timer
49	+.Ar seconds
50	+.Fl A , -healthcheck-delay
51	+.Ar seconds
52	.Fl D , -respawn-delay
53	.Ar seconds
54	.Fl d , -chdir
55	@@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them.
56	.Pp
57	The options are as follows:
58	.Bl -tag -width indent
59	+.Fl a , -healthcheck-timer Ar seconds
60	+Run the healthcheck() command, possibly followed by the unhealthy()
61	+command every time this number of seconds passes.
62	+.Fl A , -healthcheck-delay Ar seconds
63	+Wait this long before the first health check.
64	.It Fl D , -respawn-delay Ar seconds
65	wait this number of seconds before restarting a daemon after it crashes.
66	The default is 0.
67
68	diff --git a/sh/supervise-daemon.sh b/sh/supervise-daemon.sh
69	index 80e0260c..73a70140 100644
70	--- a/sh/supervise-daemon.sh
71	+++ b/sh/supervise-daemon.sh
72	@@ -10,6 +10,8 @@
73	# This file may not be copied, modified, propagated, or distributed
74	# except according to the terms contained in the LICENSE file.
75
76	+extra_commands="healthcheck unhealthy ${extra_commands}"
77	+
78	supervise_start()
79	{
80	if [ -z "$command" ]; then
81	@@ -32,6 +34,8 @@ supervise_start()
82	${respawn_delay:+--respawn-delay} $respawn_delay \
83	${respawn_max:+--respawn-max} $respawn_max \
84	${respawn_period:+--respawn-period} $respawn_period \
85	+ ${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \
86	+ ${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \
87	${command_user+--user} $command_user \
88	${umask+--umask} $umask \
89	${supervise_daemon_args:-${start_stop_daemon_args}} \
90	@@ -98,3 +102,13 @@ supervise_status()
91	return 3
92	fi
93	}
94	+
95	+healthcheck()
96	+{
97	+ return 0
98	+}
99	+
100	+unhealthy()
101	+{
102	+ return 0
103	+}
104
105	diff --git a/src/rc/Makefile b/src/rc/Makefile
106	index 9ba240fa..ea4a8c81 100644
107	--- a/src/rc/Makefile
108	+++ b/src/rc/Makefile
109	@@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o
110	start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o
111	${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
112
113	-supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o
114	+supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o
115	${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
116
117	service_get_value service_set_value get_options save_options: do_value.o rc-misc.o
118
119	diff --git a/src/rc/supervise-daemon.c b/src/rc/supervise-daemon.c
120	index 27089152..883c738d 100644
121	--- a/src/rc/supervise-daemon.c
122	+++ b/src/rc/supervise-daemon.c
123	@@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL};
124	#include "queue.h"
125	#include "rc.h"
126	#include "rc-misc.h"
127	+#include "rc-plugin.h"
128	#include "rc-schedules.h"
129	#include "_usage.h"
130	#include "helpers.h"
131
132	const char *applet = NULL;
133	const char *extraopts = NULL;
134	-const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \
135	+const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \
136	getoptstring_COMMON;
137	const struct option longopts[] = {
138	+ { "healthcheck-timer", 1, NULL, 'a'},
139	+ { "healthcheck-delay", 1, NULL, 'A'},
140	{ "respawn-delay", 1, NULL, 'D'},
141	{ "chdir", 1, NULL, 'd'},
142	{ "env", 1, NULL, 'e'},
143	@@ -91,6 +94,8 @@ const struct option longopts[] = {
144	longopts_COMMON
145	};
146	const char * const longopts_help[] = {
147	+ "set an initial health check delay",
148	+ "set a health check timer",
149	"Set a respawn delay",
150	"Change the PWD",
151	"Set an environment string",
152	@@ -113,6 +118,9 @@ const char * const longopts_help[] = {
153	};
154	const char *usagestring = NULL;
155
156	+static int healthcheckdelay = 0;
157	+static int healthchecktimer = 0;
158	+static volatile sig_atomic_t do_healthcheck = 0;
159	static int nicelevel = 0;
160	static int ionicec = -1;
161	static int ioniced = 0;
162	@@ -183,6 +191,12 @@ static void handle_signal(int sig)
163	re_exec_supervisor();
164	}
165
166	+static void healthcheck(int sig)
167	+{
168	+ if (sig == SIGALRM)
169	+ do_healthcheck = 1;
170	+}
171	+
172	static char * expand_home(const char *home, const char *path)
173	{
174	char *opath, *ppath, *p, *nh;
175	@@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv)
176	static void supervisor(char *exec, char **argv)
177	{
178	FILE *fp;
179	+ pid_t wait_pid;
180	int i;
181	int nkilled;
182	struct timespec ts;
183	time_t respawn_now= 0;
184	time_t first_spawn= 0;
185	+ pid_t health_pid;
186	+ int health_status;
187
188	#ifndef RC_DEBUG
189	signal_setup_restart(SIGHUP, handle_signal);
190	@@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv)
191	* Supervisor main loop
192	*/
193	i = 0;
194	+ if (healthcheckdelay) {
195	+ signal_setup(SIGALRM, healthcheck);
196	+ alarm(healthcheckdelay);
197	+ } else if (healthchecktimer) {
198	+ signal_setup(SIGALRM, healthcheck);
199	+ alarm(healthchecktimer);
200	+ }
201	while (!exiting) {
202	- wait(&i);
203	- if (exiting) {
204	- signal_setup(SIGCHLD, SIG_IGN);
205	- syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
206	- nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
207	- false, false, true);
208	- if (nkilled > 0)
209	- syslog(LOG_INFO, "killed %d processes", nkilled);
210	- } else {
211	- ts.tv_sec = respawn_delay;
212	- ts.tv_nsec = 0;
213	- nanosleep(&ts, NULL);
214	- if (respawn_max > 0 && respawn_period > 0) {
215	- respawn_now = time(NULL);
216	- if (first_spawn == 0)
217	- first_spawn = respawn_now;
218	- if (respawn_now - first_spawn > respawn_period) {
219	- respawn_count = 0;
220	- first_spawn = 0;
221	- } else
222	- respawn_count++;
223	- if (respawn_count > respawn_max) {
224	- syslog(LOG_WARNING,
225	- "respawned \"%s\" too many times, exiting", exec);
226	- exiting = true;
227	+ wait_pid = wait(&i);
228	+ if (wait_pid == -1) {
229	+ if (do_healthcheck) {
230	+ do_healthcheck = 0;
231	+ alarm(0);
232	+ syslog(LOG_DEBUG, "running health check for %s", svcname);
233	+ health_pid = exec_service(svcname, "healthcheck");
234	+ health_status = rc_waitpid(health_pid);
235	+ if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) {
236	+ alarm(healthchecktimer);
237	continue;
238	+ } else {
239	+ syslog(LOG_WARNING, "health check for %s failed", svcname);
240	+ health_pid = exec_service(svcname, "unhealthy");
241	+ rc_waitpid(health_pid);
242	+ syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
243	+ nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0,
244	+ false, false, true);
245	+ if (nkilled > 0)
246	+ syslog(LOG_INFO, "killed %d processes", nkilled);
247	+ else if (errno != 0)
248	+ syslog(LOG_INFO, "Unable to kill %d: %s",
249	+ child_pid, strerror(errno));
250	}
251	+ } else if (exiting ) {
252	+ alarm(0);
253	+ syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
254	+ nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
255	+ false, false, true);
256	+ if (nkilled > 0)
257	+ syslog(LOG_INFO, "killed %d processes", nkilled);
258	+ continue;
259	}
260	+ } else if (wait_pid == child_pid) {
261	if (WIFEXITED(i))
262	syslog(LOG_WARNING, "%s, pid %d, exited with return code %d",
263	exec, child_pid, WEXITSTATUS(i));
264	else if (WIFSIGNALED(i))
265	syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d",
266	exec, child_pid, WTERMSIG(i));
267	- child_pid = fork();
268	- if (child_pid == -1)
269	- eerrorx("%s: fork: %s", applet, strerror(errno));
270	- if (child_pid == 0)
271	- child_process(exec, argv);
272	+ } else
273	+ continue;
274	+
275	+ ts.tv_sec = respawn_delay;
276	+ ts.tv_nsec = 0;
277	+ nanosleep(&ts, NULL);
278	+ if (respawn_max > 0 && respawn_period > 0) {
279	+ respawn_now = time(NULL);
280	+ if (first_spawn == 0)
281	+ first_spawn = respawn_now;
282	+ if (respawn_now - first_spawn > respawn_period) {
283	+ respawn_count = 0;
284	+ first_spawn = 0;
285	+ } else
286	+ respawn_count++;
287	+ if (respawn_count > respawn_max) {
288	+ syslog(LOG_WARNING,
289	+ "respawned \"%s\" too many times, exiting", exec);
290	+ exiting = true;
291	+ continue;
292	+ }
293	+ }
294	+ alarm(0);
295	+ child_pid = fork();
296	+ if (child_pid == -1)
297	+ eerrorx("%s: fork: %s", applet, strerror(errno));
298	+ if (child_pid == 0)
299	+ child_process(exec, argv);
300	+ if (healthcheckdelay) {
301	+ signal_setup(SIGALRM, healthcheck);
302	+ alarm(healthcheckdelay);
303	+ } else if (healthchecktimer) {
304	+ signal_setup(SIGALRM, healthcheck);
305	+ alarm(healthchecktimer);
306	}
307	}
308
309	@@ -612,6 +671,16 @@ int main(int argc, char **argv)
310	while ((opt = getopt_long(argc, argv, getoptstring, longopts,
311	(int *) 0)) != -1)
312	switch (opt) {
313	+ case 'a': /* --healthcheck-timer <time> */
314	+ if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
315	+ eerrorx("%s: invalid health check timer %s", applet, optarg);
316	+ break;
317	+
318	+ case 'A': /* --healthcheck-delay <time> */
319	+ if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1)
320	+ eerrorx("%s: invalid health check delay %s", applet, optarg);
321	+ break;
322	+
323	case 'D': /* --respawn-delay time */
324	n = sscanf(optarg, "%d", &respawn_delay);
325	if (n != 1 || respawn_delay < 1)
326	@@ -668,6 +737,11 @@ int main(int argc, char **argv)
327	gid = gr->gr_gid;
328	break;
329
330	+ case 'H': /* --healthcheck-timer <minutes> */
331	+ if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
332	+ eerrorx("%s: invalid health check timer %s", applet, optarg);
333	+ break;
334	+
335	case 'k':
336	if (parse_mode(&numask, optarg))
337	eerrorx("%s: invalid mode `%s'",
338
339	diff --git a/supervise-daemon-guide.md b/supervise-daemon-guide.md
340	index 0b15a858..07ab55cf 100644
341	--- a/supervise-daemon-guide.md
342	+++ b/supervise-daemon-guide.md
343	@@ -22,6 +22,28 @@ The following is a brief guide on using this capability.
344	instructs it not to fork to the command_args_foreground variable shown
345	below.
346
347	+# Health Checks
348	+
349	+Health checks are a way to make sure a service monitored by
350	+supervise-daemon stays healthy. To configure a health check for a
351	+service, you need to write a healthcheck() function, and optionally an
352	+unhealthy() function in the service script. Also, you will need to set
353	+the healthcheck_timer and optionally healthcheck_delay variables.
354	+
355	+## healthcheck() function
356	+
357	+The healthcheck() function is run repeatedly based on the settings of
358	+the healthcheck_* variables. This function should return zero if the
359	+service is currently healthy or non-zero otherwise.
360	+
361	+## unhealthy() function
362	+
363	+If the healthcheck() function returns non-zero, the unhealthy() function
364	+is run, then the service is restarted. Since the service will be
365	+restarted by the supervisor, the unhealthy function should not try to
366	+restart it; the purpose of the function is to allow any cleanup tasks
367	+other than restarting the service to be run.
368	+
369	# Variable Settings
370
371	The most important setting is the supervisor variable. At the top of
372	@@ -52,6 +74,20 @@ This should be used if the daemon you want to monitor
373	forks and goes to the background by default. This should be set to the
374	command line option that instructs the daemon to stay in the foreground.
375
376	+``` sh
377	+healthcheck_delay=seconds
378	+```
379	+
380	+This is the delay, in seconds, before the first health check is run.
381	+If it is not set, we use the value of healthcheck_timer.
382	+
383	+``` sh
384	+healthcheck_timer=seconds
385	+```
386	+
387	+This is the number of seconds between health checks. If it is not set,
388	+no health checks will be run.
389	+
390	``` sh
391	respawn_delay
392	```

Gentoo Archives: gentoo-commits