mysql-proxy源码中,使用了一种进程保活的方法。这种方法的基本原理是:当父进程完成基本的初始化后,创建子进程,由子进程继续后面的主体逻辑。而父进程wait子进程的退出状态。一旦发现子进程是由于收到信号而退出的,则重启子进程。
这种方法的实现代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/wait.h> /* waitpid()/wait4() and the W* status macros */

/* seconds the parent sleeps between a signal-death and the restart */
enum { RESTART_DELAY_SECS = 60 };

/* signals the parent forwards to its process group while a child runs */
static const int forwarded_signals[] = {
    SIGINT, SIGTERM, SIGHUP, SIGUSR1, SIGUSR2
};

/**
 * Install @p handler for every signal listed in forwarded_signals.
 */
static void set_forwarded_signals(void (*handler)(int)) {
    size_t i;

    for (i = 0; i < sizeof forwarded_signals / sizeof forwarded_signals[0]; i++) {
        signal(forwarded_signals[i], handler);
    }
}

/**
 * Signal handler of the parent: re-send the received signal to the whole
 * process group (kill(0, ...)), which includes the child.
 *
 * The signal is set to SIG_IGN first so that the copy delivered to the
 * parent itself does not re-enter this handler.
 */
static void signal_forward(int sig) {
    int saved_errno = errno; /* kill() may change errno under the interrupted code */

    signal(sig, SIG_IGN); /* we don't want to create a loop here */
    kill(0, sig);

    errno = saved_errno;
}

/**
 * keep the ourself alive
 *
 * Forks a child and waits for it. If the child dies on a signal, the
 * parent sleeps RESTART_DELAY_SECS seconds and forks a new one.
 *
 * @return  0 in the child process (the caller continues with its real work)
 * @return  1 in the parent, after the child exited normally
 * @return -1 in the parent, if fork() or the wait call failed
 */
int chassis_unix_proc_keepalive(void) {
    int nprocs = 0;
    pid_t child_pid = -1;

    /* we ignore SIGINT and SIGTERM and just let it be forwarded to the child instead
     * as we want to collect its PID before we shutdown too
     *
     * the child will have to set its own signal handlers for this
     */

    for (;;) {
        /* try to start the children */
        while (nprocs < 1) {
            pid_t pid;

            /* anything still sitting in the stdio buffer would be copied
             * into the child and printed twice */
            fflush(stdout);

            pid = fork();
            if (pid == 0) {
                /* child: start from default dispositions; on a restart the
                 * parent may still have signal_forward or SIG_IGN installed */
                set_forwarded_signals(SIG_DFL);

                printf("we are the child: %d\n", (int)getpid());
                return 0;
            } else if (pid < 0) {
                /* fork() failed */
                printf("fork() failed: %s[%d]\n", strerror(errno), errno);
                return -1;
            } else {
                /* we are the angel, let's see what the child did */
                printf("[father]: we try to keep PID=%d alive\n", (int)pid);

                /* forward a few signals that are sent to us to the child instead */
                set_forwarded_signals(signal_forward);

                child_pid = pid;
                nprocs++;
            }
        }

        if (child_pid != -1) {
            struct rusage rusage;
            int exit_status;
            pid_t exit_pid;

            printf("[father]: waiting for %d\n", (int)child_pid);
            fflush(stdout); /* make the message visible before we block */

#ifdef HAVE_WAIT4
            exit_pid = wait4(child_pid, &exit_status, 0, &rusage);
#else
            memset(&rusage, 0, sizeof(rusage)); /* make sure everything is zero'ed out */
            exit_pid = waitpid(child_pid, &exit_status, 0);
#endif
            printf("[father]: %d returned: %d\n", (int)child_pid, (int)exit_pid);

            if (exit_pid == child_pid) {
                /* our child returned, let's see how it went */
                if (WIFEXITED(exit_status)) {
                    printf("[father]: PID=%d exited normally with exit-code = %d (it used %ld kBytes max)\n",
                           (int)child_pid,
                           WEXITSTATUS(exit_status),
                           rusage.ru_maxrss / 1024);
                    return 1;
                } else if (WIFSIGNALED(exit_status)) {
                    unsigned int time_towait = RESTART_DELAY_SECS;

                    /* our child died on a signal
                     *
                     * log it and restart */
                    printf("[father]: PID=%d died on signal=%d (it used %ld kBytes max) ... waiting 1min before restart\n",
                           (int)child_pid,
                           WTERMSIG(exit_status),
                           rusage.ru_maxrss / 1024);
                    fflush(stdout);

                    /**
                     * to make sure we don't loop as fast as we can, sleep a bit between
                     * restarts
                     */
                    signal(SIGINT, SIG_DFL);
                    signal(SIGTERM, SIG_DFL);
                    signal(SIGHUP, SIG_DFL);

                    /* sleep() returns the unslept seconds when interrupted */
                    while (time_towait > 0) {
                        time_towait = sleep(time_towait);
                    }

                    nprocs--;
                    child_pid = -1;
                } else if (WIFSTOPPED(exit_status)) {
                    /* nothing to do: child_pid is unchanged, so the next
                     * iteration simply waits on the same child again */
                } else {
                    printf("[father]: should not reached\n");
                }
            } else if (-1 == exit_pid) {
                /* EINTR is ok, all others bad */
                if (EINTR != errno) {
                    /* how can this happen ? */
                    printf("[father]: wait4(%d, ...) failed: %s[%d]\n",
                           (int)child_pid, strerror(errno), errno);
                    return -1;
                }
            } else {
                printf("[father]: should not reached\n");
            }
        }
    }
}

/**
 * Demo driver: the parent exits according to the keepalive result, the
 * child prints a greeting every 10 seconds forever.
 */
int main(void) {
    int ret = chassis_unix_proc_keepalive();

    if (ret > 0) {
        exit(0);
    } else if (ret < 0) {
        exit(-1);
    } else {
        /* we are the child, go on */
    }

    for (;;) {
        printf("hello, world\n");
        fflush(stdout); /* show each line immediately even when stdout is not a tty */
        sleep(10);
    }
}
这里的主体逻辑,就是每隔10秒打印一次“hello, world”。程序运行结果如下:
[father]: we try to keep PID=1824 alive
[father]: waiting for 1824
we are the child: 1824
hello, world
hello, world
...
(向子进程发送SIGKILL信号)
[father]: 1824 returned: 1824
[father]: PID=1824 died on signal=9 (it used 0 kBytes max) ... waiting 1min before restart
[father]: we try to keep PID=1853 alive
[father]: waiting for 1853
we are the child: 1853
hello, world
hello, world
hello, world
...
(向父进程发送SIGINT信号)
[father]: 1853 returned: 1853
[father]: PID=1853 died on signal=2 (it used 0 kBytes max) ... waiting 1min before restart
[father]: we try to keep PID=1870 alive
[father]: waiting for 1870
we are the child: 1870
hello, world
hello, world
hello, world
hello, world
...