trafficserver-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From iga...@apache.org
Subject [1/6] git commit: traffic_cop: wait() for children on SIGTERM
Date Tue, 15 Jan 2013 20:10:57 GMT
traffic_cop: wait() for children on SIGTERM

rework traffic_cop's signal handling to register a handler for
SIGTERM. In this handler we now send SIGTERM to all children
and then wait() for them.
This should allow us to handle restarts in upstart more gracefully.

Finally, we simplify the handling on Linux by no longer parsing
/proc: It's not necessary. Linux is a POSIX system too.

Conflicts:

	lib/ts/lockfile.cc


Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/b20c7404
Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/b20c7404
Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/b20c7404

Branch: refs/heads/3.2.x
Commit: b20c7404f8775ffc088e6833622fa8440f8d0038
Parents: c29a84f
Author: Igor Galić <i.galic@brainsware.org>
Authored: Tue Jan 15 17:04:39 2013 +0100
Committer: Igor Galić <i.galic@brainsware.org>
Committed: Tue Jan 15 17:04:39 2013 +0100

----------------------------------------------------------------------
 cop/TrafficCop.cc  |   74 +++++++++++++++++++++++++++++++----------------
 lib/ts/lockfile.cc |   55 ++++++----------------------------
 2 files changed, 59 insertions(+), 70 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafficserver/blob/b20c7404/cop/TrafficCop.cc
----------------------------------------------------------------------
diff --git a/cop/TrafficCop.cc b/cop/TrafficCop.cc
index f31fd5d..b901145 100644
--- a/cop/TrafficCop.cc
+++ b/cop/TrafficCop.cc
@@ -45,7 +45,7 @@ union semun
 #endif  // linux check
 
 // For debugging, turn this on.
-// #define TRACE_LOG_COP 1
+#define TRACE_LOG_COP 1
 
 #define OPTIONS_MAX     32
 #define OPTIONS_LEN_MAX 1024
@@ -212,6 +212,7 @@ chown_file_to_admin_user(const char *file) {
     }
   }
 }
+
 static void
 sig_child(int signum)
 {
@@ -239,6 +240,43 @@ sig_child(int signum)
 }
 
 static void
+sig_term(int signum)
+{
+  pid_t pid = 0;
+  int status = 0;
+  int err;
+  pid_t holding_pid;
+
+  //killsig = SIGTERM;
+
+  cop_log_trace("Entering sig_term(%d)\n", signum);
+
+  // safely^W commit suicide.
+  cop_log_trace("Sending signal %d to entire group\n", signum);
+  killpg(0, signum);
+  
+  cop_log_trace("Waiting for children to exit.");
+
+  for (;;) {
+    pid = waitpid(WAIT_ANY, &status, WNOHANG);
+
+    if (pid <= 0) {
+      break;
+    }
+    // TSqa03086 - We can not log the child status signal from
+    //   the signal handler since syslog can deadlock.  Record
+    //   the pid and the status in a global for logging
+    //   next time through the event loop.  We will occasionally
+    //   lose some information if we get two sig childs in rapid
+    //   succession
+    child_pid = pid;
+    child_status = status;
+  }
+  cop_log_trace("Leaving sig_term(%d), exiting traffic_cop\n", signum);
+  exit(0);
+}
+
+static void
 #if defined(solaris)
 sig_fatal(int signum, siginfo_t * t, void *c)
 #else
@@ -1381,18 +1419,7 @@ check_programs()
   Lockfile manager_lf(manager_lockfile);
   err = manager_lf.Open(&holding_pid);
   chown_file_to_admin_user(manager_lockfile);
-#if defined(linux)
-  // if lockfile held, but process doesn't exist, killall and try again
-  if (err == 0) {
-    if (kill(holding_pid, 0) == -1) {
-      cop_log(COP_WARNING, "%s's lockfile is held, but its pid (%d) is missing;"
-              " killing all processes named '%s' and retrying\n", manager_binary, holding_pid,
manager_binary);
-      ink_killall(manager_binary, killsig);
-      sleep(1);                 // give signals a chance to be received
-      err = manager_lf.Open(&holding_pid);
-    }
-  }
-#endif
+
   if (err > 0) {
     // 'lockfile_open' returns the file descriptor of the opened
     // lockfile.  We need to close this before spawning the
@@ -1473,18 +1500,7 @@ check_programs()
 
     Lockfile server_lf(server_lockfile);
     err = server_lf.Open(&holding_pid);
-#if defined(linux)
-    // if lockfile held, but process doesn't exist, killall and try again
-    if (err == 0) {
-      if (kill(holding_pid, 0) == -1) {
-        cop_log(COP_WARNING, "%s's lockfile is held, but its pid (%d) is missing;"
-                " killing all processes named '%s' and retrying\n", server_binary, holding_pid,
server_binary);
-        ink_killall(server_binary, killsig);
-        sleep(1);               // give signals a chance to be received
-        err = server_lf.Open(&holding_pid);
-      }
-    }
-#endif
+
     if (err > 0) {
       server_lf.Close();
 
@@ -1673,6 +1689,14 @@ init_signals()
   struct sigaction action;
 
   cop_log_trace("Entering init_signals()\n");
+  // Handle the SIGTERM signal: We simply do the same as
+  // in sig_child..
+  action.sa_handler = sig_term;
+  sigemptyset(&action.sa_mask);
+  action.sa_flags = 0;
+
+  sigaction(SIGTERM, &action, NULL);
+
   // Handle the SIGCHLD signal. We simply reap all children that
   // die (which should only be spawned traffic_manager's).
   action.sa_handler = sig_child;

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/b20c7404/lib/ts/lockfile.cc
----------------------------------------------------------------------
diff --git a/lib/ts/lockfile.cc b/lib/ts/lockfile.cc
index 5cde39a..c2d34c4 100644
--- a/lib/ts/lockfile.cc
+++ b/lib/ts/lockfile.cc
@@ -205,54 +205,21 @@ static void
 lockfile_kill_internal(pid_t init_pid, int init_sig, pid_t pid, const char *pname, int sig)
 {
   int err;
-
-#if defined(linux)
-
-  pid_t *pidv;
-  int pidvcnt;
-
-  // Need to grab pname's pid vector before we issue any kill signals.
-  // Specifically, this prevents the race-condition in which
-  // traffic_manager spawns a new traffic_server while we still think
-  // we're killall'ing the old traffic_server.
-  if (pname) {
-    ink_killall_get_pidv_xmalloc(pname, &pidv, &pidvcnt);
-  }
-
-  if (init_sig > 0) {
-    kill(init_pid, init_sig);
-    // sleep for a bit and give time for the first signal to be
-    // delivered
-    sleep(1);
-  }
-
-  do {
-    if ((err = kill(pid, sig)) == 0) {
-      sleep(1);
-    }
-    if (pname && (pidvcnt > 0)) {
-      ink_killall_kill_pidv(pidv, pidvcnt, sig);
-      sleep(1);
-    }
-  } while ((err == 0) || ((err < 0) && (errno == EINTR)));
-
-  ats_free(pidv);
-
-#else
+  int status;
 
   if (init_sig > 0) {
     kill(init_pid, init_sig);
-    // sleep for a bit and give time for the first signal to be
-    // delivered
-    sleep(1);
+    // Wait for children to exit
+    do {
+      err = waitpid(-1, &status, WNOHANG);
+      if (err == -1) break;
+    } while(!WIFEXITED(status) && !WIFSIGNALED(status));
   }
 
   do {
     err = kill(pid, sig);
   } while ((err == 0) || ((err < 0) && (errno == EINTR)));
 
-#endif  // linux check
-
 }
 
 void
@@ -294,14 +261,12 @@ Lockfile::KillGroup(int sig, int initial_sig, const char *pname)
 
     if ((pid < 0) || (pid == getpid()))
       pid = holding_pid;
-    else
-      pid = -pid;
 
     if (pid != 0) {
-      // We kill the holding_pid instead of the process_group
-      // initially since there is no point trying to get core files
-      // from a group since the core file of one overwrites the core
-      // file of another one
+      // This way, we kill the process_group:
+      pid = -pid;
+      // In order to get core files from each process, please
+      // set your core_pattern appropriately.
       lockfile_kill_internal(holding_pid, initial_sig, pid, pname, sig);
     }
   }


Mime
View raw message