trafficserver-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From a..@apache.org
Subject [trafficserver] branch master updated: TS-5056: Implement nonrecoverable error mechanism This closes #1224.
Date Fri, 06 Jan 2017 17:56:08 GMT
This is an automated email from the ASF dual-hosted git repository.

amc pushed a commit to branch master
in repository https://git-dual.apache.org/repos/asf/trafficserver.git

The following commit(s) were added to refs/heads/master by this push:
       new  24347df   TS-5056: Implement nonrecoverable error mechanism This closes #1224.
24347df is described below

commit 24347df553e91660b794af1079fbf69f986dc5b4
Author: Daniel Xu <dlxu2@yahoo.com>
AuthorDate: Wed Nov 16 14:21:27 2016 -0600

    TS-5056: Implement nonrecoverable error mechanism
    This closes #1224.
    
    Change `Emergency()` to terminate the current process with status
    code UNRECOVERABLE_EXIT.
    
    Also change traffic_manager to check for the UNRECOVERABLE_EXIT
    exit status when traffic_server terminates. Once it is seen, TM will
    not try to restart TS from that point forward.
    
    This was designed so that traffic_server could call Emergency(..)
    in the event of a nonrecoverable error such as a bad config file.
    No amount of TS rebooting will fix a bad config, so we might as well
    have TM wait for human intervention.
    
    Note that if traffic_cop or traffic_manager calls Emergency(),
    behavior is otherwise unchanged: the only visible change from
    this patch is the exit status code.
---
 cmd/traffic_manager/traffic_manager.cc |  6 +++++-
 lib/ts/Diags.cc                        |  7 ++++++-
 lib/ts/ink_error.cc                    | 32 ++++++++++++++++++++++++++------
 lib/ts/ink_error.h                     | 10 ++++++++++
 mgmt/LocalManager.cc                   | 11 +++++++++++
 mgmt/LocalManager.h                    |  1 +
 6 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/cmd/traffic_manager/traffic_manager.cc b/cmd/traffic_manager/traffic_manager.cc
index 87f3bac..ce33925 100644
--- a/cmd/traffic_manager/traffic_manager.cc
+++ b/cmd/traffic_manager/traffic_manager.cc
@@ -799,7 +799,7 @@ main(int argc, const char **argv)
       break;
     }
 
-    if (lmgmt->run_proxy && !lmgmt->processRunning()) { /* Make sure we still have a proxy up */
+    if (lmgmt->run_proxy && !lmgmt->processRunning() && lmgmt->proxy_recoverable) { /* Make sure we still have a proxy up */
       if (sleep_time) {
         mgmt_log("Relaunching proxy after %d sec...", sleep_time);
         millisleep(1000 * sleep_time); // we use millisleep instead of sleep because it doesnt interfere with signals
@@ -814,6 +814,10 @@ main(int argc, const char **argv)
         just_started++;
       }
     } else { /* Give the proxy a chance to fire up */
+      if (!lmgmt->proxy_recoverable) {
+        mgmt_log("[main] Proxy is un-recoverable. Proxy will not be relaunched.\n");
+      }
+
       just_started++;
     }
 
diff --git a/lib/ts/Diags.cc b/lib/ts/Diags.cc
index 85b3465..da8c5f2 100644
--- a/lib/ts/Diags.cc
+++ b/lib/ts/Diags.cc
@@ -552,7 +552,12 @@ Diags::error_va(DiagsLevel level, const SourceLocation *loc, const char *format_
     if (cleanup_func) {
       cleanup_func();
     }
-    ink_fatal_va(format_string, ap2);
+
+    // DL_Emergency means the process cannot recover from a reboot
+    if (level == DL_Emergency)
+      ink_emergency_va(format_string, ap2);
+    else
+      ink_fatal_va(format_string, ap2);
   }
 
   va_end(ap2);
diff --git a/lib/ts/ink_error.cc b/lib/ts/ink_error.cc
index 7d7bf1e..d38bafb 100644
--- a/lib/ts/ink_error.cc
+++ b/lib/ts/ink_error.cc
@@ -35,12 +35,12 @@
 
 */
 static void
-fatal_va(const char *fmt, va_list ap)
+fatal_va(const char *hdr, const char *fmt, va_list ap)
 {
   char msg[1024];
-  const size_t len = sizeof("FATAL: ") - 1;
+  const size_t len = strlen(hdr);
 
-  strncpy(msg, "FATAL: ", sizeof(msg));
+  strncpy(msg, hdr, sizeof(msg));
   vsnprintf(msg + len, sizeof(msg) - len, fmt, ap);
   msg[sizeof(msg) - 1] = 0;
 
@@ -51,7 +51,7 @@ fatal_va(const char *fmt, va_list ap)
 void
 ink_fatal_va(const char *fmt, va_list ap)
 {
-  fatal_va(fmt, ap);
+  fatal_va("Fatal: ", fmt, ap);
   ::exit(70); // 70 corresponds to EX_SOFTWARE in BSD's sysexits. As good a status as any.
 }
 
@@ -61,19 +61,39 @@ ink_fatal(const char *message_format, ...)
   va_list ap;
 
   va_start(ap, message_format);
-  fatal_va(message_format, ap);
+  fatal_va("Fatal: ", message_format, ap);
   va_end(ap);
 
   ::exit(70); // 70 corresponds to EX_SOFTWARE in BSD's sysexits. As good a status as any.
 }
 
 void
+ink_emergency_va(const char *fmt, va_list ap)
+{
+  fatal_va("Emergency: ", fmt, ap);
+  ::exit(UNRECOVERABLE_EXIT);
+}
+
+void
+ink_emergency(const char *message_format, ...)
+{
+  va_list ap;
+
+  va_start(ap, message_format);
+  ink_emergency_va(message_format, ap);
+  // Should never reach here since ink_emergency_va calls exit()
+  va_end(ap);
+
+  ::exit(UNRECOVERABLE_EXIT);
+}
+
+void
 ink_abort(const char *message_format, ...)
 {
   va_list ap;
 
   va_start(ap, message_format);
-  fatal_va(message_format, ap);
+  fatal_va("Fatal: ", message_format, ap);
   va_end(ap);
 
   abort();
diff --git a/lib/ts/ink_error.h b/lib/ts/ink_error.h
index d0b7651..34309f9 100644
--- a/lib/ts/ink_error.h
+++ b/lib/ts/ink_error.h
@@ -36,6 +36,16 @@
 #include "ts/ink_platform.h"
 #include "ts/ink_apidefs.h"
 
+// This magic exit code is used to signal that the crashing process cannot
+// be recovered from a restart of said process
+//
+// Originally, this was intended to be used as a backchannel mechanism whereby
+// traffic_server can tell traffic_manager via an exit code to stop trying to restart
+// traffic_server b/c (for example) traffic_server has a bad config file
+#define UNRECOVERABLE_EXIT 33
+
+void ink_emergency_va(const char *fmt, va_list ap) TS_NORETURN;
+void ink_emergency(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN;
 void ink_fatal_va(const char *message_format, va_list ap) TS_NORETURN;
 void ink_fatal(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN;
 void ink_abort(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN;
diff --git a/mgmt/LocalManager.cc b/mgmt/LocalManager.cc
index 29fd3ca..835c0f4 100644
--- a/mgmt/LocalManager.cc
+++ b/mgmt/LocalManager.cc
@@ -24,6 +24,7 @@
 #include "ts/ink_platform.h"
 #include "ts/ink_sock.h"
 #include "ts/ink_file.h"
+#include "ts/ink_error.h"
 #include "MgmtUtils.h"
 #include "ts/I_Layout.h"
 #include "LocalManager.h"
@@ -185,6 +186,7 @@ LocalManager::LocalManager(bool proxy_on) : BaseManager(), run_proxy(proxy_on),
   syslog_facility = 0;
 
   ccom                      = nullptr;
+  proxy_recoverable         = true;
   proxy_started_at          = -1;
   proxy_launch_count        = 0;
   manager_started_at        = time(nullptr);
@@ -493,6 +495,15 @@ LocalManager::pollMgmtProcessServer()
           if (WIFSIGNALED(estatus)) {
             int sig = WTERMSIG(estatus);
             mgmt_log("[LocalManager::pollMgmtProcessServer] Server Process terminated due to Sig %d: %s\n", sig, strsignal(sig));
+          } else if (WIFEXITED(estatus)) {
+            int return_code = WEXITSTATUS(estatus);
+
+            // traffic_server's exit code will be UNRECOVERABLE_EXIT if it calls
+            // ink_emergency() or ink_emergency_va(). The call signals that traffic_server
+            // cannot be recovered with a reboot. In other words, catastrophic failure.
+            if (return_code == UNRECOVERABLE_EXIT) {
+              proxy_recoverable = false;
+            }
           }
 
           if (lmgmt->run_proxy) {
diff --git a/mgmt/LocalManager.h b/mgmt/LocalManager.h
index 6d88423..f82e859 100644
--- a/mgmt/LocalManager.h
+++ b/mgmt/LocalManager.h
@@ -91,6 +91,7 @@ public:
   bool clusterOk();
 
   volatile bool run_proxy;
+  volatile bool proxy_recoverable; // false if traffic_server cannot recover with a reboot
   volatile time_t manager_started_at;
   volatile time_t proxy_started_at;
   volatile int proxy_launch_count;

-- 
To stop receiving notification emails like this one, please contact
['"commits@trafficserver.apache.org" <commits@trafficserver.apache.org>'].

Mime
View raw message