trafficserver-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bc...@apache.org
Subject [trafficserver] 03/06: TS-4866: Makes traffic_cop killing optional
Date Tue, 27 Sep 2016 17:30:29 GMT
This is an automated email from the ASF dual-hosted git repository.

bcall pushed a commit to branch 7.0.x
in repository https://git-dual.apache.org/repos/asf/trafficserver.git

commit b8e6518c0258299e9f5bf770d413d22f3157213e
Author: Leif Hedstrom <zwoop@apache.org>
AuthorDate: Mon Sep 19 16:19:25 2016 -0600

    TS-4866: Makes traffic_cop killing optional
    
    This adds a new configuration option, proxy.config.cop.active_health_checks:
    
     0 - traffic_cop is not allowed to kill any processes
     1 - Only traffic_manager can be killed on failed health checks
     2 - Only traffic_server can be killed on failed health checks
     3 - traffic_server and traffic_manager can be killed on failure (default)
    
    (cherry picked from commit 6c609d4acbf9de7524527f720b68dcd62c982812)
---
 cmd/traffic_cop/traffic_cop.cc              | 74 +++++++++++++++++++++++------
 doc/admin-guide/files/records.config.en.rst | 16 +++++++
 mgmt/RecordsConfig.cc                       |  2 +
 3 files changed, 78 insertions(+), 14 deletions(-)

diff --git a/cmd/traffic_cop/traffic_cop.cc b/cmd/traffic_cop/traffic_cop.cc
index edaee09..16b90e7 100644
--- a/cmd/traffic_cop/traffic_cop.cc
+++ b/cmd/traffic_cop/traffic_cop.cc
@@ -113,6 +113,15 @@ static int server_failures  = 0;
 static int server_not_found = 0;
 static int init_sleep_time  = cop_sleep_time; // 10 sec
 
+/* This gets setup when loading the configuration */
+typedef enum {
+  COP_KILL_NONE    = 0,
+  COP_KILL_SERVER  = 1,
+  COP_KILL_MANAGER = 2,
+} ActiveHealthChecks;
+
+static int active_health_checks = COP_KILL_SERVER | COP_KILL_MANAGER;
+
 // traffic_manager flap detection
 #define MANAGER_FLAP_DETECTION 1
 #if defined(MANAGER_FLAP_DETECTION)
@@ -600,6 +609,7 @@ config_reload_records()
   struct stat stat_buf;
   static time_t last_mod = 0;
   char log_filename[PATH_NAME_MAX];
+  int tmp;
 
   ats_scoped_str bindir;
   ats_scoped_str logdir;
@@ -651,6 +661,26 @@ config_reload_records()
   config_read_int("proxy.config.cluster.rsport", &rs_port, true);
   config_read_int("proxy.config.cop.init_sleep_time", &init_sleep_time, true);
 
+  config_read_int("proxy.config.cop.active_health_checks", &tmp, true);
+  // 0 == No servers are killed
+  // 1 == Only traffic_manager can be killed on failure
+  // 2 == Only traffic_server can be killed on failure
+  // 3 == Any failing healthchecks can cause restarts (default)
+  switch (tmp) {
+  case 0:
+    active_health_checks = COP_KILL_NONE;
+    break;
+  case 1:
+    active_health_checks = COP_KILL_MANAGER;
+    break;
+  case 2:
+    active_health_checks = COP_KILL_SERVER;
+    break;
+  default:
+    active_health_checks = COP_KILL_SERVER | COP_KILL_MANAGER;
+    break;
+  }
+
 #if defined(linux)
   // TS-1075 : auto-port ::connect DoS on high traffic linux systems
   config_read_int("proxy.config.cop.source_port", &source_port, true);
@@ -1245,8 +1275,12 @@ heartbeat_manager()
 
     if (manager_failures > 1) {
       manager_failures = 0;
-      cop_log(COP_WARNING, "killing manager\n");
-      safe_kill(manager_lockfile, manager_binary, true);
+      if (active_health_checks & COP_KILL_MANAGER) {
+        cop_log(COP_WARNING, "killing manager\n");
+        safe_kill(manager_lockfile, manager_binary, true);
+      } else {
+        cop_log(COP_WARNING, "would have killed manager, but configuration said not to\n");
+      }
     }
     cop_log_trace("Leaving heartbeat_manager() --> %d\n", err);
     return err;
@@ -1280,19 +1314,22 @@ heartbeat_server()
     // we kill the server.
     if (server_failures > 1) {
       server_failures = 0;
-      cop_log(COP_WARNING, "killing server\n");
-
       // TSqa02622: Change the ALRM signal handler while
       //   trying to kill the process since if a core
       //   is being written, it could take a long time
       //   Set a new alarm so that we can print warnings
       //   if it is taking too long to kill the server
       //
-      safe_kill(server_lockfile, server_binary, false);
-      // Allow a configurable longer sleep init time
-      // to load very large remap files
-      cop_log_trace("performing additional sleep for %d sec during init", init_sleep_time);
-      millisleep(init_sleep_time * 1000);
+      if (active_health_checks & COP_KILL_SERVER) {
+        cop_log(COP_WARNING, "killing server\n");
+        safe_kill(server_lockfile, server_binary, false);
+        // Allow a configurable longer sleep init time
+        // to load very large remap files
+        cop_log_trace("performing additional sleep for %d sec during init", init_sleep_time);
+        millisleep(init_sleep_time * 1000);
+      } else {
+        cop_log(COP_WARNING, "would have killed server, but configurations said not to\n");
+      }
     }
   } else {
     if (server_failures) {
@@ -1499,11 +1536,20 @@ check_memory()
       // 5:     0       0      low     (bad)
       if ((swapsize != 0 && swapfree < check_memory_min_swapfree_kb) || (swapsize == 0 && memfree < check_memory_min_memfree_kb)) {
         cop_log(COP_WARNING, "Low memory available (swap: %dkB, mem: %dkB)\n", (int)swapfree, (int)memfree);
-        cop_log(COP_WARNING, "Killing '%s' and '%s'\n", manager_binary, server_binary);
-        manager_failures = 0;
-        safe_kill(manager_lockfile, manager_binary, true);
-        server_failures = 0;
-        safe_kill(server_lockfile, server_binary, false);
+        if (active_health_checks & COP_KILL_MANAGER) {
+          cop_log(COP_WARNING, "Killing '%s'\n", manager_binary);
+          manager_failures = 0;
+          safe_kill(manager_lockfile, manager_binary, true);
+        } else {
+          cop_log(COP_WARNING, "would have killed manager due to low memory, but configurations sayd not to\n");
+        }
+        if (active_health_checks & COP_KILL_SERVER) {
+          cop_log(COP_WARNING, "Killing '%s'\n", server_binary);
+          server_failures = 0;
+          safe_kill(server_lockfile, server_binary, false);
+        } else {
+          cop_log(COP_WARNING, "would have killed server due to low memory, but configurations sayd not to\n");
+        }
       }
     } else {
       cop_log(COP_WARNING, "Unable to open /proc/meminfo: %s\n", strerror(errno));
diff --git a/doc/admin-guide/files/records.config.en.rst b/doc/admin-guide/files/records.config.en.rst
index 8298d91..1c1859b 100644
--- a/doc/admin-guide/files/records.config.en.rst
+++ b/doc/admin-guide/files/records.config.en.rst
@@ -262,6 +262,22 @@ System Variables
    this applies only during startup of Traffic Server and does not apply to the run
    time heartbeat checking.
 
+.. ts:cv:: CONFIG proxy.config.cop.active_health_checks INT 3
+
+   Specifies which, if any, of :program:`traffic_server` and
+   :program:`traffic_manager` that :program:`traffic_cop` is allowed to kill
+   in the event of failed health checks. The possible values are:
+
+   ===== ======================================================================
+   Value Description
+   ===== ======================================================================
+   ``0`` :program:`traffic_cop` is not allowed to kill any processes.
+   ``1`` Only :program:`traffic_manager` can be killed on failed health checks.
+   ``2`` Only :program:`traffic_server` can be killed on failed health checks.
+   ``3`` :program:`traffic_server` and :program:`traffic_manager` can be killed
+         on failures (default).
+   ===== ======================================================================
+
 .. ts:cv:: CONFIG proxy.config.output.logfile  STRING traffic.out
 
    The name and location of the file that contains warnings, status messages, and error messages produced by the Traffic Server
diff --git a/mgmt/RecordsConfig.cc b/mgmt/RecordsConfig.cc
index 663b6d7..df67e7c 100644
--- a/mgmt/RecordsConfig.cc
+++ b/mgmt/RecordsConfig.cc
@@ -85,6 +85,8 @@ static const RecordElement RecordsConfig[] =
   ,                             // needed by traffic_cop
   {RECT_CONFIG, "proxy.config.cop.init_sleep_time", RECD_INT, "0", RECU_NULL, RR_NULL, RECC_NULL, "[0-900]", RECA_NULL}
   ,
+  {RECT_CONFIG, "proxy.config.cop.active_health_checks", RECD_INT, "3", RECU_NULL, RR_NULL, RECC_NULL, "[0-3]", RECA_NULL}
+  ,
   //# 0 = disable (seconds)
   {RECT_CONFIG, "proxy.config.dump_mem_info_frequency", RECD_INT, "0", RECU_NULL, RR_NULL, RECC_NULL, NULL, RECA_NULL}
   ,

-- 
To stop receiving notification emails like this one, please contact
"commits@trafficserver.apache.org" <commits@trafficserver.apache.org>.

Mime
View raw message