kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From danburk...@apache.org
Subject [1/3] incubator-kudu git commit: KUDU-1290. Fix CHECK failures when some CPUs are disabled
Date Fri, 22 Jan 2016 17:14:31 GMT
Repository: incubator-kudu
Updated Branches:
  refs/heads/master 3d4e0ce8a -> a65bbcc8e


KUDU-1290. Fix CHECK failures when some CPUs are disabled

This adds a new function base::MaxCPUID() which uses
/sys/devices/system/cpu/present to determine which CPUs are present on the
system. This is an alternative to base::NumCPUs() which doesn't count any CPUs
which have been administratively disabled.

The few places where we use CPU ID to index into a collection are changed
to use this new API.

I verified that, if I disabled one of my CPUs, the master crashed a few minutes
after startup without the patch. With the patch, it seems stable.

Change-Id: I7959e87aba4a694b60ee36adddfd32fef631d8aa
Reviewed-on: http://gerrit.cloudera.org:8080/1864
Reviewed-by: Adar Dembo <adar@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/02df1660
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/02df1660
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/02df1660

Branch: refs/heads/master
Commit: 02df16604babe92d030c9b930e73bc7c1dfac1fa
Parents: 3d4e0ce
Author: Todd Lipcon <todd@apache.org>
Authored: Thu Jan 21 13:44:12 2016 -0800
Committer: Todd Lipcon <todd@apache.org>
Committed: Fri Jan 22 04:39:46 2016 +0000

----------------------------------------------------------------------
 src/kudu/cfile/bloomfile.cc |  2 +-
 src/kudu/gutil/sysinfo.cc   | 75 +++++++++++++++++++++++++++++++++-------
 src/kudu/gutil/sysinfo.h    | 10 ++++++
 src/kudu/util/locks.h       |  2 +-
 4 files changed, 74 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/02df1660/src/kudu/cfile/bloomfile.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/bloomfile.cc b/src/kudu/cfile/bloomfile.cc
index 6d2dcf0..b3059bc 100644
--- a/src/kudu/cfile/bloomfile.cc
+++ b/src/kudu/cfile/bloomfile.cc
@@ -194,7 +194,7 @@ Status BloomFileReader::InitOnce() {
   // Ugly hack: create a per-cpu iterator.
   // Instead this should be threadlocal, or allow us to just
   // stack-allocate these things more smartly!
-  int n_cpus = base::NumCPUs();
+  int n_cpus = base::MaxCPUIndex() + 1;
   for (int i = 0; i < n_cpus; i++) {
     index_iters_.push_back(
       IndexTreeIterator::Create(reader_.get(), validx_root));

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/02df1660/src/kudu/gutil/sysinfo.cc
----------------------------------------------------------------------
diff --git a/src/kudu/gutil/sysinfo.cc b/src/kudu/gutil/sysinfo.cc
index a222fbb..73afa27 100644
--- a/src/kudu/gutil/sysinfo.cc
+++ b/src/kudu/gutil/sysinfo.cc
@@ -53,8 +53,9 @@
 #include <shlwapi.h>          // for SHGetValueA()
 #include <tlhelp32.h>         // for Module32First()
 #endif
-#include "kudu/gutil/sysinfo.h"
 #include "kudu/gutil/dynamic_annotations.h"   // for RunningOnValgrind
+#include "kudu/gutil/macros.h"
+#include "kudu/gutil/sysinfo.h"
 #include "kudu/gutil/walltime.h"
 #include <glog/logging.h>
 
@@ -73,6 +74,7 @@ namespace base {
 
 static double cpuinfo_cycles_per_second = 1.0;  // 0.0 might be dangerous
 static int cpuinfo_num_cpus = 1;  // Conservative guess
+static int cpuinfo_max_cpu_index = -1;
 
 void SleepForNanoseconds(int64_t nanoseconds) {
   // Sleep for nanosecond duration
@@ -103,25 +105,59 @@ static int64 EstimateCyclesPerSecond(const int estimate_time_ms) {
 
 // ReadIntFromFile is only called on linux and cygwin platforms.
 #if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__)
-// Helper function for reading an int from a file. Returns true if successful
-// and the memory location pointed to by value is set to the value read.
-static bool ReadIntFromFile(const char *file, int *value) {
+
+// Slurp a file with a single read() call into 'buf'. This is only safe to use on small
+// files in places like /proc where we are guaranteed not to get a partial read.
+// Any remaining bytes in the buffer are zeroed.
+//
+// 'buflen' must be more than large enough to hold the whole file, or else this will
+// issue a FATAL error.
+static bool SlurpSmallTextFile(const char* file, char* buf, int buflen) {
   bool ret = false;
   int fd = open(file, O_RDONLY);
   if (fd == -1) return ret;
-  char line[1024];
-  char* err;
-  memset(line, '\0', sizeof(line));
-  if (read(fd, line, sizeof(line) - 1) != -1) {
-    const int temp_value = strtol(line, &err, 10);
-    if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
-      *value = temp_value;
-      ret = true;
-    }
+
+  memset(buf, '\0', buflen);
+  int n = read(fd, buf, buflen - 1);
+  CHECK_NE(n, buflen - 1) << "buffer of len " << buflen << " not large
enough to store "
+                          << "contents of " << file;
+  if (n > 0) {
+    ret = true;
   }
+
   close(fd);
   return ret;
 }
+
+// Helper function for reading an int from a file. Returns true if successful
+// and the memory location pointed to by value is set to the value read.
+static bool ReadIntFromFile(const char *file, int *value) {
+  char line[1024];
+  if (!SlurpSmallTextFile(file, line, arraysize(line))) {
+    return false;
+  }
+  char* err;
+  const int temp_value = strtol(line, &err, 10);
+  if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
+    *value = temp_value;
+    return true;
+  }
+  return false;
+}
+
+static int ReadMaxCPUIndex() {
+  char buf[1024];
+  CHECK(SlurpSmallTextFile("/sys/devices/system/cpu/present", buf, arraysize(buf)));
+  // 'buf' should contain a string like '0-7'.
+  CHECK_EQ(0, memcmp(buf, "0-", 2)) << "bad list of possible CPUs: " << buf;
+
+  char* max_cpu_str = &buf[2];
+  char* err;
+  int val = strtol(max_cpu_str, &err, 10);
+  CHECK(*err == '\n' || *err == '\0') << "unable to parse max CPU index from: " <<
buf;
+  return val;
+}
+
 #endif
 
 // WARNING: logging calls back to InitializeSystemInfo() so it must
@@ -268,6 +304,7 @@ static void InitializeSystemInfo() {
   if (num_cpus > 0) {
     cpuinfo_num_cpus = num_cpus;
   }
+  cpuinfo_max_cpu_index = ReadMaxCPUIndex();
 
 #elif defined __FreeBSD__
   // For this sysctl to work, the machine must be configured without
@@ -346,6 +383,13 @@ static void InitializeSystemInfo() {
   // Generic cycles per second counter
   cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
 #endif
+
+  // On platforms where we can't determine the max CPU index, just use the
+  // number of CPUs. This might break if CPUs are taken offline, but
+  // better than a wild guess.
+  if (cpuinfo_max_cpu_index < 0) {
+    cpuinfo_max_cpu_index = cpuinfo_num_cpus - 1;
+  }
 }
 
 double CyclesPerSecond(void) {
@@ -358,4 +402,9 @@ int NumCPUs(void) {
   return cpuinfo_num_cpus;
 }
 
+int MaxCPUIndex(void) {
+  InitializeSystemInfo();
+  return cpuinfo_max_cpu_index;
+}
+
 } // namespace base

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/02df1660/src/kudu/gutil/sysinfo.h
----------------------------------------------------------------------
diff --git a/src/kudu/gutil/sysinfo.h b/src/kudu/gutil/sysinfo.h
index 2ff7d1e..ec3abe7 100644
--- a/src/kudu/gutil/sysinfo.h
+++ b/src/kudu/gutil/sysinfo.h
@@ -33,8 +33,18 @@
 
 namespace base {
 
+// Return the number of online CPUs. This is computed and cached the first time this or
+// NumCPUs() is called, so does not reflect any CPUs enabled or disabled at a later
+// point in time.
+//
+// Note that, if not all CPUs are online, this may return a value lower than the maximum
+// value of sched_getcpu().
 extern int NumCPUs();
 
+// Return the maximum CPU index that may be returned by sched_getcpu(). For example, on
+// an 8-core machine, this will return '7' even if some of the CPUs have been disabled.
+extern int MaxCPUIndex();
+
 void SleepForNanoseconds(int64_t nanoseconds);
 void SleepForMilliseconds(int64_t milliseconds);
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/02df1660/src/kudu/util/locks.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/locks.h b/src/kudu/util/locks.h
index 769143f..7bbd8e8 100644
--- a/src/kudu/util/locks.h
+++ b/src/kudu/util/locks.h
@@ -159,7 +159,7 @@ class percpu_rwlock {
  public:
   percpu_rwlock() {
     errno = 0;
-    n_cpus_ = base::NumCPUs();
+    n_cpus_ = base::MaxCPUIndex() + 1;
     CHECK_EQ(errno, 0) << ErrnoToString(errno);
     CHECK_GT(n_cpus_, 0);
     locks_ = new padded_lock[n_cpus_];


Mime
View raw message