kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject [2/2] incubator-kudu git commit: KUDU-598. Fix a race in concurrent btree which could segfault
Date Thu, 21 Jan 2016 21:54:58 GMT
KUDU-598. Fix a race in concurrent btree which could segfault

This adds a regression test and fix for KUDU-598, which caused
an occasional segfault during ConcurrentBTreeTest.TestConcurrentInsert.

The issue was that, while inserting a new element into a leaf node,
we'd briefly end up with a NULL (or uninitialized) value pointer
associated with a valid key. Because the size of the value is
stored at the pointed-to location (rather than with the pointer)
we accidentally derefenced it before checking that our read of the
leaf node was non-racy.

The test works by adding a new 'race point' in the insert code
path, and adding a new test which exercises of the race points.
With the new race test, we hit the SEGV almost every time. With
the patch, I was able to loop the new test 1000 times and the
old non-racy test 10000 times on GCE without failure.

Change-Id: I0c728a1d8014342bbc06b713a8ffee82ca4aa6bd
Reviewed-on: http://gerrit.cloudera.org:8080/1859
Tested-by: Internal Jenkins
Reviewed-by: Adar Dembo <adar@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/ed01385a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/ed01385a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/ed01385a

Branch: refs/heads/master
Commit: ed01385a7bbdc40161f93cd42da7f59301c2c06e
Parents: 0d8da08
Author: Todd Lipcon <todd@apache.org>
Authored: Thu Jan 21 09:29:30 2016 -0800
Committer: Todd Lipcon <todd@apache.org>
Committed: Thu Jan 21 20:42:16 2016 +0000

----------------------------------------------------------------------
 src/kudu/tablet/cbtree-test.cc     | 26 ++++++++++++--
 src/kudu/tablet/concurrent_btree.h | 63 ++++++++++++++++++++-------------
 2 files changed, 61 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ed01385a/src/kudu/tablet/cbtree-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/cbtree-test.cc b/src/kudu/tablet/cbtree-test.cc
index 44d9610..0f42e1b 100644
--- a/src/kudu/tablet/cbtree-test.cc
+++ b/src/kudu/tablet/cbtree-test.cc
@@ -69,6 +69,9 @@ class TestCBTree : public KuduTest {
               InsertInLeaf(&lnode, &arena, key, val));
   }
 
+  template<class Traits>
+  void DoTestConcurrentInsert();
+
 };
 
 // Ensure that the template magic to make the nodes sized
@@ -155,6 +158,13 @@ struct SmallFanoutTraits : public BTreeTraits {
   static const size_t leaf_node_size = 92;
 };
 
+// Enables yield() calls at interesting points of the btree
+// implementation to ensure that we are still correct even
+// with adversarial scheduling.
+struct RacyTraits : public SmallFanoutTraits {
+  static const size_t debug_raciness = 100;
+};
+
 void MakeKey(char *kbuf, size_t len, int i) {
   snprintf(kbuf, len, "key_%d%d", i % 10, i / 10);
 }
@@ -400,7 +410,17 @@ TEST_F(TestCBTree, TestVersionLockConcurrent) {
 // Each thread inserts a number of elements and then verifies that it can
 // read them back.
 TEST_F(TestCBTree, TestConcurrentInsert) {
-  gscoped_ptr<CBTree<SmallFanoutTraits> > tree;
+  DoTestConcurrentInsert<SmallFanoutTraits>();
+}
+
+// Same, but with a tree that tries to provoke race conditions.
+TEST_F(TestCBTree, TestRacyConcurrentInsert) {
+  DoTestConcurrentInsert<RacyTraits>();
+}
+
+template<class TraitsClass>
+void TestCBTree::DoTestConcurrentInsert() {
+  gscoped_ptr<CBTree<TraitsClass> > tree;
 
   int num_threads = 16;
   int ins_per_thread = 30;
@@ -417,7 +437,7 @@ TEST_F(TestCBTree, TestConcurrentInsert) {
 
   for (int i = 0; i < num_threads; i++) {
     threads.push_back(new boost::thread(
-                        InsertAndVerify<SmallFanoutTraits>,
+                        InsertAndVerify<TraitsClass>,
                         &go_barrier,
                         &done_barrier,
                         &tree,
@@ -432,7 +452,7 @@ TEST_F(TestCBTree, TestConcurrentInsert) {
   // on areas of the key space diminishes.
 
   for (int trial = 0; trial < n_trials; trial++) {
-    tree.reset(new CBTree<SmallFanoutTraits>());
+    tree.reset(new CBTree<TraitsClass>());
     go_barrier.wait();
 
     done_barrier.wait();

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ed01385a/src/kudu/tablet/concurrent_btree.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/concurrent_btree.h b/src/kudu/tablet/concurrent_btree.h
index f867284..916b833 100644
--- a/src/kudu/tablet/concurrent_btree.h
+++ b/src/kudu/tablet/concurrent_btree.h
@@ -92,6 +92,18 @@ inline void PrefetchMemory(const T *addr) {
   }
 }
 
+// Utility function that, when Traits::debug_raciness is non-zero
+// (i.e only in debug code), will spin for some amount of time
+// related to that setting.
+// This can be used when trying to debug race conditions, but
+// will compile away in production code.
+template<class Traits>
+void DebugRacyPoint() {
+  if (Traits::debug_raciness > 0) {
+    boost::detail::yield(Traits::debug_raciness);
+  }
+}
+
 template<class Traits> class NodeBase;
 template<class Traits> class InternalNode;
 template<class Traits> class LeafNode;
@@ -702,9 +714,10 @@ class LeafNode : public NodeBase<Traits> {
     this->SetInserting();
 
     // The following inserts should always succeed because we
-    // verified space_available above
+    // verified that there is space available above.
     num_entries_++;
     InsertInSliceArray(keys_, num_entries_, key, idx, arena);
+    DebugRacyPoint<Traits>();
     InsertInSliceArray(vals_, num_entries_, val, idx, arena);
 
     return INSERT_SUCCESS;
@@ -739,12 +752,12 @@ class LeafNode : public NodeBase<Traits> {
   //
   // NOTE: the value slice may include an *invalid pointer*, not
   // just invalid data, so any readers should check for conflicts
-  // before accessing any referred-to data in the value slice.
+  // before accessing the value slice.
   // The key, on the other hand, will always be a valid pointer, but
   // may be invalid data.
-  void Get(size_t idx, Slice *k, Slice *v) const {
+  void Get(size_t idx, Slice *k, ValueSlice *v) const {
     *k = keys_[idx].as_slice();
-    *v = vals_[idx].as_slice();
+    *v = vals_[idx];
   }
 
   // Truncates the node, removing entries from the right to reduce
@@ -855,10 +868,11 @@ class PreparedMutation {
   // has the same size as the original data.
   Slice current_mutable_value() {
     CHECK(prepared());
-    Slice k, v;
+    Slice k;
+    ValueSlice v;
     leaf_->Get(idx_, &k, &v);
     leaf_->SetInserting();
-    return v;
+    return v.as_slice();
   }
 
   // Accessors
@@ -990,7 +1004,8 @@ class CBTree {
       retry_in_leaf:
       {
         GetResult ret;
-        Slice key_in_node, val_in_node;
+        Slice key_in_node;
+        ValueSlice val_in_node;
         bool exact;
         size_t idx = leaf->Find(key, &exact);
         DebugRacyPoint();
@@ -999,13 +1014,7 @@ class CBTree {
           ret = GET_NOT_FOUND;
         } else {
           leaf->Get(idx, &key_in_node, &val_in_node);
-          *buf_len = val_in_node.size();
-
-          if (PREDICT_FALSE(val_in_node.size() > in_buf_len)) {
-            ret = GET_TOO_BIG;
-          } else {
-            ret = GET_SUCCESS;
-          }
+          ret = GET_SUCCESS;
         }
 
         // Got some kind of result, but may be based on racy data.
@@ -1017,8 +1026,18 @@ class CBTree {
           version = new_version;
           goto retry_in_leaf;
         }
+
+        // If we found a matching key earlier, and the read of the node
+        // wasn't racy, we can safely work with the ValueSlice.
         if (ret == GET_SUCCESS) {
-          memcpy(buf, val_in_node.data(), val_in_node.size());
+          Slice val = val_in_node.as_slice();
+          *buf_len = val.size();
+
+          if (PREDICT_FALSE(val.size() > in_buf_len)) {
+            ret = GET_TOO_BIG;
+          } else {
+            memcpy(buf, val.data(), val.size());
+          }
         }
         return ret;
       }
@@ -1179,16 +1198,8 @@ class CBTree {
     return node.leaf_node_ptr();
   }
 
-
-  // Utility function that, when Traits::debug_raciness is non-zero
-  // (i.e only in debug code), will spin for some amount of time
-  // related to that setting.
-  // This can be used when trying to debug race conditions, but
-  // will compile away in production code.
   void DebugRacyPoint() const {
-    if (Traits::debug_raciness > 0) {
-      boost::detail::yield(Traits::debug_raciness);
-    }
+    btree::DebugRacyPoint<Traits>();
   }
 
   // Dump the tree.
@@ -1616,7 +1627,9 @@ class CBTreeIterator {
 
   void GetCurrentEntry(Slice *key, Slice *val) const {
     DCHECK(seeked_);
-    leaf_to_scan_->Get(idx_in_leaf_, key, val);
+    ValueSlice val_slice;
+    leaf_to_scan_->Get(idx_in_leaf_, key, &val_slice);
+    *val = val_slice.as_slice();
   }
 
   Slice GetCurrentKey() const {


Mime
View raw message