kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mpe...@apache.org
Subject [2/2] kudu git commit: KUDU-1038 Deleting a tablet should also delete its log recovery directory, if any
Date Mon, 11 Jun 2018 19:29:06 GMT
KUDU-1038 Deleting a tablet should also delete its log recovery directory, if any

This fix ensures successful bootstrap of a failed tablet replica, in the case
when its WAL segments are missing/deleted. The following scenario is tested:
1. A log segment is deleted from one of the Tablet Replicas.

2. On server restart, the replica in step 1. enters a failed state.
( Without this fix, due to the presence of log recovery directory (with deleted
log segments), the replica remains in a FAILED state and fails to bootstrap )

3. This fix deletes the log recovery directory when the tablet data is deleted,
as a result of which the failed replica successfully bootstraps.

Change-Id: If3f017caf41ac4e00bf89421d6e73924b9fdcbd0
Reviewed-on: http://gerrit.cloudera.org:8080/10591
Tested-by: Kudu Jenkins
Reviewed-by: Mike Percy <mpercy@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/382ee120
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/382ee120
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/382ee120

Branch: refs/heads/master
Commit: 382ee120620484cdc800a15bedbf59573b6606ac
Parents: 112a60f
Author: anupama <anupama.gupta@cloudera.com>
Authored: Mon Jun 4 01:12:41 2018 -0700
Committer: Mike Percy <mpercy@apache.org>
Committed: Mon Jun 11 19:28:46 2018 +0000

----------------------------------------------------------------------
 src/kudu/consensus/log.cc                       | 35 +++++++
 src/kudu/consensus/log.h                        |  4 +
 src/kudu/integration-tests/ts_recovery-itest.cc | 98 +++++++++++++++++++-
 src/kudu/tablet/tablet_bootstrap.cc             | 39 +-------
 src/kudu/tserver/ts_tablet_manager.cc           |  1 +
 5 files changed, 139 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/382ee120/src/kudu/consensus/log.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/log.cc b/src/kudu/consensus/log.cc
index 12898b9..845b446 100644
--- a/src/kudu/consensus/log.cc
+++ b/src/kudu/consensus/log.cc
@@ -103,6 +103,11 @@ DEFINE_bool(log_inject_latency, false,
             "If true, injects artificial latency in log sync operations. "
             "Advanced option. Use at your own risk -- has a negative effect "
             "on performance for obvious reasons!");
+
+DEFINE_bool(skip_remove_old_recovery_dir, false,
+            "Skip removing WAL recovery dir after startup. (useful for debugging)");
+TAG_FLAG(skip_remove_old_recovery_dir, hidden);
+
 TAG_FLAG(log_inject_latency, unsafe);
 TAG_FLAG(log_inject_latency, runtime);
 
@@ -985,6 +990,36 @@ Status Log::DeleteOnDiskData(FsManager* fs_manager, const string&
tablet_id) {
   return Status::OK();
 }
 
+Status Log::RemoveRecoveryDirIfExists(FsManager* fs_manager, const string& tablet_id)
{
+  string recovery_path = fs_manager->GetTabletWalRecoveryDir(tablet_id);
+  const auto kLogPrefix = Substitute("T $0 P $1: ", tablet_id, fs_manager->uuid());
+  if (!fs_manager->Exists(recovery_path)) {
+    VLOG(1) << kLogPrefix << "Tablet WAL recovery dir " << recovery_path
<<
+            " does not exist.";
+    return Status::OK();
+  }
+
+  VLOG(1) << kLogPrefix << "Preparing to delete log recovery files and directory
" << recovery_path;
+
+  string tmp_path = Substitute("$0-$1", recovery_path, GetCurrentTimeMicros());
+  VLOG(1) << kLogPrefix << "Renaming log recovery dir from "  << recovery_path
+          << " to " << tmp_path;
+  RETURN_NOT_OK_PREPEND(fs_manager->env()->RenameFile(recovery_path, tmp_path),
+                        Substitute("Could not rename old recovery dir from: $0 to: $1",
+                                   recovery_path, tmp_path));
+
+  if (FLAGS_skip_remove_old_recovery_dir) {
+    LOG(INFO) << kLogPrefix << "--skip_remove_old_recovery_dir enabled. NOT deleting
" << tmp_path;
+    return Status::OK();
+  }
+  VLOG(1) << kLogPrefix << "Deleting all files from renamed log recovery directory
" << tmp_path;
+  RETURN_NOT_OK_PREPEND(fs_manager->env()->DeleteRecursively(tmp_path),
+                        "Could not remove renamed recovery dir " + tmp_path);
+  VLOG(1) << kLogPrefix << "Completed deletion of old log recovery files and
directory "
+          << tmp_path;
+  return Status::OK();
+}
+
 Status Log::PreAllocateNewSegment() {
   TRACE_EVENT1("log", "PreAllocateNewSegment", "file", next_segment_path_);
   CHECK_EQ(allocation_state(), kAllocationInProgress);

http://git-wip-us.apache.org/repos/asf/kudu/blob/382ee120/src/kudu/consensus/log.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/log.h b/src/kudu/consensus/log.h
index f10c5e3..5abbf3a 100644
--- a/src/kudu/consensus/log.h
+++ b/src/kudu/consensus/log.h
@@ -140,6 +140,10 @@ class Log : public RefCountedThreadSafe<Log> {
   // REQUIRES: The Log must be closed.
   static Status DeleteOnDiskData(FsManager* fs_manager, const std::string& tablet_id);
 
+  // Removes the recovery directory and all files contained therein, if it exists.
+  // Intended to be invoked after log replay successfully completes.
+  static Status RemoveRecoveryDirIfExists(FsManager* fs_manager, const std::string& tablet_id);
+
   // Returns a reader that is able to read through the previous segments,
   // provided the log is initialized and not yet closed. After being closed,
   // this function will return NULL, but existing reader references will

http://git-wip-us.apache.org/repos/asf/kudu/blob/382ee120/src/kudu/integration-tests/ts_recovery-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/ts_recovery-itest.cc b/src/kudu/integration-tests/ts_recovery-itest.cc
index 4b78e15..b55c1c0 100644
--- a/src/kudu/integration-tests/ts_recovery-itest.cc
+++ b/src/kudu/integration-tests/ts_recovery-itest.cc
@@ -44,6 +44,7 @@
 #include "kudu/consensus/log-test-base.h"
 #include "kudu/consensus/log.h"
 #include "kudu/consensus/log_util.h"
+#include "kudu/consensus/metadata.pb.h"
 #include "kudu/consensus/opid.pb.h"
 #include "kudu/fs/block_id.h"
 #include "kudu/fs/block_manager.h"
@@ -86,7 +87,6 @@ using std::vector;
 namespace kudu {
 
 using client::KuduClient;
-using client::KuduClientBuilder;
 using client::KuduInsert;
 using client::KuduSession;
 using client::KuduTable;
@@ -98,10 +98,13 @@ using clock::Clock;
 using clock::HybridClock;
 using consensus::ConsensusMetadata;
 using consensus::ConsensusMetadataManager;
+using consensus::ConsensusStatePB;
+using consensus::EXCLUDE_HEALTH_REPORT;
 using consensus::OpId;
 using consensus::RECEIVED_OPID;
 using fs::BlockManager;
 using itest::MiniClusterFsInspector;
+using kudu::itest::TServerDetails;
 using log::AppendNoOpsToLogSync;
 using log::Log;
 using log::LogOptions;
@@ -138,6 +141,99 @@ void TsRecoveryITest::StartClusterOneTs(vector<string> extra_tserver_flags,
   StartClusterWithOpts(opts);
 }
 
+// Regression test for KUDU-1038: This test replicates the scenario where :
+// 1. A log segment is deleted from one of the Tablet Replicas or is not fsynced before a
crash.
+// 2. On server restart, the replica with the deleted log segment enters a FAILED state after
+// being unable to complete replay of the WAL and leaves the WAL recovery directory in place.
+// 3. The master should tombstone the FAILED replica, causing its recovery directory to be
deleted.
+// A subsequent tablet copy and tablet bootstrap should cause the replica to become healthy
again.
+TEST_F(TsRecoveryITest, TestTabletRecoveryAfterSegmentDelete) {
+  // Start a cluster with 3 tablet servers consisting of 1 tablet with 3 replicas.
+  vector<string> flags;
+  // The log segment size and the number of log segments to retain is configured to be very
small
+  // that is done to be able to quickly fill up the log segments in order to corrupt them
+  // in a way that exercises the necessary code paths for this regression test.
+  flags.emplace_back("--log_segment_size_mb=1");
+  flags.emplace_back("--log_min_segments_to_retain=3");
+  NO_FATALS(StartCluster(flags));
+
+  const int kNumTablets = 1;
+  const int kNumTs = 3;
+  const int kTsIndex = 0; // Index of the tablet server we'll use for the test.
+  MonoDelta timeout = MonoDelta::FromSeconds(30);
+
+  // Create a new tablet.
+  TestWorkload write_workload(cluster_.get());
+  write_workload.set_payload_bytes(32 * 1024); // Write ops of size 32KB to quickly fill
the logs.
+  write_workload.set_num_tablets(kNumTablets);
+  write_workload.set_write_batch_size(1);
+  write_workload.Setup();
+
+  // Retrieve tablet id.
+  TServerDetails *ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()];
+  vector<tserver::ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
+  string tablet_id = tablets[0].tablet_status().tablet_id();
+
+  LOG(INFO) << "Starting workload...";
+
+  write_workload.Start();
+
+  LOG(INFO) << "Waiting for at least 4 files in WAL dir on tserver 0 for tablet "
+            << tablets[0].tablet_status().tablet_id() << "...";
+  ASSERT_OK(inspect_->WaitForMinFilesInTabletWalDirOnTS(kTsIndex,
+            tablet_id, /* num wal segments + index file */ 4));
+
+  auto *ets = cluster_->tablet_server(0);
+
+  write_workload.StopAndJoin();
+
+  // Get the current consensus state.
+  ConsensusStatePB orig_cstate;
+  ASSERT_OK(GetConsensusState(ts, tablet_id, timeout, EXCLUDE_HEALTH_REPORT, &orig_cstate));
+
+  // Shutdown the cluster.
+  cluster_->Shutdown();
+
+  // Before restarting the cluster, delete a log segment
+  // from the first tablet replica.
+  {
+    FsManagerOpts opts;
+    opts.wal_root = ets->wal_dir();
+    opts.data_roots = ets->data_dirs();
+
+    gscoped_ptr<FsManager> fs_manager(new FsManager(env_, opts));
+
+    ASSERT_OK(fs_manager->Open());
+
+    string wal_dir = fs_manager->GetTabletWalDir(tablet_id);
+
+    // Delete one of the WAL segments so at tablet startup time we will
+    // detect out-of-order WAL segments during log replay and fail to bootstrap.
+    string segment = fs_manager->GetWalSegmentFileName(tablet_id, 2);
+
+    LOG(INFO) << "Deleting WAL segment: " << segment;
+    ASSERT_OK(fs_manager->env()->DeleteFile(segment));
+  }
+
+  ASSERT_OK(cluster_->Restart());
+
+  // The master will evict and then re-add the FAILED tablet replica.
+  for (unsigned i = 0; i < kNumTs; ++i) {
+    TServerDetails *ts_details = ts_map_[cluster_->tablet_server(i)->uuid()];
+    // Timeout atleast 60s to avoid test being flaky on TSAN mode.
+    ASSERT_OK(WaitUntilTabletRunning(ts_details, tablet_id,
+              MonoDelta::FromSeconds(60)));
+  }
+
+  // Ensure that the config changed since we started (evicted, re-added).
+  ConsensusStatePB new_cstate;
+  ASSERT_OK(GetConsensusState(ts, tablet_id, timeout, EXCLUDE_HEALTH_REPORT, &new_cstate));
+
+  ASSERT_GT(new_cstate.committed_config().opid_index(),
+            orig_cstate.committed_config().opid_index());
+}
+
 // Test for KUDU-2202 that ensures that blocks not found in the FS layer but
 // that are referenced by a tablet will not be reused.
 TEST_P(TsRecoveryITest, TestNoBlockIDReuseIfMissingBlocks) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/382ee120/src/kudu/tablet/tablet_bootstrap.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_bootstrap.cc b/src/kudu/tablet/tablet_bootstrap.cc
index 1dbafaf..58f9f54 100644
--- a/src/kudu/tablet/tablet_bootstrap.cc
+++ b/src/kudu/tablet/tablet_bootstrap.cc
@@ -62,7 +62,6 @@
 #include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/human_readable.h"
 #include "kudu/gutil/strings/substitute.h"
-#include "kudu/gutil/walltime.h"
 #include "kudu/rpc/result_tracker.h"
 #include "kudu/rpc/rpc_header.pb.h"
 #include "kudu/tablet/metadata.pb.h"
@@ -93,10 +92,6 @@
 
 DECLARE_int32(group_commit_queue_size_bytes);
 
-DEFINE_bool(skip_remove_old_recovery_dir, false,
-            "Skip removing WAL recovery dir after startup. (useful for debugging)");
-TAG_FLAG(skip_remove_old_recovery_dir, hidden);
-
 DEFINE_double(fault_crash_during_log_replay, 0.0,
               "Fraction of the time when the tablet will crash immediately "
               "after processing a log entry during log replay. "
@@ -359,10 +354,6 @@ class TabletBootstrap {
   // with it.
   Status UpdateClock(uint64_t timestamp);
 
-  // Removes the recovery directory and all files contained therein.
-  // Intended to be invoked after log replay successfully completes.
-  Status RemoveRecoveryDir();
-
   // Return a log prefix string in the standard "T xxx P yyy" format.
   string LogPrefix() const;
 
@@ -604,7 +595,8 @@ Status TabletBootstrap::RunBootstrap(shared_ptr<Tablet>* rebuilt_tablet,
 
   RETURN_NOT_OK_PREPEND(PlaySegments(consensus_info), "Failed log replay. Reason");
 
-  RETURN_NOT_OK(RemoveRecoveryDir());
+  RETURN_NOT_OK(Log::RemoveRecoveryDirIfExists(tablet_->metadata()->fs_manager(),
+                                               tablet_->metadata()->tablet_id()))
   RETURN_NOT_OK(FinishBootstrap("Bootstrap complete.", rebuilt_log, rebuilt_tablet));
 
   return Status::OK();
@@ -720,33 +712,6 @@ Status TabletBootstrap::OpenLogReaderInRecoveryDir() {
   return Status::OK();
 }
 
-Status TabletBootstrap::RemoveRecoveryDir() {
-  FsManager* fs_manager = tablet_->metadata()->fs_manager();
-  string recovery_path = fs_manager->GetTabletWalRecoveryDir(tablet_->metadata()->tablet_id());
-  CHECK(fs_manager->Exists(recovery_path))
-      << "Tablet WAL recovery dir " << recovery_path << " does not exist.";
-
-  VLOG_WITH_PREFIX(1) << "Preparing to delete log recovery files and directory " <<
recovery_path;
-
-  string tmp_path = Substitute("$0-$1", recovery_path, GetCurrentTimeMicros());
-  VLOG_WITH_PREFIX(1) << "Renaming log recovery dir from "  << recovery_path
-                        << " to " << tmp_path;
-  RETURN_NOT_OK_PREPEND(fs_manager->env()->RenameFile(recovery_path, tmp_path),
-                        Substitute("Could not rename old recovery dir from: $0 to: $1",
-                                   recovery_path, tmp_path));
-
-  if (FLAGS_skip_remove_old_recovery_dir) {
-    LOG_WITH_PREFIX(INFO) << "--skip_remove_old_recovery_dir enabled. NOT deleting
" << tmp_path;
-    return Status::OK();
-  }
-  VLOG_WITH_PREFIX(1) << "Deleting all files from renamed log recovery directory "
<< tmp_path;
-  RETURN_NOT_OK_PREPEND(fs_manager->env()->DeleteRecursively(tmp_path),
-                        "Could not remove renamed recovery dir " + tmp_path);
-  VLOG_WITH_PREFIX(1) << "Completed deletion of old log recovery files and directory
"
-                        << tmp_path;
-  return Status::OK();
-}
-
 Status TabletBootstrap::OpenNewLog() {
   OpId init;
   init.set_term(0);

http://git-wip-us.apache.org/repos/asf/kudu/blob/382ee120/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index d55b0bb..fc5d8f6 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -1387,6 +1387,7 @@ Status TSTabletManager::DeleteTabletData(
   MAYBE_FAULT(FLAGS_fault_crash_after_blocks_deleted);
 
   CHECK_OK(Log::DeleteOnDiskData(meta->fs_manager(), tablet_id));
+  CHECK_OK(Log::RemoveRecoveryDirIfExists(meta->fs_manager(), tablet_id));
   MAYBE_FAULT(FLAGS_fault_crash_after_wal_deleted);
 
   // We do not delete the superblock or the consensus metadata when tombstoning


Mime
View raw message